From 625ed2bf049d5a352c1bcca962d6e133454eaaff Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 26 Apr 2017 08:27:56 +0200
Subject: sched/cfs: Make util/load_avg more stable

In the current implementation of load/util_avg, we assume that the
ongoing time segment has fully elapsed, and util/load_sum is divided
by LOAD_AVG_MAX, even if part of the time segment still remains to
run. As a consequence, this remaining part is considered as idle time
and generates unexpected variations of util_avg of a busy CPU in the
range [1002..1024[ whereas util_avg should stay at 1023.

In order to keep the metric stable, we should not consider the ongoing
time segment when computing load/util_avg but only the segments that
have already fully elapsed. But not considering the current time
segment adds unwanted latency to the load/util_avg responsiveness,
especially when the time is scaled instead of the contribution.

Instead of waiting for the current time segment to have fully elapsed
before accounting it in load/util_avg, we can already account the
elapsed part but change the range used to compute load/util_avg
accordingly.

At the very beginning of a new time segment, the past segments have
been decayed and the max value is LOAD_AVG_MAX*y. At the very end of
the current time segment, the max value becomes:

  LOAD_AVG_MAX*y + 1024(us)  (== LOAD_AVG_MAX)

In fact, the max value is:

  LOAD_AVG_MAX*y + sa->period_contrib

at any time in the time segment.

Taking advantage of the fact that:

  LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024

the range becomes [0..LOAD_AVG_MAX-1024+sa->period_contrib].

As the elapsed part is already accounted in load/util_sum, we update
the max value according to the current position in the time segment
instead of removing its contribution.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten.Rasmussen@arm.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: pjt@google.com
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1493188076-2767-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d71109321841..4f1825d60937 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2916,12 +2916,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
 	if (cfs_rq) {
 		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
 	}
-	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
 
 	return 1;
 }
-- 
cgit v1.3-14-g43fede


From cf15ca8deda86b27b66e27848b4b0fe58098fc0b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:11:53 +0200
Subject: sched/clock: Initialize all per-CPU state before switching (back) to
 unstable

In preparation for not keeping the sched_clock_tick() active for
stable TSC, we need to explicitly initialize all per-CPU state
before switching back to unstable.

Note: this patch loses the __gtod_offset calculation; it will be
restored in the next one.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 60 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 00a45c45beca..dc650851935f 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -124,6 +124,12 @@ int sched_clock_stable(void)
 	return static_branch_likely(&__sched_clock_stable);
 }
 
+static void __scd_stamp(struct sched_clock_data *scd)
+{
+	scd->tick_gtod = ktime_get_ns();
+	scd->tick_raw = sched_clock();
+}
+
 static void __set_sched_clock_stable(void)
 {
 	struct sched_clock_data *scd = this_scd();
@@ -141,8 +147,37 @@ static void __set_sched_clock_stable(void)
 	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
 }
 
+/*
+ * If we ever get here, we're screwed, because we found out -- typically after
+ * the fact -- that TSC wasn't good. This means all our clocksources (including
+ * ktime) could have reported wrong values.
+ *
+ * What we do here is an attempt to fix up and continue sort of where we left
+ * off in a coherent manner.
+ *
+ * The only way to fully avoid random clock jumps is to boot with:
+ * "tsc=unstable".
+ */
 static void __sched_clock_work(struct work_struct *work)
 {
+	struct sched_clock_data *scd;
+	int cpu;
+
+	/* take a current timestamp and set 'now' */
+	preempt_disable();
+	scd = this_scd();
+	__scd_stamp(scd);
+	scd->clock = scd->tick_gtod + __gtod_offset;
+	preempt_enable();
+
+	/* clone to all CPUs */
+	for_each_possible_cpu(cpu)
+		per_cpu(sched_clock_data, cpu) = *scd;
+
+	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
+			scd->tick_gtod, __gtod_offset,
+			scd->tick_raw,  __sched_clock_offset);
+
 	static_branch_disable(&__sched_clock_stable);
 }
 
@@ -150,27 +185,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work);
 
 static void __clear_sched_clock_stable(void)
 {
-	struct sched_clock_data *scd = this_scd();
-
-	/*
-	 * Attempt to make the stable->unstable transition continuous.
-	 *
-	 * Trouble is, this is typically called from the TSC watchdog
-	 * timer, which is late per definition. This means the tick
-	 * values can already be screwy.
-	 *
-	 * Still do what we can.
-	 */
-	__gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
-
-	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
-			scd->tick_gtod, __gtod_offset,
-			scd->tick_raw,  __sched_clock_offset);
+	if (!sched_clock_stable())
+		return;
 
 	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
-
-	if (sched_clock_stable())
-		schedule_work(&sched_clock_work);
+	schedule_work(&sched_clock_work);
 }
 
 void clear_sched_clock_stable(void)
@@ -357,8 +376,7 @@ void sched_clock_tick(void)
 	 * XXX arguably we can skip this if we expose tsc_clocksource_reliable
 	 */
 	scd = this_scd();
-	scd->tick_raw  = sched_clock();
-	scd->tick_gtod = ktime_get_ns();
+	__scd_stamp(scd);
 
 	if (!sched_clock_stable() && likely(sched_clock_running))
 		sched_clock_local(scd);
-- 
cgit v1.3-14-g43fede


From b421b22b00b0011f6a2ce3561176c4e79e640c49 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:14:13 +0200
Subject: x86/tsc, sched/clock, clocksource: Use clocksource watchdog to
 provide stable sync points

Currently we keep sched_clock_tick() active for stable TSC in order to
keep the per-CPU state semi up-to-date. The (obvious) problem is that
by the time we detect TSC is borked, our per-CPU state is also borked.

So hook into the clocksource watchdog and call a method after we've
found it to still be stable.

There's the obvious race where the TSC goes wonky between finding it
stable and us running the callback, but closing that is too much work
and not really worth it, since we're already detecting TSC wobbles
after the fact, so we cannot, per definition, fully avoid funny clock
values.

And since the watchdog runs less often than the tick, this is also an
optimization.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c       | 10 ++++++++++
 include/linux/clocksource.h |  1 +
 include/linux/sched/clock.h |  2 +-
 kernel/sched/clock.c        | 36 +++++++++++++++++++++++++++---------
 kernel/time/clocksource.c   |  3 +++
 5 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 66015195bd18..c1b16b328abe 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1033,6 +1033,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
 	pr_info("Marking TSC unstable due to clocksource watchdog\n");
 }
 
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+	if (tsc_unstable)
+		return;
+
+	if (using_native_sched_clock())
+		sched_clock_tick_stable();
+}
+
 /*
  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
  */
@@ -1046,6 +1055,7 @@ static struct clocksource clocksource_tsc = {
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
 	.resume			= tsc_resume,
 	.mark_unstable		= tsc_cs_mark_unstable,
+	.tick_stable		= tsc_cs_tick_stable,
 };
 
 void mark_tsc_unstable(char *reason)
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f2b10d9ebd04..81490456c242 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -96,6 +96,7 @@ struct clocksource {
 	void (*suspend)(struct clocksource *cs);
 	void (*resume)(struct clocksource *cs);
 	void (*mark_unstable)(struct clocksource *cs);
+	void (*tick_stable)(struct clocksource *cs);
 
 	/* private: */
 #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 34fe92ce1ebd..978cbb0af5f3 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -63,8 +63,8 @@ extern void clear_sched_clock_stable(void);
  */
 extern u64 __sched_clock_offset;
 
-
 extern void sched_clock_tick(void);
+extern void sched_clock_tick_stable(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index dc650851935f..f861637f7fdc 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -366,20 +366,38 @@ void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd;
 
+	if (sched_clock_stable())
+		return;
+
+	if (unlikely(!sched_clock_running))
+		return;
+
 	WARN_ON_ONCE(!irqs_disabled());
 
-	/*
-	 * Update these values even if sched_clock_stable(), because it can
-	 * become unstable at any point in time at which point we need some
-	 * values to fall back on.
-	 *
-	 * XXX arguably we can skip this if we expose tsc_clocksource_reliable
-	 */
 	scd = this_scd();
 	__scd_stamp(scd);
+	sched_clock_local(scd);
+}
+
+void sched_clock_tick_stable(void)
+{
+	u64 gtod, clock;
 
-	if (!sched_clock_stable() && likely(sched_clock_running))
-		sched_clock_local(scd);
+	if (!sched_clock_stable())
+		return;
+
+	/*
+	 * Called under watchdog_lock.
+	 *
+	 * The watchdog just found this TSC to (still) be stable, so now is a
+	 * good moment to update our __gtod_offset. Because once we find the
+	 * TSC to be unstable, any computation will be computing crap.
+	 */
+	local_irq_disable();
+	gtod = ktime_get_ns();
+	clock = sched_clock();
+	__gtod_offset = (clock + __sched_clock_offset) - gtod;
+	local_irq_enable();
 }
 
 /*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 93621ae718d3..03918a19cf2d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
 			continue;
 		}
 
+		if (cs == curr_clocksource && cs->tick_stable)
+			cs->tick_stable(cs);
+
 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
 		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
 		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
-- 
cgit v1.3-14-g43fede


From ac1e843f0900bea92fcb47f6205e1f9ffb0d469c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:26:23 +0200
Subject: sched/clock: Remove unused argument to
 sched_clock_idle_wakeup_event()

The argument to sched_clock_idle_wakeup_event() has not been used in a
long time. Remove it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/tsc.c       | 2 +-
 include/linux/sched/clock.h | 4 ++--
 kernel/sched/clock.c        | 4 ++--
 kernel/time/tick-sched.c    | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c1b16b328abe..a3b544264360 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -182,7 +182,7 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts
 	c2n->data[1] = data;
 
 done:
-	sched_clock_idle_wakeup_event(0);
+	sched_clock_idle_wakeup_event();
 	local_irq_restore(flags);
 }
 
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 978cbb0af5f3..9c36f0722966 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -39,7 +39,7 @@ static inline void sched_clock_idle_sleep_event(void)
 {
 }
 
-static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
+static inline void sched_clock_idle_wakeup_event(void)
 {
 }
 
@@ -66,7 +66,7 @@ extern u64 __sched_clock_offset;
 extern void sched_clock_tick(void);
 extern void sched_clock_tick_stable(void);
 extern void sched_clock_idle_sleep_event(void);
-extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+extern void sched_clock_idle_wakeup_event(void);
 
 /*
  * As outlined in clock.c, provides a fast, high resolution, nanosecond
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index f861637f7fdc..750a92c9db7e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -410,9 +410,9 @@ void sched_clock_idle_sleep_event(void)
 EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 
 /*
- * We just idled delta nanoseconds (called with irqs disabled):
+ * We just idled; resync with ktime. (called with irqs disabled):
  */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
+void sched_clock_idle_wakeup_event(void)
 {
 	if (timekeeping_suspended)
 		return;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..9c2dc64e31d8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
 	update_ts_time_stats(smp_processor_id(), ts, now, NULL);
 	ts->idle_active = 0;
 
-	sched_clock_idle_wakeup_event(0);
+	sched_clock_idle_wakeup_event();
 }
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
-- 
cgit v1.3-14-g43fede


From 3067a33d5fec856bb297d58e7f03411d060ccdee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:42:03 +0200
Subject: sched/clock: Remove watchdog touching

Commit:

  2bacec8c318c ("sched: touch softlockup watchdog after idling")

introduced the touch_softlockup_watchdog_sched() call without
justification and I feel sched_clock management is not the right
place, it should only be concerned with producing semi coherent time.

If this causes watchdog thingies, we can find a better place.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 750a92c9db7e..c30c05f05d6f 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -418,7 +418,6 @@ void sched_clock_idle_wakeup_event(void)
 		return;
 
 	sched_clock_tick();
-	touch_softlockup_watchdog_sched();
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-- 
cgit v1.3-14-g43fede


From f9fccdb9efef60dbcf84d493514b475c41aa866f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:43:59 +0200
Subject: cpuidle: Fix idle time tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ville reported that on his Core2, which has TSC stop in idle, we would
always report very short idle durations. He tracked this down to
commit:

  e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()")

which replaces ktime_get() with local_clock().

Add a sched_clock_idle_wakeup_event() call, which will re-sync the
clock with ktime_get_ns() when TSC is unstable and no-op otherwise.

Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Tested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/cpuidle/cpuidle.c |  1 +
 kernel/sched/clock.c      | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 2706be7ed334..60bb64f4329d 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
 	entered_state = target_state->enter(dev, drv, index);
 	start_critical_timings();
 
+	sched_clock_idle_wakeup_event();
 	time_end = ns_to_ktime(local_clock());
 	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c30c05f05d6f..d4c2f89fac92 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -410,14 +410,21 @@ void sched_clock_idle_sleep_event(void)
 EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
 
 /*
- * We just idled; resync with ktime. (called with irqs disabled):
+ * We just idled; resync with ktime.
  */
 void sched_clock_idle_wakeup_event(void)
 {
-	if (timekeeping_suspended)
+	unsigned long flags;
+
+	if (sched_clock_stable())
+		return;
+
+	if (unlikely(timekeeping_suspended))
 		return;
 
+	local_irq_save(flags);
 	sched_clock_tick();
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-- 
cgit v1.3-14-g43fede


From 2e44b7ddf8ab01cf98106c68388f87af15fbde73 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:46:57 +0200
Subject: sched/clock: Use late_initcall() instead of sched_init_smp()

Core2 marks its TSC unstable in ACPI Processor Idle, which is probed
after sched_init_smp(). Luckily it appears both acpi_processor and
intel_idle (which has a similar check) are mandatory built-in.

This means we can delay switching to stable until after these drivers
have run (if they were modules, this would be impossible).

Delay the stable switch to late_initcall() to allow these drivers to
mark TSC unstable and avoid difficult stable->unstable transitions.

Reported-by: Lofstedt, Marta <marta.lofstedt@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/clock.h |  5 -----
 kernel/sched/clock.c        | 10 +++++++++-
 kernel/sched/core.c         |  2 --
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h
index 9c36f0722966..a55600ffdf4b 100644
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu);
 extern void sched_clock_init(void);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline void sched_clock_init_late(void)
-{
-}
-
 static inline void sched_clock_tick(void)
 {
 }
@@ -53,7 +49,6 @@ static inline u64 local_clock(void)
 	return sched_clock();
 }
 #else
-extern void sched_clock_init_late(void);
 extern int sched_clock_stable(void);
 extern void clear_sched_clock_stable(void);
 
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index d4c2f89fac92..a2f847c6ada8 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -64,6 +64,7 @@
 #include <linux/workqueue.h>
 #include <linux/compiler.h>
 #include <linux/tick.h>
+#include <linux/init.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -202,7 +203,11 @@ void clear_sched_clock_stable(void)
 		__clear_sched_clock_stable();
 }
 
-void sched_clock_init_late(void)
+/*
+ * We run this as late_initcall() such that it runs after all built-in drivers,
+ * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
+ */
+static int __init sched_clock_init_late(void)
 {
 	sched_clock_running = 2;
 	/*
@@ -216,7 +221,10 @@ void sched_clock_init_late(void)
 
 	if (__sched_clock_stable_early)
 		__set_sched_clock_stable();
+
+	return 0;
 }
+late_initcall(sched_clock_init_late);
 
 /*
  * min, max except they take wrapping into account
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..5794f4acad15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5984,7 +5984,6 @@ void __init sched_init_smp(void)
 	init_sched_dl_class();
 
 	sched_init_smt();
-	sched_clock_init_late();
 
 	sched_smp_initialized = true;
 }
@@ -6000,7 +5999,6 @@ early_initcall(migration_init);
 void __init sched_init_smp(void)
 {
 	sched_init_granularity();
-	sched_clock_init_late();
 }
 #endif /* CONFIG_SMP */
 
-- 
cgit v1.3-14-g43fede


From 7708d5f04de4dd5d2110df3244372b1e3f61bc7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 21 Apr 2017 12:52:52 +0200
Subject: sched/clock: Print a warning recommending 'tsc=unstable'

With our switch to stable delayed until late_initcall(), the most
likely cause of hitting mark_tsc_unstable() is the watchdog. The
watchdog typically only triggers when creative BIOSes fiddle with the
TSC to hide SMI latency.

Since the watchdog can only detect TSC fiddling after the fact, all TSC
clocks (including userspace GTOD) can already have reported funny
values.

The only way to fully avoid this is to manually mark the TSC unstable
at boot. Suggest people do this on their broken systems.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a2f847c6ada8..1a0d389d2f2b 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -175,6 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
 	for_each_possible_cpu(cpu)
 		per_cpu(sched_clock_data, cpu) = *scd;
 
+	printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
 	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
 			scd->tick_gtod, __gtod_offset,
 			scd->tick_raw,  __sched_clock_offset);
-- 
cgit v1.3-14-g43fede


From 8c0334697dc37eb3d6d7632304d3a3662248daac Mon Sep 17 00:00:00 2001
From: Lauro Ramos Venancio <lvenanci@redhat.com>
Date: Thu, 13 Apr 2017 10:56:07 -0300
Subject: sched/topology: Refactor function build_overlap_sched_groups()

Create functions build_group_from_child_sched_domain() and
init_overlap_sched_group(). No functional change.

Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1492091769-19879-2-git-send-email-lvenanci@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 62 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb12837..d786d45c44d9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -513,6 +513,47 @@ int group_balance_cpu(struct sched_group *sg)
 	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
 }
 
+static struct sched_group *
+build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *sg;
+	struct cpumask *sg_span;
+
+	sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+			GFP_KERNEL, cpu_to_node(cpu));
+
+	if (!sg)
+		return NULL;
+
+	sg_span = sched_group_cpus(sg);
+	if (sd->child)
+		cpumask_copy(sg_span, sched_domain_span(sd->child));
+	else
+		cpumask_copy(sg_span, sched_domain_span(sd));
+
+	return sg;
+}
+
+static void init_overlap_sched_group(struct sched_domain *sd,
+				     struct sched_group *sg, int cpu)
+{
+	struct sd_data *sdd = sd->private;
+	struct cpumask *sg_span;
+
+	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+	if (atomic_inc_return(&sg->sgc->ref) == 1)
+		build_group_mask(sd, sg);
+
+	/*
+	 * Initialize sgc->capacity such that even if we mess up the
+	 * domains and no possible iteration will get us here, we won't
+	 * die on a /0 trap.
+	 */
+	sg_span = sched_group_cpus(sg);
+	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -537,31 +578,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
 
-		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				GFP_KERNEL, cpu_to_node(cpu));
-
+		sg = build_group_from_child_sched_domain(sibling, cpu);
 		if (!sg)
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-		if (sibling->child)
-			cpumask_copy(sg_span, sched_domain_span(sibling->child));
-		else
-			cpumask_set_cpu(i, sg_span);
-
 		cpumask_or(covered, covered, sg_span);
 
-		sg->sgc = *per_cpu_ptr(sdd->sgc, i);
-		if (atomic_inc_return(&sg->sgc->ref) == 1)
-			build_group_mask(sd, sg);
-
-		/*
-		 * Initialize sgc->capacity such that even if we mess up the
-		 * domains and no possible iteration will get us here, we won't
-		 * die on a /0 trap.
-		 */
-		sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
-		sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+		init_overlap_sched_group(sd, sg, i);
 
 		/*
 		 * Make sure the first group of this domain contains the
-- 
cgit v1.3-14-g43fede


From c743f0a5c50f2fcbc628526279cfa24f3dabe182 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 14 Apr 2017 14:20:05 +0200
Subject: sched/fair, cpumask: Export for_each_cpu_wrap()

More users for for_each_cpu_wrap() have appeared. Promote the construct
to generic cpumask interface.

The implementation is slightly modified to reduce arguments.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Lauro Ramos Venancio <lvenanci@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: lwang@redhat.com
Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpumask.h | 17 +++++++++++++++++
 kernel/sched/fair.c     | 45 ++++-----------------------------------------
 lib/cpumask.c           | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad238c0b..a21b1fb9a968 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
+
+/**
+ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ * @start: the start location
+ *
+ * The implementation does not assume any bit in @mask is set (including @start).
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_wrap(cpu, mask, start)					\
+	for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false);	\
+	     (cpu) < nr_cpumask_bits;						\
+	     (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
+
 /**
  * for_each_cpu_and - iterate over every cpu in both masks
  * @cpu: the (optionally unsigned) integer iterator
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4f1825d60937..f80c825e2b43 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5640,43 +5640,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
-	int next;
-
-again:
-	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
-	if (*wrapped) {
-		if (next >= start)
-			return nr_cpumask_bits;
-	} else {
-		if (next >= nr_cpumask_bits) {
-			*wrapped = 1;
-			n = -1;
-			goto again;
-		}
-	}
-
-	return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
-	for ((wrap) = 0, (cpu) = (start)-1;					\
-		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\
-		(cpu) < nr_cpumask_bits; )
-
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
@@ -5736,7 +5699,7 @@ unlock:
 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int core, cpu, wrap;
+	int core, cpu;
 
 	if (!static_branch_likely(&sched_smt_present))
 		return -1;
@@ -5746,7 +5709,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
 
-	for_each_cpu_wrap(core, cpus, target, wrap) {
+	for_each_cpu_wrap(core, cpus, target) {
 		bool idle = true;
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5812,7 +5775,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	u64 avg_cost, avg_idle = this_rq()->avg_idle;
 	u64 time, cost;
 	s64 delta;
-	int cpu, wrap;
+	int cpu;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
@@ -5829,7 +5792,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
 	time = local_clock();
 
-	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
 		if (idle_cpu(cpu))
diff --git a/lib/cpumask.c b/lib/cpumask.c
index 81dedaab36cc..4731a0895760 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpumask_any_but);
 
+/**
+ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
+ * @n: the cpu prior to the place to search
+ * @mask: the cpumask pointer
+ * @start: the start point of the iteration
+ * @wrap: assume @n crossing @start terminates the iteration
+ *
+ * Returns >= nr_cpu_ids on completion
+ *
+ * Note: the @wrap argument is required for the start condition when
+ * we cannot assume @start is set in @mask.
+ */
+int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+{
+	int next;
+
+again:
+	next = cpumask_next(n, mask);
+
+	if (wrap && n < start && next >= start) {
+		return nr_cpumask_bits;
+
+	} else if (next >= nr_cpumask_bits) {
+		wrap = true;
+		n = -1;
+		goto again;
+	}
+
+	return next;
+}
+EXPORT_SYMBOL(cpumask_next_wrap);
+
 /* These are not inline because of header tangles. */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 /**
-- 
cgit v1.3-14-g43fede


From 0372dd2736e02672ac6e189c31f7d8c02ad543cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 14 Apr 2017 17:24:02 +0200
Subject: sched/topology: Fix building of overlapping sched-groups

When building the overlapping groups, we very obviously should start
with the previous domain of _this_ @cpu, not CPU-0.

This can be readily demonstrated with a topology like:

  node   0   1   2   3
    0:  10  20  30  20
    1:  20  10  20  30
    2:  30  20  10  20
    3:  20  30  20  10

Where (for example) CPU1 ends up generating the following nonsensical groups:

  [] CPU1 attaching sched-domain:
  []  domain 0: span 0-2 level NUMA
  []   groups: 1 2 0
  []   domain 1: span 0-3 level NUMA
  []    groups: 1-3 (cpu_capacity = 3072) 0-1,3 (cpu_capacity = 3072)

Where the fact that domain 1 doesn't include a group with span 0-2 is
the obvious fail.

With this patch applied, it looks like:

  [] CPU1 attaching sched-domain:
  []  domain 0: span 0-2 level NUMA
  []   groups: 1 0 2
  []   domain 1: span 0-3 level NUMA
  []    groups: 0-2 (cpu_capacity = 3072) 0,2-3 (cpu_capacity = 3072)

Debugged-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d786d45c44d9..921dedde2ee1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -566,7 +566,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 	cpumask_clear(covered);
 
-	for_each_cpu(i, span) {
+	for_each_cpu_wrap(i, span, cpu) {
 		struct cpumask *sg_span;
 
 		if (cpumask_test_cpu(i, covered))
-- 
cgit v1.3-14-g43fede


From 91eaed0d61319f58a9f8e43d41a8cbb069b4f73d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 14 Apr 2017 17:32:07 +0200
Subject: sched/topology: Simplify build_overlap_sched_groups()

Now that the first group will always be the previous domain of this
@cpu this can be simplified.

In fact, writing the code now removed should've been a big clue I was
doing it wrong :/

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 921dedde2ee1..6b10e0a956c7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -557,7 +557,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
-	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	struct sched_group *first = NULL, *last = NULL, *sg;
 	const struct cpumask *span = sched_domain_span(sd);
 	struct cpumask *covered = sched_domains_tmpmask;
 	struct sd_data *sdd = sd->private;
@@ -587,15 +587,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 		init_overlap_sched_group(sd, sg, i);
 
-		/*
-		 * Make sure the first group of this domain contains the
-		 * canonical balance CPU. Otherwise the sched_domain iteration
-		 * breaks. See update_sg_lb_stats().
-		 */
-		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-		    group_balance_cpu(sg) == cpu)
-			groups = sg;
-
 		if (!first)
 			first = sg;
 		if (last)
@@ -603,7 +594,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		last = sg;
 		last->next = first;
 	}
-	sd->groups = groups;
+	sd->groups = first;
 
 	return 0;
 
-- 
cgit v1.3-14-g43fede


From b0151c25548cacc50771a7930475727c6c8ee869 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 14 Apr 2017 17:29:16 +0200
Subject: sched/debug: Print the scheduler topology group mask

In order to determine the balance_cpu (for should_we_balance()) we need
the sched_group_mask() for overlapping domains.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6b10e0a956c7..3d50ee38b8fb 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -82,12 +82,22 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 		printk(KERN_CONT " %*pbl",
 		       cpumask_pr_args(sched_group_cpus(group)));
+
+		if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) {
+			printk(KERN_CONT " (mask: %*pbl)",
+				cpumask_pr_args(sched_group_mask(group)));
+		}
+
 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
-			printk(KERN_CONT " (cpu_capacity = %lu)",
+			printk(KERN_CONT " (cpu_capacity: %lu)",
 				group->sgc->capacity);
 		}
 
 		group = group->next;
+
+		if (group != sd->groups)
+			printk(KERN_CONT ",");
+
 	} while (group != sd->groups);
 	printk(KERN_CONT "\n");
 
-- 
cgit v1.3-14-g43fede


From a420b0630362c2c451060e6187e36d72df827134 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 14 Apr 2017 18:20:48 +0200
Subject: sched/topology: Verify the first group matches the child domain

We want sched_groups to be sibling child domains (or individual CPUs
when there are no child domains). Furthermore, since the first group
of a domain should include the CPU of that domain, the first group of
each domain should match the child domain.

Verify this is indeed so.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3d50ee38b8fb..81c82031ed95 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -93,6 +93,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				group->sgc->capacity);
 		}
 
+		if (group == sd->groups && sd->child &&
+		    !cpumask_equal(sched_domain_span(sd->child),
+				   sched_group_cpus(group))) {
+			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
+		}
+
 		group = group->next;
 
 		if (group != sd->groups)
-- 
cgit v1.3-14-g43fede


From f32d782e31bf079f600dcec126ed117b0577e85c Mon Sep 17 00:00:00 2001
From: Lauro Ramos Venancio <lvenanci@redhat.com>
Date: Thu, 20 Apr 2017 16:51:40 -0300
Subject: sched/topology: Optimize build_group_mask()

The group mask is always used in intersection with the group CPUs. So,
when building the group mask, we don't have to care about CPUs that are
not part of the group.

Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: lwang@redhat.com
Cc: riel@redhat.com
Link: http://lkml.kernel.org/r/1492717903-5195-2-git-send-email-lvenanci@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 81c82031ed95..5a4d9aeda258 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -506,12 +506,12 @@ enum s_alloc {
  */
 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 {
-	const struct cpumask *span = sched_domain_span(sd);
+	const struct cpumask *sg_span = sched_group_cpus(sg);
 	struct sd_data *sdd = sd->private;
 	struct sched_domain *sibling;
 	int i;
 
-	for_each_cpu(i, span) {
+	for_each_cpu(i, sg_span) {
 		sibling = *per_cpu_ptr(sdd->sd, i);
 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
-- 
cgit v1.3-14-g43fede


From c20e1ea4b61c3d99a354d912f2d74822fd2a001d Mon Sep 17 00:00:00 2001
From: Lauro Ramos Venancio <lvenanci@redhat.com>
Date: Thu, 20 Apr 2017 16:51:42 -0300
Subject: sched/topology: Move comment about asymmetric node setups

Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: lwang@redhat.com
Cc: riel@redhat.com
Link: http://lkml.kernel.org/r/1492717903-5195-4-git-send-email-lvenanci@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 5a4d9aeda258..c10f44a1ab2d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -495,14 +495,6 @@ enum s_alloc {
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
- *
- * Asymmetric node setups can result in situations where the domain tree is of
- * unequal depth, make sure to skip domains that already cover the entire
- * range.
- *
- * In that case build_sched_domains() will have terminated the iteration early
- * and our sibling sd spans will be empty. Domains should always include the
- * CPU they're built on, so check that.
  */
 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 {
@@ -590,7 +582,16 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 		sibling = *per_cpu_ptr(sdd->sd, i);
 
-		/* See the comment near build_group_mask(). */
+		/*
+		 * Asymmetric node setups can result in situations where the
+		 * domain tree is of unequal depth, make sure to skip domains
+		 * that already cover the entire range.
+		 *
+		 * In that case build_sched_domains() will have terminated the
+		 * iteration early and our sibling sd spans will be empty.
+		 * Domains should always include the CPU they're built on, so
+		 * check that.
+		 */
 		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
 			continue;
 
-- 
cgit v1.3-14-g43fede


From af85596c74de2fd9abb87501ae280038ac28a3f4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 26 Apr 2017 17:36:41 +0200
Subject: sched/topology: Remove FORCE_SD_OVERLAP

It's an obsolete debug mechanism and future code wants to rely on
properties this undermines.

Namely, it would be good to assume that SD_OVERLAP domains have
children, but if we build the entire hierarchy with SD_OVERLAP this is
obviously false.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/features.h | 1 -
 kernel/sched/topology.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 11192e0cb122..dc4d1483b038 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
 SCHED_FEAT(RT_PUSH_IPI, true)
 #endif
 
-SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c10f44a1ab2d..21efacf547d4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1439,7 +1439,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
-			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+			if (tl->flags & SDTL_OVERLAP)
 				sd->flags |= SD_OVERLAP;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
-- 
cgit v1.3-14-g43fede


From 73bb059f9b8a00c5e1bf2f7ca83138c05d05e600 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2017 14:00:49 +0200
Subject: sched/topology: Fix overlapping sched_group_mask

The point of sched_group_mask is to select those CPUs from
sched_group_cpus that can actually arrive at this balance domain.

The current code gets it wrong, as can be readily demonstrated with a
topology like:

  node   0   1   2   3
    0:  10  20  30  20
    1:  20  10  20  30
    2:  30  20  10  20
    3:  20  30  20  10

Where (for example) domain 1 on CPU1 ends up with a mask that includes
CPU0:

  [] CPU1 attaching sched-domain:
  []  domain 0: span 0-2 level NUMA
  []   groups: 1 (mask: 1), 2, 0
  []   domain 1: span 0-3 level NUMA
  []    groups: 0-2 (mask: 0-2) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072)

This causes group_balance_cpu() to compute the wrong CPU and
consequently should_we_balance() will terminate early resulting in
missed load-balance opportunities.

The fixed topology looks like:

  [] CPU1 attaching sched-domain:
  []  domain 0: span 0-2 level NUMA
  []   groups: 1 (mask: 1), 2, 0
  []   domain 1: span 0-3 level NUMA
  []    groups: 0-2 (mask: 1) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072)

(note: this relies on OVERLAP domains to always have children, this is
 true because the regular topology domains are still here -- this is
 before degenerate trimming)

Debugged-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 21efacf547d4..09563c1d1d5b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -495,6 +495,9 @@ enum s_alloc {
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
+ *
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
  */
 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 {
@@ -505,11 +508,24 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 
 	for_each_cpu(i, sg_span) {
 		sibling = *per_cpu_ptr(sdd->sd, i);
-		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+		/*
+		 * Can happen in the asymmetric case, where these siblings are
+		 * unused. The mask will not be empty because those CPUs that
+		 * do have the top domain _should_ span the domain.
+		 */
+		if (!sibling->child)
+			continue;
+
+		/* If we would not end up here, we can't continue from here */
+		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
 			continue;
 
 		cpumask_set_cpu(i, sched_group_mask(sg));
 	}
+
+	/* We must not have empty masks here */
+	WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 8d5dc5126bb2bbcebf0b1e061cca2fc02c935620 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2017 15:29:40 +0200
Subject: sched/topology: Small cleanup

Move the allocation of topology specific cpumasks into the topology
code.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 4 +---
 kernel/sched/sched.h    | 4 +---
 kernel/sched/topology.c | 7 +++++--
 3 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5794f4acad15..dde5d1e860f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5958,7 +5958,6 @@ void __init sched_init_smp(void)
 	cpumask_var_t non_isolated_cpus;
 
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 	sched_init_numa();
 
@@ -5968,7 +5967,7 @@ void __init sched_init_smp(void)
 	 * happen.
 	 */
 	mutex_lock(&sched_domains_mutex);
-	init_sched_domains(cpu_active_mask);
+	sched_init_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -6197,7 +6196,6 @@ void __init sched_init(void)
 	calc_load_update = jiffies + LOAD_FREQ;
 
 #ifdef CONFIG_SMP
-	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aab731e..6e1eae717a24 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -606,11 +606,9 @@ struct root_domain {
 
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
-extern cpumask_var_t fallback_doms;
-extern cpumask_var_t sched_domains_tmpmask;
 
 extern void init_defrootdomain(void);
-extern int init_sched_domains(const struct cpumask *cpu_map);
+extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 
 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 09563c1d1d5b..a4b868c76f3c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1526,7 +1526,7 @@ static struct sched_domain_attr		*dattr_cur;
  * cpumask) fails, then fallback to a single sched domain,
  * as determined by the single cpumask fallback_doms.
  */
-cpumask_var_t				fallback_doms;
+static cpumask_var_t			fallback_doms;
 
 /*
  * arch_update_cpu_topology lets virtualized architectures update the
@@ -1568,10 +1568,13 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  * For now this just excludes isolated CPUs, but could be used to
  * exclude other special cases in the future.
  */
-int init_sched_domains(const struct cpumask *cpu_map)
+int sched_init_domains(const struct cpumask *cpu_map)
 {
 	int err;
 
+	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
 	arch_update_cpu_topology();
 	ndoms_cur = 1;
 	doms_cur = alloc_sched_domains(ndoms_cur);
-- 
cgit v1.3-14-g43fede


From 005f874dd2843116e2ea079e3679f4f318f12fee Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 26 Apr 2017 17:35:35 +0200
Subject: sched/topology: Add sched_group_capacity debugging

Add sgc::id to make it easier to spot domain construction issues.

Take the opportunity to slightly rework the group printing, because
adding more "(id: %d)" strings makes the entire thing very hard to
read. Also the individual groups are very hard to separate, so add
explicit visual grouping, which allows replacing all the "(%s: %d)"
format things with shorter "%s=%d" variants.

Then fix up some inconsistencies in surrounding prints for domains.

The end result looks like:

  [] CPU0 attaching sched-domain(s):
  []  domain-0: span=0,4 level=DIE
  []   groups: 0:{ span=0 }, 4:{ span=4 }
  []   domain-1: span=0-1,3-5,7 level=NUMA
  []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
  []    domain-2: span=0-7 level=NUMA
  []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h    |  4 ++++
 kernel/sched/topology.c | 25 +++++++++++++++----------
 2 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6e1eae717a24..4312b2adfb02 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1023,6 +1023,10 @@ struct sched_group_capacity {
 	unsigned long next_update;
 	int imbalance; /* XXX unrelated to capacity but shared group state */
 
+#ifdef CONFIG_SCHED_DEBUG
+	int id;
+#endif
+
 	unsigned long cpumask[0]; /* iteration mask */
 };
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a4b868c76f3c..12af4b157928 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -35,7 +35,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 	cpumask_clear(groupmask);
 
-	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
 
 	if (!(sd->flags & SD_LOAD_BALANCE)) {
 		printk("does not load-balance\n");
@@ -45,7 +45,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		return -1;
 	}
 
-	printk(KERN_CONT "span %*pbl level %s\n",
+	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
 	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
@@ -80,18 +80,17 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-		printk(KERN_CONT " %*pbl",
-		       cpumask_pr_args(sched_group_cpus(group)));
+		printk(KERN_CONT " %d:{ span=%*pbl",
+				group->sgc->id,
+				cpumask_pr_args(sched_group_cpus(group)));
 
 		if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) {
-			printk(KERN_CONT " (mask: %*pbl)",
+			printk(KERN_CONT " mask=%*pbl",
 				cpumask_pr_args(sched_group_mask(group)));
 		}
 
-		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
-			printk(KERN_CONT " (cpu_capacity: %lu)",
-				group->sgc->capacity);
-		}
+		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
+			printk(KERN_CONT " cap=%lu", group->sgc->capacity);
 
 		if (group == sd->groups && sd->child &&
 		    !cpumask_equal(sched_domain_span(sd->child),
@@ -99,6 +98,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
 		}
 
+		printk(KERN_CONT " }");
+
 		group = group->next;
 
 		if (group != sd->groups)
@@ -129,7 +130,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 		return;
 	}
 
-	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+	printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
 
 	for (;;) {
 		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
@@ -1356,6 +1357,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 			if (!sgc)
 				return -ENOMEM;
 
+#ifdef CONFIG_SCHED_DEBUG
+			sgc->id = j;
+#endif
+
 			*per_cpu_ptr(sdd->sgc, j) = sgc;
 		}
 	}
-- 
cgit v1.3-14-g43fede


From 1676330ecfa840113a37b25a49afda068380d19c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 Apr 2017 14:31:11 +0200
Subject: sched/topology: Fix overlapping sched_group_capacity

When building the overlapping groups we need to attach a consistent
sched_group_capacity structure. That is, all 'identical' sched_group's
should have the _same_ sched_group_capacity.

This can (once again) be demonstrated with a topology like:

  node   0   1   2   3
    0:  10  20  30  20
    1:  20  10  20  30
    2:  30  20  10  20
    3:  20  30  20  10

But we need at least 2 CPUs per node for this to show up, after all,
if there is only one CPU per node, our CPU @i is by definition a
unique CPU that reaches this domain (aka balance-cpu).

Given the above NUMA topo and 2 CPUs per node:

  [] CPU0 attaching sched-domain(s):
  []  domain-0: span=0,4 level=DIE
  []   groups: 0:{ span=0 }, 4:{ span=4 }
  []   domain-1: span=0-1,3-5,7 level=NUMA
  []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
  []    domain-2: span=0-7 level=NUMA
  []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }
  [] CPU1 attaching sched-domain(s):
  []  domain-0: span=1,5 level=DIE
  []   groups: 1:{ span=1 }, 5:{ span=5 }
  []   domain-1: span=0-2,4-6 level=NUMA
  []    groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 4:{ span=0,4 mask=0,4 cap=2048 }
  []    domain-2: span=0-7 level=NUMA
  []     groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 }

Observe how CPU0-domain1-group0 and CPU1-domain1-group4 are the
'same' but have a different id (0 vs 4).

To fix this, use the group balance CPU to select the SGC. This means
we have to compute the full mask for each CPU and require a second
temporary mask to store the group mask in (it otherwise lives in the
SGC).

The fixed topology looks like:

  [] CPU0 attaching sched-domain(s):
  []  domain-0: span=0,4 level=DIE
  []   groups: 0:{ span=0 }, 4:{ span=4 }
  []   domain-1: span=0-1,3-5,7 level=NUMA
  []    groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 }
  []    domain-2: span=0-7 level=NUMA
  []     groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 }
  [] CPU1 attaching sched-domain(s):
  []  domain-0: span=1,5 level=DIE
  []   groups: 1:{ span=1 }, 5:{ span=5 }
  []   domain-1: span=0-2,4-6 level=NUMA
  []    groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 0:{ span=0,4 mask=0,4 cap=2048 }
  []    domain-2: span=0-7 level=NUMA
  []     groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 }

Debugged-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 12af4b157928..4f6fa7553d92 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex);
 
 /* Protected by sched_domains_mutex: */
 cpumask_var_t sched_domains_tmpmask;
+cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -500,13 +501,16 @@ enum s_alloc {
  * Only CPUs that can arrive at this group should be considered to continue
  * balancing.
  */
-static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+static void
+build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
 {
 	const struct cpumask *sg_span = sched_group_cpus(sg);
 	struct sd_data *sdd = sd->private;
 	struct sched_domain *sibling;
 	int i;
 
+	cpumask_clear(mask);
+
 	for_each_cpu(i, sg_span) {
 		sibling = *per_cpu_ptr(sdd->sd, i);
 
@@ -522,11 +526,11 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
 		if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
 			continue;
 
-		cpumask_set_cpu(i, sched_group_mask(sg));
+		cpumask_set_cpu(i, mask);
 	}
 
 	/* We must not have empty masks here */
-	WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
+	WARN_ON_ONCE(cpumask_empty(mask));
 }
 
 /*
@@ -560,14 +564,19 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
 }
 
 static void init_overlap_sched_group(struct sched_domain *sd,
-				     struct sched_group *sg, int cpu)
+				     struct sched_group *sg)
 {
+	struct cpumask *mask = sched_domains_tmpmask2;
 	struct sd_data *sdd = sd->private;
 	struct cpumask *sg_span;
+	int cpu;
+
+	build_group_mask(sd, sg, mask);
+	cpu = cpumask_first_and(sched_group_cpus(sg), mask);
 
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 	if (atomic_inc_return(&sg->sgc->ref) == 1)
-		build_group_mask(sd, sg);
+		cpumask_copy(sched_group_mask(sg), mask);
 
 	/*
 	 * Initialize sgc->capacity such that even if we mess up the
@@ -619,7 +628,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		sg_span = sched_group_cpus(sg);
 		cpumask_or(covered, covered, sg_span);
 
-		init_overlap_sched_group(sd, sg, i);
+		init_overlap_sched_group(sd, sg);
 
 		if (!first)
 			first = sg;
@@ -1578,6 +1587,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
 	int err;
 
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
+	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
 	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 	arch_update_cpu_topology();
-- 
cgit v1.3-14-g43fede


From 35a566e6e8a18c3bc16229abeac146a707b8f216 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 28 Apr 2017 10:54:26 +0200
Subject: sched/topology: Add a few comments

Try and describe what this code is about..

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 193 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 4f6fa7553d92..b2790830e184 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -494,12 +494,128 @@ enum s_alloc {
 	sa_none,
 };
 
+/*
+ * Return the canonical balance CPU for this group, this is the first CPU
+ * of this group that's also in the iteration mask.
+ *
+ * The iteration mask are all those CPUs that could actually end up at this
+ * group. See build_group_mask().
+ *
+ * Also see should_we_balance().
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
+
+/*
+ * NUMA topology (first read the regular topology blurb below)
+ *
+ * Given a node-distance table, for example:
+ *
+ *   node   0   1   2   3
+ *     0:  10  20  30  20
+ *     1:  20  10  20  30
+ *     2:  30  20  10  20
+ *     3:  20  30  20  10
+ *
+ * which represents a 4 node ring topology like:
+ *
+ *   0 ----- 1
+ *   |       |
+ *   |       |
+ *   |       |
+ *   3 ----- 2
+ *
+ * We want to construct domains and groups to represent this. The way we go
+ * about doing this is to build the domains on 'hops'. For each NUMA level we
+ * construct the mask of all nodes reachable in @level hops.
+ *
+ * For the above NUMA topology that gives 3 levels:
+ *
+ * NUMA-2	0-3		0-3		0-3		0-3
+ *  groups:	{0-1,3},{1-3}	{0-2},{0,2-3}	{1-3},{0-1,3}	{0,2-3},{0-2}
+ *
+ * NUMA-1	0-1,3		0-2		1-3		0,2-3
+ *  groups:	{0},{1},{3}	{0},{1},{2}	{1},{2},{3}	{0},{2},{3}
+ *
+ * NUMA-0	0		1		2		3
+ *
+ *
+ * As can be seen; things don't nicely line up as with the regular topology.
+ * When we iterate a domain in child domain chunks some nodes can be
+ * represented multiple times -- hence the "overlap" naming for this part of
+ * the topology.
+ *
+ * In order to minimize this overlap, we only build enough groups to cover the
+ * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
+ *
+ * Because:
+ *
+ *  - the first group of each domain is its child domain; this
+ *    gets us the first 0-1,3
+ *  - the only uncovered node is 2, whose child domain is 1-3.
+ *
+ * However, because of the overlap, computing a unique CPU for each group is
+ * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
+ * groups include the CPUs of Node-0, while those CPUs would not in fact ever
+ * end up at those groups (they would end up in group: 0-1,3).
+ *
+ * To correct this we have to introduce the group iteration mask. This mask
+ * will contain those CPUs in the group that can reach this group given the
+ * (child) domain tree.
+ *
+ * With this we can once again compute balance_cpu and sched_group_capacity
+ * relations.
+ *
+ * XXX include words on how balance_cpu is unique and therefore can be
+ * used for sched_group_capacity links.
+ *
+ *
+ * Another 'interesting' topology is:
+ *
+ *   node   0   1   2   3
+ *     0:  10  20  20  30
+ *     1:  20  10  20  20
+ *     2:  20  20  10  20
+ *     3:  30  20  20  10
+ *
+ * Which looks a little like:
+ *
+ *   0 ----- 1
+ *   |     / |
+ *   |   /   |
+ *   | /     |
+ *   2 ----- 3
+ *
+ * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
+ * are not.
+ *
+ * This leads to a few particularly weird cases where the sched_domain's are
+ * not of the same number for each cpu. Consider:
+ *
+ * NUMA-2	0-3						0-3
+ *  groups:	{0-2},{1-3}					{1-3},{0-2}
+ *
+ * NUMA-1	0-2		0-3		0-3		1-3
+ *
+ * NUMA-0	0		1		2		3
+ *
+ */
+
+
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
  *
  * Only CPUs that can arrive at this group should be considered to continue
  * balancing.
+ *
+ * We do this during the group creation pass, therefore the group information
+ * isn't complete yet, however since each group represents a (child) domain we
+ * can fully construct this using the sched_domain bits (which are already
+ * complete).
  */
 static void
 build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
@@ -534,14 +650,10 @@ build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask
 }
 
 /*
- * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * XXX: This creates per-node group entries; since the load-balancer will
+ * immediately access remote memory to construct this group's load-balance
+ * statistics having the groups node local is of dubious benefit.
  */
-int group_balance_cpu(struct sched_group *sg)
-{
-	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
-}
-
 static struct sched_group *
 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
 {
@@ -577,6 +689,8 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 	if (atomic_inc_return(&sg->sgc->ref) == 1)
 		cpumask_copy(sched_group_mask(sg), mask);
+	else
+		WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask));
 
 	/*
 	 * Initialize sgc->capacity such that even if we mess up the
@@ -647,6 +761,78 @@ fail:
 	return -ENOMEM;
 }
 
+
+/*
+ * Package topology (also see the load-balance blurb in fair.c)
+ *
+ * The scheduler builds a tree structure to represent a number of important
+ * topology features. By default (default_topology[]) these include:
+ *
+ *  - Simultaneous multithreading (SMT)
+ *  - Multi-Core Cache (MC)
+ *  - Package (DIE)
+ *
+ * Where the last one more or less denotes everything up to a NUMA node.
+ *
+ * The tree consists of 3 primary data structures:
+ *
+ *	sched_domain -> sched_group -> sched_group_capacity
+ *	    ^ ^             ^ ^
+ *          `-'             `-'
+ *
+ * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * denote the ever growing mask of CPUs belonging to that level of topology.
+ *
+ * Each sched_domain has a circular (double) linked list of sched_group's, each
+ * denoting the domains of the level below (or individual CPUs in case of the
+ * first domain level). The sched_group linked by a sched_domain includes the
+ * CPU of that sched_domain [*].
+ *
+ * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
+ *
+ * CPU   0   1   2   3   4   5   6   7
+ *
+ * DIE  [                             ]
+ * MC   [             ] [             ]
+ * SMT  [     ] [     ] [     ] [     ]
+ *
+ *  - or -
+ *
+ * DIE  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
+ * MC	0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
+ * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
+ *
+ * CPU   0   1   2   3   4   5   6   7
+ *
+ * One way to think about it is: sched_domain moves you up and down among these
+ * topology levels, while sched_group moves you sideways through it, at child
+ * domain granularity.
+ *
+ * sched_group_capacity ensures each unique sched_group has shared storage.
+ *
+ * There are two related construction problems, both require a CPU that
+ * uniquely identifies each group (for a given domain):
+ *
+ *  - The first is the balance_cpu (see should_we_balance() and the
+ *    load-balance blurb in fair.c); for each group we only want 1 CPU to
+ *    continue balancing at a higher domain.
+ *
+ *  - The second is the sched_group_capacity; we want all identical groups
+ *    to share a single sched_group_capacity.
+ *
+ * Since these topologies are exclusive by construction. That is, it's
+ * impossible for an SMT thread to belong to multiple cores, and cores to
+ * be part of multiple caches. There is a very clear and unique location
+ * for each CPU in the hierarchy.
+ *
+ * Therefore computing a unique CPU for each group is trivial (the iteration
+ * mask is redundant and set all 1s; all CPUs in a group will end up at _that_
+ * group), we can simply pick the first CPU in each group.
+ *
+ *
+ * [*] in other words, the first group of each domain is its child domain.
+ */
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
-- 
cgit v1.3-14-g43fede


From 0c0e776a9b0f21ac41d4c6982c57493928524dba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 3 May 2017 14:18:06 +0200
Subject: sched/topology: Rewrite get_group()

We want to attain:

  sg_cpus() & sg_mask() == sg_mask()

for this to be so we must initialize sg_mask() to sg_cpus() for the
!overlap case (it's currently cpumask_setall()).

Since the code makes my head hurt bad, rewrite it into a simpler form,
inspired by the now fixed overlap code.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 46 ++++++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b2790830e184..dea1950b42a5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -833,23 +833,34 @@ fail:
  * [*] in other words, the first group of each domain is its child domain.
  */
 
-static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+static struct sched_group *get_group(int cpu, struct sd_data *sdd)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
+	struct sched_group *sg;
 
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg) {
-		*sg = *per_cpu_ptr(sdd->sg, cpu);
-		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+	sg = *per_cpu_ptr(sdd->sg, cpu);
+	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+
+	/* For claim_allocations: */
+	atomic_inc(&sg->ref);
+	atomic_inc(&sg->sgc->ref);
 
-		/* For claim_allocations: */
-		atomic_set(&(*sg)->sgc->ref, 1);
+	if (child) {
+		cpumask_copy(sched_group_cpus(sg), sched_domain_span(child));
+		cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg));
+	} else {
+		cpumask_set_cpu(cpu, sched_group_cpus(sg));
+		cpumask_set_cpu(cpu, sched_group_mask(sg));
 	}
 
-	return cpu;
+	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg));
+	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+
+	return sg;
 }
 
 /*
@@ -868,34 +879,20 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 	struct cpumask *covered;
 	int i;
 
-	get_group(cpu, sdd, &sd->groups);
-	atomic_inc(&sd->groups->ref);
-
-	if (cpu != cpumask_first(span))
-		return 0;
-
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
 	cpumask_clear(covered);
 
-	for_each_cpu(i, span) {
+	for_each_cpu_wrap(i, span, cpu) {
 		struct sched_group *sg;
-		int group, j;
 
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
-		group = get_group(i, sdd, &sg);
-		cpumask_setall(sched_group_mask(sg));
+		sg = get_group(i, sdd);
 
-		for_each_cpu(j, span) {
-			if (get_group(j, sdd, NULL) != group)
-				continue;
-
-			cpumask_set_cpu(j, covered);
-			cpumask_set_cpu(j, sched_group_cpus(sg));
-		}
+		cpumask_or(covered, covered, sched_group_cpus(sg));
 
 		if (!first)
 			first = sg;
@@ -904,6 +901,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 		last = sg;
 	}
 	last->next = first;
+	sd->groups = first;
 
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From af218122b103900fa33d408aea0c2468791e698c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 May 2017 08:51:05 +0200
Subject: sched/topology: Simplify sched_group_mask() usage

While writing the comments, it occurred to me that:

  sg_cpus & sg_mask == sg_mask

at least conceptually; the !overlap case sets the all 1s mask. If we
correct that we can simplify things and directly use sg_mask.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 7 +++----
 kernel/sched/topology.c | 5 +++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f80c825e2b43..1eb32d4513ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7996,7 +7996,7 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
 	struct sched_group *sg = env->sd->groups;
-	struct cpumask *sg_cpus, *sg_mask;
+	struct cpumask *sg_mask;
 	int cpu, balance_cpu = -1;
 
 	/*
@@ -8006,11 +8006,10 @@ static int should_we_balance(struct lb_env *env)
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 
-	sg_cpus = sched_group_cpus(sg);
 	sg_mask = sched_group_mask(sg);
 	/* Try to find first idle cpu */
-	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
-		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+	for_each_cpu_and(cpu, sg_mask, env->cpus) {
+		if (!idle_cpu(cpu))
 			continue;
 
 		balance_cpu = cpu;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dea1950b42a5..bf53a99eb511 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -85,7 +85,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				group->sgc->id,
 				cpumask_pr_args(sched_group_cpus(group)));
 
-		if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) {
+		if ((sd->flags & SD_OVERLAP) &&
+		    !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) {
 			printk(KERN_CONT " mask=%*pbl",
 				cpumask_pr_args(sched_group_mask(group)));
 		}
@@ -505,7 +506,7 @@ enum s_alloc {
  */
 int group_balance_cpu(struct sched_group *sg)
 {
-	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+	return cpumask_first(sched_group_mask(sg));
 }
 
 
-- 
cgit v1.3-14-g43fede


From e5c14b1fb89213ff718261e6fb1bb29c5ffbbe99 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 May 2017 10:47:02 +0200
Subject: sched/topology: Rename sched_group_mask()

Since sched_group_mask() is now an independent cpumask (it no longer
masks sched_group_cpus()), rename the thing.

Suggested-by: Lauro Ramos Venancio <lvenanci@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     |  4 +---
 kernel/sched/sched.h    |  7 +++----
 kernel/sched/topology.c | 33 +++++++++++++++------------------
 3 files changed, 19 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1eb32d4513ea..a7d84c8a7881 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7996,7 +7996,6 @@ static int active_load_balance_cpu_stop(void *data);
 static int should_we_balance(struct lb_env *env)
 {
 	struct sched_group *sg = env->sd->groups;
-	struct cpumask *sg_mask;
 	int cpu, balance_cpu = -1;
 
 	/*
@@ -8006,9 +8005,8 @@ static int should_we_balance(struct lb_env *env)
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 
-	sg_mask = sched_group_mask(sg);
 	/* Try to find first idle cpu */
-	for_each_cpu_and(cpu, sg_mask, env->cpus) {
+	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
 		if (!idle_cpu(cpu))
 			continue;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4312b2adfb02..f7c70575ae34 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1027,7 +1027,7 @@ struct sched_group_capacity {
 	int id;
 #endif
 
-	unsigned long cpumask[0]; /* iteration mask */
+	unsigned long cpumask[0]; /* balance mask */
 };
 
 struct sched_group {
@@ -1054,10 +1054,9 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
 }
 
 /*
- * cpumask masking which cpus in the group are allowed to iterate up the domain
- * tree.
+ * See build_balance_mask().
  */
-static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+static inline struct cpumask *group_balance_mask(struct sched_group *sg)
 {
 	return to_cpumask(sg->sgc->cpumask);
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index bf53a99eb511..070191f02035 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -86,9 +86,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				cpumask_pr_args(sched_group_cpus(group)));
 
 		if ((sd->flags & SD_OVERLAP) &&
-		    !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) {
+		    !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) {
 			printk(KERN_CONT " mask=%*pbl",
-				cpumask_pr_args(sched_group_mask(group)));
+				cpumask_pr_args(group_balance_mask(group)));
 		}
 
 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
@@ -497,16 +497,16 @@ enum s_alloc {
 
 /*
  * Return the canonical balance CPU for this group, this is the first CPU
- * of this group that's also in the iteration mask.
+ * of this group that's also in the balance mask.
  *
- * The iteration mask are all those CPUs that could actually end up at this
- * group. See build_group_mask().
+ * The balance mask are all those CPUs that could actually end up at this
+ * group. See build_balance_mask().
  *
  * Also see should_we_balance().
  */
 int group_balance_cpu(struct sched_group *sg)
 {
-	return cpumask_first(sched_group_mask(sg));
+	return cpumask_first(group_balance_mask(sg));
 }
 
 
@@ -563,7 +563,7 @@ int group_balance_cpu(struct sched_group *sg)
  * groups include the CPUs of Node-0, while those CPUs would not in fact ever
  * end up at those groups (they would end up in group: 0-1,3).
  *
- * To correct this we have to introduce the group iteration mask. This mask
+ * To correct this we have to introduce the group balance mask. This mask
  * will contain those CPUs in the group that can reach this group given the
  * (child) domain tree.
  *
@@ -607,11 +607,8 @@ int group_balance_cpu(struct sched_group *sg)
 
 
 /*
- * Build an iteration mask that can exclude certain CPUs from the upwards
- * domain traversal.
- *
- * Only CPUs that can arrive at this group should be considered to continue
- * balancing.
+ * Build the balance mask; it contains only those CPUs that can arrive at this
+ * group and should be considered to continue balancing.
  *
  * We do this during the group creation pass, therefore the group information
  * isn't complete yet, however since each group represents a (child) domain we
@@ -619,7 +616,7 @@ int group_balance_cpu(struct sched_group *sg)
  * complete).
  */
 static void
-build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
+build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
 {
 	const struct cpumask *sg_span = sched_group_cpus(sg);
 	struct sd_data *sdd = sd->private;
@@ -684,14 +681,14 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 	struct cpumask *sg_span;
 	int cpu;
 
-	build_group_mask(sd, sg, mask);
+	build_balance_mask(sd, sg, mask);
 	cpu = cpumask_first_and(sched_group_cpus(sg), mask);
 
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 	if (atomic_inc_return(&sg->sgc->ref) == 1)
-		cpumask_copy(sched_group_mask(sg), mask);
+		cpumask_copy(group_balance_mask(sg), mask);
 	else
-		WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask));
+		WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
 
 	/*
 	 * Initialize sgc->capacity such that even if we mess up the
@@ -852,10 +849,10 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
 
 	if (child) {
 		cpumask_copy(sched_group_cpus(sg), sched_domain_span(child));
-		cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg));
+		cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg));
 	} else {
 		cpumask_set_cpu(cpu, sched_group_cpus(sg));
-		cpumask_set_cpu(cpu, sched_group_mask(sg));
+		cpumask_set_cpu(cpu, group_balance_mask(sg));
 	}
 
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg));
-- 
cgit v1.3-14-g43fede


From ae4df9d6c935105857d9d166b615e3f17531ce6b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 1 May 2017 11:03:12 +0200
Subject: sched/topology: Rename sched_group_cpus()

There's a discrepancy in naming between the sched_domain and
sched_group cpumask accessor. Since we're doing changes, fix it.

  $ git grep sched_group_cpus | wc -l
  28
  $ git grep sched_domain_span | wc -l
  38

Suggests changing sched_group_cpus() into sched_group_span():

  for i  in `git grep -l sched_group_cpus`
  do
    sed -ie 's/sched_group_cpus/sched_group_span/g' $i
  done

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 20 ++++++++++----------
 kernel/sched/sched.h    |  4 ++--
 kernel/sched/topology.c | 38 +++++++++++++++++++-------------------
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a7d84c8a7881..eede181b4530 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5484,12 +5484,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		int i;
 
 		/* Skip over this group if it has no CPUs allowed */
-		if (!cpumask_intersects(sched_group_cpus(group),
+		if (!cpumask_intersects(sched_group_span(group),
 					&p->cpus_allowed))
 			continue;
 
 		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
+					       sched_group_span(group));
 
 		/*
 		 * Tally up the load of all CPUs in the group and find
@@ -5499,7 +5499,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		runnable_load = 0;
 		max_spare_cap = 0;
 
-		for_each_cpu(i, sched_group_cpus(group)) {
+		for_each_cpu(i, sched_group_span(group)) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
 				load = source_load(i, load_idx);
@@ -5602,10 +5602,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
 	/* Check if we have any choice: */
 	if (group->group_weight == 1)
-		return cpumask_first(sched_group_cpus(group));
+		return cpumask_first(sched_group_span(group));
 
 	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
 		if (idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
@@ -7192,7 +7192,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		 * span the current group.
 		 */
 
-		for_each_cpu(cpu, sched_group_cpus(sdg)) {
+		for_each_cpu(cpu, sched_group_span(sdg)) {
 			struct sched_group_capacity *sgc;
 			struct rq *rq = cpu_rq(cpu);
 
@@ -7371,7 +7371,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	memset(sgs, 0, sizeof(*sgs));
 
-	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
 		/* Bias balancing toward cpus of our domain */
@@ -7535,7 +7535,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 
-		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
+		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
 		if (local_group) {
 			sds->local = sg;
 			sgs = local;
@@ -7890,7 +7890,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 	unsigned long busiest_load = 0, busiest_capacity = 1;
 	int i;
 
-	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
+	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		unsigned long capacity, wl;
 		enum fbq_type rt;
 
@@ -8043,7 +8043,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.sd		= sd,
 		.dst_cpu	= this_cpu,
 		.dst_rq		= this_rq,
-		.dst_grpmask    = sched_group_cpus(sd->groups),
+		.dst_grpmask    = sched_group_span(sd->groups),
 		.idle		= idle,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f7c70575ae34..f8cf1d87f065 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1048,7 +1048,7 @@ struct sched_group {
 	unsigned long cpumask[0];
 };
 
-static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+static inline struct cpumask *sched_group_span(struct sched_group *sg)
 {
 	return to_cpumask(sg->cpumask);
 }
@@ -1067,7 +1067,7 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
  */
 static inline unsigned int group_first_cpu(struct sched_group *group)
 {
-	return cpumask_first(sched_group_cpus(group));
+	return cpumask_first(sched_group_span(group));
 }
 
 extern int group_balance_cpu(struct sched_group *sg);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 070191f02035..79895aec281e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -53,7 +53,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		printk(KERN_ERR "ERROR: domain->span does not contain "
 				"CPU%d\n", cpu);
 	}
-	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
+	if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain"
 				" CPU%d\n", cpu);
 	}
@@ -66,27 +66,27 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!cpumask_weight(sched_group_cpus(group))) {
+		if (!cpumask_weight(sched_group_span(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: empty group\n");
 			break;
 		}
 
 		if (!(sd->flags & SD_OVERLAP) &&
-		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		    cpumask_intersects(groupmask, sched_group_span(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
 		}
 
-		cpumask_or(groupmask, groupmask, sched_group_cpus(group));
+		cpumask_or(groupmask, groupmask, sched_group_span(group));
 
 		printk(KERN_CONT " %d:{ span=%*pbl",
 				group->sgc->id,
-				cpumask_pr_args(sched_group_cpus(group)));
+				cpumask_pr_args(sched_group_span(group)));
 
 		if ((sd->flags & SD_OVERLAP) &&
-		    !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) {
+		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
 			printk(KERN_CONT " mask=%*pbl",
 				cpumask_pr_args(group_balance_mask(group)));
 		}
@@ -96,7 +96,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 		if (group == sd->groups && sd->child &&
 		    !cpumask_equal(sched_domain_span(sd->child),
-				   sched_group_cpus(group))) {
+				   sched_group_span(group))) {
 			printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
 		}
 
@@ -618,7 +618,7 @@ int group_balance_cpu(struct sched_group *sg)
 static void
 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
 {
-	const struct cpumask *sg_span = sched_group_cpus(sg);
+	const struct cpumask *sg_span = sched_group_span(sg);
 	struct sd_data *sdd = sd->private;
 	struct sched_domain *sibling;
 	int i;
@@ -664,7 +664,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
 	if (!sg)
 		return NULL;
 
-	sg_span = sched_group_cpus(sg);
+	sg_span = sched_group_span(sg);
 	if (sd->child)
 		cpumask_copy(sg_span, sched_domain_span(sd->child));
 	else
@@ -682,7 +682,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 	int cpu;
 
 	build_balance_mask(sd, sg, mask);
-	cpu = cpumask_first_and(sched_group_cpus(sg), mask);
+	cpu = cpumask_first_and(sched_group_span(sg), mask);
 
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 	if (atomic_inc_return(&sg->sgc->ref) == 1)
@@ -695,7 +695,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
 	 * domains and no possible iteration will get us here, we won't
 	 * die on a /0 trap.
 	 */
-	sg_span = sched_group_cpus(sg);
+	sg_span = sched_group_span(sg);
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
 }
@@ -737,7 +737,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (!sg)
 			goto fail;
 
-		sg_span = sched_group_cpus(sg);
+		sg_span = sched_group_span(sg);
 		cpumask_or(covered, covered, sg_span);
 
 		init_overlap_sched_group(sd, sg);
@@ -848,14 +848,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
 	atomic_inc(&sg->sgc->ref);
 
 	if (child) {
-		cpumask_copy(sched_group_cpus(sg), sched_domain_span(child));
-		cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg));
+		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
+		cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
 	} else {
-		cpumask_set_cpu(cpu, sched_group_cpus(sg));
+		cpumask_set_cpu(cpu, sched_group_span(sg));
 		cpumask_set_cpu(cpu, group_balance_mask(sg));
 	}
 
-	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg));
+	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
 	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
 
 	return sg;
@@ -890,7 +890,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		sg = get_group(i, sdd);
 
-		cpumask_or(covered, covered, sched_group_cpus(sg));
+		cpumask_or(covered, covered, sched_group_span(sg));
 
 		if (!first)
 			first = sg;
@@ -923,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 	do {
 		int cpu, max_cpu = -1;
 
-		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg->group_weight = cpumask_weight(sched_group_span(sg));
 
 		if (!(sd->flags & SD_ASYM_PACKING))
 			goto next;
 
-		for_each_cpu(cpu, sched_group_cpus(sg)) {
+		for_each_cpu(cpu, sched_group_span(sg)) {
 			if (max_cpu < 0)
 				max_cpu = cpu;
 			else if (sched_asym_prefer(cpu, max_cpu))
-- 
cgit v1.3-14-g43fede


From 502ce005ab95d5d9481768649dbab808845b24d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 May 2017 15:31:22 +0200
Subject: sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all
 cfs_rqs

In order to allow leaf_cfs_rq_list to remove entries switch the
bandwidth hotplug code over to the task_groups list.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170504133122.a6qjlj3hlblbjxux@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eede181b4530..c8fa777de76c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4642,24 +4642,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+/*
+ * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ *
+ * The race is harmless, since modifying bandwidth settings of unhooked group
+ * bits doesn't do much.
+ */
+
+/* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
 		raw_spin_lock(&cfs_b->lock);
 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
 		raw_spin_unlock(&cfs_b->lock);
 	}
+	rcu_read_unlock();
 }
 
+/* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+
+	lockdep_assert_held(&rq->lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -4677,6 +4696,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
+	rcu_read_unlock();
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
-- 
cgit v1.3-14-g43fede


From a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Apr 2017 17:43:50 -0700
Subject: sched/fair: Fix O(nr_cgroups) in load balance path

Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
live cfs_rqs which have ever been active on the CPU; unfortunately,
this makes update_blocked_averages() O(# total cgroups) which isn't
scalable at all.

This shows up as a small CPU consumption and scheduling latency
increase in the load balancing path in systems with CPU controller
enabled across most cgroups.  In an edge case where temporary cgroups
were leaking, this caused the kernel to consume several tens of
percent of CPU cycles running update_blocked_averages(), with each run
taking multiple milliseconds.

This patch fixes the issue by taking empty and fully decayed cfs_rqs
off the rq->leaf_cfs_rq_list.

Signed-off-by: Tejun Heo <tj@kernel.org>
[ Added cfs_rq_is_decayed() ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c8fa777de76c..219fe58e3023 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	struct rq_flags rf;
 
 	rq_lock_irqsave(rq, &rf);
@@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu)
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
 			update_load_avg(se, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups.  Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
-- 
cgit v1.3-14-g43fede


From 7b4ff1adb57ad96d8f12a05d8c661a3d8c4d2be1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Thu, 11 May 2017 10:17:45 -0300
Subject: mutex, futex: adjust kernel-doc markups to generate ReST

There are a few issues in some kernel-doc markups that were
causing trouble with kernel-doc output in ReST format:

./kernel/futex.c:492: WARNING: Inline emphasis start-string without end-string.
./kernel/futex.c:1264: WARNING: Block quote ends without a blank line; unexpected unindent.
./kernel/futex.c:1721: WARNING: Block quote ends without a blank line; unexpected unindent.
./kernel/futex.c:2338: WARNING: Block quote ends without a blank line; unexpected unindent.
./kernel/futex.c:2426: WARNING: Block quote ends without a blank line; unexpected unindent.
./kernel/futex.c:2899: WARNING: Block quote ends without a blank line; unexpected unindent.
./kernel/futex.c:2972: WARNING: Block quote ends without a blank line; unexpected unindent.

Fix them.

No functional changes.

Acked-by: Darren Hart (VMware) <dvhart@infradead.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 include/linux/mutex.h  |  6 +++---
 kernel/futex.c         | 40 ++++++++++++++++++++--------------------
 kernel/locking/mutex.c |  6 +++---
 3 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 1127fe31645d..ffcba1f337da 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -214,9 +214,9 @@ enum mutex_trylock_recursive_enum {
  * raisins, and once those are gone this will be removed.
  *
  * Returns:
- *  MUTEX_TRYLOCK_FAILED    - trylock failed,
- *  MUTEX_TRYLOCK_SUCCESS   - lock acquired,
- *  MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
+ *  - MUTEX_TRYLOCK_FAILED    - trylock failed,
+ *  - MUTEX_TRYLOCK_SUCCESS   - lock acquired,
+ *  - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock.
  */
 static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum
 mutex_trylock_recursive(struct mutex *lock)
diff --git a/kernel/futex.c b/kernel/futex.c
index 357348a6cf6b..b8ae87d227da 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key)
  *
  * Return: a negative error code or 0
  *
- * The key words are stored in *key on success.
+ * The key words are stored in @key on success.
  *
  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
@@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Return:
- *  0 - ready to wait;
- *  1 - acquired the lock;
- * <0 - error
+ *  -  0 - ready to wait;
+ *  -  1 - acquired the lock;
+ *  - <0 - error
  *
  * The hb->lock and futex_key refs shall be held by the caller.
  */
@@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * hb1 and hb2 must be held by the caller.
  *
  * Return:
- *  0 - failed to acquire the lock atomically;
- * >0 - acquired the lock, return value is vpid of the top_waiter
- * <0 - error
+ *  -  0 - failed to acquire the lock atomically;
+ *  - >0 - acquired the lock, return value is vpid of the top_waiter
+ *  - <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 				 struct futex_hash_bucket *hb1,
@@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  * uaddr2 atomically on behalf of the top waiter.
  *
  * Return:
- * >=0 - on success, the number of tasks requeued or woken;
- *  <0 - on error
+ *  - >=0 - on success, the number of tasks requeued or woken;
+ *  -  <0 - on error
  */
 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
@@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
  * be paired with exactly one earlier call to queue_me().
  *
  * Return:
- *   1 - if the futex_q was still queued (and we removed unqueued it);
- *   0 - if the futex_q was already removed by the waking thread
+ *  - 1 - if the futex_q was still queued (and we unqueued it);
+ *  - 0 - if the futex_q was already removed by the waking thread
  */
 static int unqueue_me(struct futex_q *q)
 {
@@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart);
  * acquire the lock. Must be called with the hb lock held.
  *
  * Return:
- *  1 - success, lock taken;
- *  0 - success, lock not taken;
- * <0 - on error (-EFAULT)
+ *  -  1 - success, lock taken;
+ *  -  0 - success, lock not taken;
+ *  - <0 - on error (-EFAULT)
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
@@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
  * with no q.key reference on failure.
  *
  * Return:
- *  0 - uaddr contains val and hb has been locked;
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ *  -  0 - uaddr contains val and hb has been locked;
+ *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
  */
 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			   struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2895,8 +2895,8 @@ pi_faulted:
  * called with the hb lock held.
  *
  * Return:
- *  0 = no early wakeup detected;
- * <0 = -ETIMEDOUT or -ERESTARTNOINTR
+ *  -  0 = no early wakeup detected;
+ *  - <0 = -ETIMEDOUT or -ERESTARTNOINTR
  */
 static inline
 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
  *
  * Return:
- *  0 - On success;
- * <0 - On error
+ *  -  0 - On success;
+ *  - <0 - On error
  */
 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 				 u32 val, ktime_t *abs_time, u32 bitset,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 198527a62149..858a07590e39 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock);
  * (or statically defined) before it can be locked. memset()-ing
  * the mutex to 0 is not allowed.
  *
- * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
- *   checks that will enforce the restrictions and will also do
- *   deadlock debugging. )
+ * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging)
  *
  * This function is similar to (but not equivalent to) down().
  */
-- 
cgit v1.3-14-g43fede


From c0c6e0850514c16814c37f64a9a1bcc6c52a19ab Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date: Sun, 14 May 2017 12:03:39 -0300
Subject: irq: update genericirq book location

This book got converted from DocBook. Update its references to
point to the current location.

Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
---
 kernel/irq/chip.c    | 2 +-
 kernel/irq/handle.c  | 2 +-
 kernel/irq/irqdesc.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 686be4b73018..4188a0a7691f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -7,7 +7,7 @@
  * This file contains the core interrupt handling code, for irq-chip
  * based architectures.
  *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
  */
 
 #include <linux/irq.h>
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d3f24905852c..bbf9a7174283 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -6,7 +6,7 @@
  *
  * This file contains the core interrupt handling code.
  *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
  *
  */
 
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 00bb0aeea1d0..22e443133987 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -4,7 +4,7 @@
  *
  * This file contains the interrupt descriptor management code
  *
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
  *
  */
 #include <linux/irq.h>
-- 
cgit v1.3-14-g43fede


From ce6cf9a15d62fd7ee92f4f9bb754883bacf85a3e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 11 May 2017 16:36:19 +0200
Subject: nohz: Add hrtimer sanity check

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 64c97fc130c4..d212bb62bc08 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -771,8 +771,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	tick = expires;
 
 	/* Skip reprogram of event if its not changed */
-	if (ts->tick_stopped && (expires == dev->next_event))
-		goto out;
+	if (ts->tick_stopped) {
+		if (hrtimer_active(&ts->sched_timer))
+			WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event);
+
+		if (expires == dev->next_event)
+			goto out;
+	}
 
 	/*
 	 * nohz_stop_sched_tick can be called several times before
-- 
cgit v1.3-14-g43fede


From 411fe24e6b7c283c3a1911450cdba6dd3aaea56e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 21 Apr 2017 16:00:54 +0200
Subject: nohz: Fix collision between tick and other hrtimers, again

This restores commit:

  24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers")

... which got reverted by commit:

  558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"')

... due to a regression where CPUs spuriously stopped ticking.

The bug happened when a tick fired too early past its expected expiration:
on IRQ exit the tick was scheduled again to the same deadline but skipped
reprogramming because ts->next_tick still kept in cache the deadline.
This has been fixed now with resetting ts->next_tick from the tick
itself. Extra care has also been taken to guard against obsolete values
throughout CPU hotplug operations.

When the tick is stopped and an interrupt occurs afterward, we check on
that interrupt exit if the next tick needs to be rescheduled. If it
doesn't need any update, we don't want to do anything.

In order to check if the tick needs an update, we compare it against the
clockevent device deadline. Now that's a problem because the clockevent
device is at a lower level than the tick itself if it is implemented
on top of hrtimer.

All hrtimers share this clockevent device. So comparing the next tick
deadline against the clockevent device deadline is wrong because the
device may be programmed for another hrtimer whose deadline collides
with the tick. As a result we may end up not reprogramming the tick
accidentally.

In a worst case scenario under full dynticks mode, the tick stops firing
as it is supposed to every 1hz, leaving /proc/stat stalled:

      Task in a full dynticks CPU
      ----------------------------

      * hrtimer A is queued 2 seconds ahead
      * the tick is stopped, scheduled 1 second ahead
      * tick fires 1 second later
      * on tick exit, nohz schedules the tick 1 second ahead but sees
        the clockevent device is already programmed to that deadline,
        fooled by hrtimer A, the tick isn't rescheduled.
      * hrtimer A is cancelled before its deadline
      * tick never fires again until an interrupt happens...

In order to fix this, store the next tick deadline to the tick_sched
local structure and reuse that value later to check whether we need to
reprogram the clock after an interrupt.

On the other hand, ts->sleep_length still wants to know about the next
clock event and not just the tick, so we want to improve the related
comment to avoid confusion.

Reported-and-tested-by: Tim Wright <tim@binbash.co.uk>
Reported-and-tested-by: Pavel Machek <pavel@ucw.cz>
Reported-by: James Hartsock <hartsjc@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 37 +++++++++++++++++++++++++++++++------
 kernel/time/tick-sched.h |  2 ++
 2 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d212bb62bc08..764d2905e6a5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 		touch_softlockup_watchdog_sched();
 		if (is_idle_task(current))
 			ts->idle_jiffies++;
+		/*
+		 * In case the current tick fired too early past its expected
+		 * expiration, make sure we don't bypass the next clock reprogramming
+		 * to the same deadline.
+		 */
+		ts->next_tick = 0;
 	}
 #endif
 	update_process_times(user_mode(regs));
@@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 	else
 		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+
+	/*
+	 * Reset to make sure next tick stop doesn't get fooled by past
+	 * cached clock deadline.
+	 */
+	ts->next_tick = 0;
 }
 
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
@@ -771,12 +783,15 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	tick = expires;
 
 	/* Skip reprogram of event if its not changed */
-	if (ts->tick_stopped) {
-		if (hrtimer_active(&ts->sched_timer))
-			WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event);
-
-		if (expires == dev->next_event)
+	if (ts->tick_stopped && (expires == ts->next_tick)) {
+		/* Sanity check: make sure clockevent is actually programmed */
+		if (likely(dev->next_event <= ts->next_tick))
 			goto out;
+
+		WARN_ON_ONCE(1);
+		printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
+			    basemono, ts->next_tick, dev->next_event,
+			    hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
 	}
 
 	/*
@@ -796,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		trace_tick_stop(1, TICK_DEP_MASK_NONE);
 	}
 
+	ts->next_tick = tick;
+
 	/*
 	 * If the expiration time == KTIME_MAX, then we simply stop
 	 * the tick timer.
@@ -811,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	else
 		tick_program_event(tick, 1);
 out:
-	/* Update the estimated sleep length */
+	/*
+	 * Update the estimated sleep length until the next timer
+	 * (not only the tick).
+	 */
 	ts->sleep_length = ktime_sub(dev->next_event, now);
 	return tick;
 }
@@ -869,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
 			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		/*
+		 * Make sure the CPU doesn't get fooled by obsolete tick
+		 * deadline if it comes back online later.
+		 */
+		ts->next_tick = 0;
 		return false;
 	}
 
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index bf38226e5c17..075444e3d48e 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -27,6 +27,7 @@ enum tick_nohz_mode {
  *			timer is modified for nohz sleeps. This is necessary
  *			to resume the tick timer operation in the timeline
  *			when the CPU returns from nohz sleep.
+ * @next_tick:		Next tick to be fired when in dynticks mode.
  * @tick_stopped:	Indicator that the idle tick has been stopped
  * @idle_jiffies:	jiffies at the entry to idle for idle time accounting
  * @idle_calls:		Total number of idle calls
@@ -44,6 +45,7 @@ struct tick_sched {
 	unsigned long			check_clocks;
 	enum tick_nohz_mode		nohz_mode;
 	ktime_t				last_tick;
+	ktime_t				next_tick;
 	int				inidle;
 	int				tick_stopped;
 	unsigned long			idle_jiffies;
-- 
cgit v1.3-14-g43fede


From af777cd1b83e95138e7285fde87c795ef0ae7c4d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sat, 13 May 2017 04:51:40 -0700
Subject: doc: ReSTify credentials.txt

This updates the credentials API documentation to ReST markup and moves
it under the security subsection of kernel API documentation.

Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/security/00-INDEX        |   2 -
 Documentation/security/credentials.rst | 554 +++++++++++++++++++++++++++++++
 Documentation/security/credentials.txt | 581 ---------------------------------
 Documentation/security/index.rst       |   1 +
 include/linux/cred.h                   |   2 +-
 kernel/cred.c                          |   2 +-
 6 files changed, 557 insertions(+), 585 deletions(-)
 create mode 100644 Documentation/security/credentials.rst
 delete mode 100644 Documentation/security/credentials.txt

(limited to 'kernel')

diff --git a/Documentation/security/00-INDEX b/Documentation/security/00-INDEX
index 414235c1fcfc..c4df62a9ae5b 100644
--- a/Documentation/security/00-INDEX
+++ b/Documentation/security/00-INDEX
@@ -10,8 +10,6 @@ Yama.txt
 	- documentation on the Yama Linux Security Module.
 apparmor.txt
 	- documentation on the AppArmor security extension.
-credentials.txt
-	- documentation about credentials in Linux.
 keys-ecryptfs.txt
 	- description of the encryption keys for the ecryptfs filesystem.
 keys-request-key.txt
diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst
new file mode 100644
index 000000000000..038a7e19eff9
--- /dev/null
+++ b/Documentation/security/credentials.rst
@@ -0,0 +1,554 @@
+====================
+Credentials in Linux
+====================
+
+By: David Howells <dhowells@redhat.com>
+
+.. contents:: :local:
+
+Overview
+========
+
+There are several parts to the security check performed by Linux when one
+object acts upon another:
+
+ 1. Objects.
+
+     Objects are things in the system that may be acted upon directly by
+     userspace programs.  Linux has a variety of actionable objects, including:
+
+	- Tasks
+	- Files/inodes
+	- Sockets
+	- Message queues
+	- Shared memory segments
+	- Semaphores
+	- Keys
+
+     As a part of the description of all these objects there is a set of
+     credentials.  What's in the set depends on the type of object.
+
+ 2. Object ownership.
+
+     Amongst the credentials of most objects, there will be a subset that
+     indicates the ownership of that object.  This is used for resource
+     accounting and limitation (disk quotas and task rlimits for example).
+
+     In a standard UNIX filesystem, for instance, this will be defined by the
+     UID marked on the inode.
+
+ 3. The objective context.
+
+     Also amongst the credentials of those objects, there will be a subset that
+     indicates the 'objective context' of that object.  This may or may not be
+     the same set as in (2) - in standard UNIX files, for instance, this is
+     defined by the UID and the GID marked on the inode.
+
+     The objective context is used as part of the security calculation that is
+     carried out when an object is acted upon.
+
+ 4. Subjects.
+
+     A subject is an object that is acting upon another object.
+
+     Most of the objects in the system are inactive: they don't act on other
+     objects within the system.  Processes/tasks are the obvious exception:
+     they do stuff; they access and manipulate things.
+
+     Objects other than tasks may under some circumstances also be subjects.
+     For instance an open file may send SIGIO to a task using the UID and EUID
+     given to it by a task that called ``fcntl(F_SETOWN)`` upon it.  In this case,
+     the file struct will have a subjective context too.
+
+ 5. The subjective context.
+
+     A subject has an additional interpretation of its credentials.  A subset
+     of its credentials forms the 'subjective context'.  The subjective context
+     is used as part of the security calculation that is carried out when a
+     subject acts.
+
+     A Linux task, for example, has the FSUID, FSGID and the supplementary
+     group list for when it is acting upon a file - which are quite separate
+     from the real UID and GID that normally form the objective context of the
+     task.
+
+ 6. Actions.
+
+     Linux has a number of actions available that a subject may perform upon an
+     object.  The set of actions available depends on the nature of the subject
+     and the object.
+
+     Actions include reading, writing, creating and deleting files; forking or
+     signalling and tracing tasks.
+
+ 7. Rules, access control lists and security calculations.
+
+     When a subject acts upon an object, a security calculation is made.  This
+     involves taking the subjective context, the objective context and the
+     action, and searching one or more sets of rules to see whether the subject
+     is granted or denied permission to act in the desired manner on the
+     object, given those contexts.
+
+     There are two main sources of rules:
+
+     a. Discretionary access control (DAC):
+
+	 Sometimes the object will include sets of rules as part of its
+	 description.  This is an 'Access Control List' or 'ACL'.  A Linux
+	 file may supply more than one ACL.
+
+	 A traditional UNIX file, for example, includes a permissions mask that
+	 is an abbreviated ACL with three fixed classes of subject ('user',
+	 'group' and 'other'), each of which may be granted certain privileges
+	 ('read', 'write' and 'execute' - whatever those map to for the object
+	 in question).  UNIX file permissions do not allow the arbitrary
+	 specification of subjects, however, and so are of limited use.
+
+	 A Linux file might also sport a POSIX ACL.  This is a list of rules
+	 that grants various permissions to arbitrary subjects.
+
+     b. Mandatory access control (MAC):
+
+	 The system as a whole may have one or more sets of rules that get
+	 applied to all subjects and objects, regardless of their source.
+	 SELinux and Smack are examples of this.
+
+	 In the case of SELinux and Smack, each object is given a label as part
+	 of its credentials.  When an action is requested, they take the
+	 subject label, the object label and the action and look for a rule
+	 that says that this action is either granted or denied.
+
+
+Types of Credentials
+====================
+
+The Linux kernel supports the following types of credentials:
+
+ 1. Traditional UNIX credentials.
+
+	- Real User ID
+	- Real Group ID
+
+     The UID and GID are carried by most, if not all, Linux objects, even if in
+     some cases it has to be invented (FAT or CIFS files for example, which are
+     derived from Windows).  These (mostly) define the objective context of
+     that object, with tasks being slightly different in some cases.
+
+	- Effective, Saved and FS User ID
+	- Effective, Saved and FS Group ID
+	- Supplementary groups
+
+     These are additional credentials used by tasks only.  Usually, an
+     EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID
+     will be used as the objective.  For tasks, it should be noted that this is
+     not always true.
+
+ 2. Capabilities.
+
+	- Set of permitted capabilities
+	- Set of inheritable capabilities
+	- Set of effective capabilities
+	- Capability bounding set
+
+     These are only carried by tasks.  They indicate superior capabilities
+     granted piecemeal to a task that an ordinary task wouldn't otherwise have.
+     These are manipulated implicitly by changes to the traditional UNIX
+     credentials, but can also be manipulated directly by the ``capset()``
+     system call.
+
+     The permitted capabilities are those caps that the process might grant
+     itself to its effective or permitted sets through ``capset()``.  This
+     inheritable set might also be so constrained.
+
+     The effective capabilities are the ones that a task is actually allowed to
+     make use of itself.
+
+     The inheritable capabilities are the ones that may get passed across
+     ``execve()``.
+
+     The bounding set limits the capabilities that may be inherited across
+     ``execve()``, especially when a binary is executed that will execute as
+     UID 0.
+
+ 3. Secure management flags (securebits).
+
+     These are only carried by tasks.  These govern the way the above
+     credentials are manipulated and inherited over certain operations such as
+     execve().  They aren't used directly as objective or subjective
+     credentials.
+
+ 4. Keys and keyrings.
+
+     These are only carried by tasks.  They carry and cache security tokens
+     that don't fit into the other standard UNIX credentials.  They are for
+     making such things as network filesystem keys available to the file
+     accesses performed by processes, without the necessity of ordinary
+     programs having to know about security details involved.
+
+     Keyrings are a special type of key.  They carry sets of other keys and can
+     be searched for the desired key.  Each process may subscribe to a number
+     of keyrings:
+
+	- Per-thread keyring
+	- Per-process keyring
+	- Per-session keyring
+
+     When a process accesses a key, if not already present, it will normally be
+     cached on one of these keyrings for future accesses to find.
+
+     For more information on using keys, see Documentation/security/keys.txt.
+
+ 5. LSM
+
+     The Linux Security Module allows extra controls to be placed over the
+     operations that a task may do.  Currently Linux supports several LSM
+     options.
+
+     Some work by labelling the objects in a system and then applying sets of
+     rules (policies) that say what operations a task with one label may do to
+     an object with another label.
+
+ 6. AF_KEY
+
+     This is a socket-based approach to credential management for networking
+     stacks [RFC 2367].  It isn't discussed by this document as it doesn't
+     interact directly with task and file credentials; rather it keeps system
+     level credentials.
+
+
+When a file is opened, part of the opening task's subjective context is
+recorded in the file struct created.  This allows operations using that file
+struct to use those credentials instead of the subjective context of the task
+that issued the operation.  An example of this would be a file opened on a
+network filesystem where the credentials of the opened file should be presented
+to the server, regardless of who is actually doing a read or a write upon it.
+
+
+File Markings
+=============
+
+Files on disk or obtained over the network may have annotations that form the
+objective security context of that file.  Depending on the type of filesystem,
+this may include one or more of the following:
+
+ * UNIX UID, GID, mode;
+ * Windows user ID;
+ * Access control list;
+ * LSM security label;
+ * UNIX exec privilege escalation bits (SUID/SGID);
+ * File capabilities exec privilege escalation bits.
+
+These are compared to the task's subjective security context, and certain
+operations allowed or disallowed as a result.  In the case of execve(), the
+privilege escalation bits come into play, and may allow the resulting process
+extra privileges, based on the annotations on the executable file.
+
+
+Task Credentials
+================
+
+In Linux, all of a task's credentials are held in (uid, gid) or through
+(groups, keys, LSM security) a refcounted structure of type 'struct cred'.
+Each task points to its credentials by a pointer called 'cred' in its
+task_struct.
+
+Once a set of credentials has been prepared and committed, it may not be
+changed, barring the following exceptions:
+
+ 1. its reference count may be changed;
+
+ 2. the reference count on the group_info struct it points to may be changed;
+
+ 3. the reference count on the security data it points to may be changed;
+
+ 4. the reference count on any keyrings it points to may be changed;
+
+ 5. any keyrings it points to may be revoked, expired or have their security
+    attributes changed; and
+
+ 6. the contents of any keyrings to which it points may be changed (the whole
+    point of keyrings being a shared set of credentials, modifiable by anyone
+    with appropriate access).
+
+To alter anything in the cred struct, the copy-and-replace principle must be
+adhered to.  First take a copy, then alter the copy and then use RCU to change
+the task pointer to make it point to the new copy.  There are wrappers to aid
+with this (see below).
+
+A task may only alter its _own_ credentials; it is no longer permitted for a
+task to alter another's credentials.  This means the ``capset()`` system call
+is no longer permitted to take any PID other than the one of the current
+process. Also ``keyctl_instantiate()`` and ``keyctl_negate()`` functions no
+longer permit attachment to process-specific keyrings in the requesting
+process as the instantiating process may need to create them.
+
+
+Immutable Credentials
+---------------------
+
+Once a set of credentials has been made public (by calling ``commit_creds()``
+for example), it must be considered immutable, barring two exceptions:
+
+ 1. The reference count may be altered.
+
+ 2. Whilst the keyring subscriptions of a set of credentials may not be
+    changed, the keyrings subscribed to may have their contents altered.
+
+To catch accidental credential alteration at compile time, struct task_struct
+has _const_ pointers to its credential sets, as does struct file.  Furthermore,
+certain functions such as ``get_cred()`` and ``put_cred()`` operate on const
+pointers, thus rendering casts unnecessary, but must temporarily ditch
+the const qualification to be able to alter the reference count.
+
+
+Accessing Task Credentials
+--------------------------
+
+A task being able to alter only its own credentials permits the current process
+to read or replace its own credentials without the need for any form of locking
+-- which simplifies things greatly.  It can just call::
+
+	const struct cred *current_cred()
+
+to get a pointer to its credentials structure, and it doesn't have to release
+it afterwards.
+
+There are convenience wrappers for retrieving specific aspects of a task's
+credentials (the value is simply returned in each case)::
+
+	uid_t current_uid(void)		Current's real UID
+	gid_t current_gid(void)		Current's real GID
+	uid_t current_euid(void)	Current's effective UID
+	gid_t current_egid(void)	Current's effective GID
+	uid_t current_fsuid(void)	Current's file access UID
+	gid_t current_fsgid(void)	Current's file access GID
+	kernel_cap_t current_cap(void)	Current's effective capabilities
+	void *current_security(void)	Current's LSM security pointer
+	struct user_struct *current_user(void)  Current's user account
+
+There are also convenience wrappers for retrieving specific associated pairs of
+a task's credentials::
+
+	void current_uid_gid(uid_t *, gid_t *);
+	void current_euid_egid(uid_t *, gid_t *);
+	void current_fsuid_fsgid(uid_t *, gid_t *);
+
+which return these pairs of values through their arguments after retrieving
+them from the current task's credentials.
+
+
+In addition, there is a function for obtaining a reference on the current
+process's current set of credentials::
+
+	const struct cred *get_current_cred(void);
+
+and functions for getting references to one of the credentials that don't
+actually live in struct cred::
+
+	struct user_struct *get_current_user(void);
+	struct group_info *get_current_groups(void);
+
+which get references to the current process's user accounting structure and
+supplementary groups list respectively.
+
+Once a reference has been obtained, it must be released with ``put_cred()``,
+``free_uid()`` or ``put_group_info()`` as appropriate.
+
+
+Accessing Another Task's Credentials
+------------------------------------
+
+Whilst a task may access its own credentials without the need for locking, the
+same is not true of a task wanting to access another task's credentials.  It
+must use the RCU read lock and ``rcu_dereference()``.
+
+The ``rcu_dereference()`` is wrapped by::
+
+	const struct cred *__task_cred(struct task_struct *task);
+
+This should be used inside the RCU read lock, as in the following example::
+
+	void foo(struct task_struct *t, struct foo_data *f)
+	{
+		const struct cred *tcred;
+		...
+		rcu_read_lock();
+		tcred = __task_cred(t);
+		f->uid = tcred->uid;
+		f->gid = tcred->gid;
+		f->groups = get_group_info(tcred->groups);
+		rcu_read_unlock();
+		...
+	}
+
+Should it be necessary to hold another task's credentials for a long period of
+time, and possibly to sleep whilst doing so, then the caller should get a
+reference on them using::
+
+	const struct cred *get_task_cred(struct task_struct *task);
+
+This does all the RCU magic inside of it.  The caller must call put_cred() on
+the credentials so obtained when they're finished with.
+
+.. note::
+   The result of ``__task_cred()`` should not be passed directly to
+   ``get_cred()`` as this may race with ``commit_creds()``.
+
+There are a couple of convenience functions to access bits of another task's
+credentials, hiding the RCU magic from the caller::
+
+	uid_t task_uid(task)		Task's real UID
+	uid_t task_euid(task)		Task's effective UID
+
+If the caller is holding the RCU read lock at the time anyway, then::
+
+	__task_cred(task)->uid
+	__task_cred(task)->euid
+
+should be used instead.  Similarly, if multiple aspects of a task's credentials
+need to be accessed, RCU read lock should be used, ``__task_cred()`` called,
+the result stored in a temporary pointer and then the credential aspects called
+from that before dropping the lock.  This prevents the potentially expensive
+RCU magic from being invoked multiple times.
+
+Should some other single aspect of another task's credentials need to be
+accessed, then this can be used::
+
+	task_cred_xxx(task, member)
+
+where 'member' is a non-pointer member of the cred struct.  For instance::
+
+	uid_t task_cred_xxx(task, suid);
+
+will retrieve 'struct cred::suid' from the task, doing the appropriate RCU
+magic.  This may not be used for pointer members as what they point to may
+disappear the moment the RCU read lock is dropped.
+
+
+Altering Credentials
+--------------------
+
+As previously mentioned, a task may only alter its own credentials, and may not
+alter those of another task.  This means that it doesn't need to use any
+locking to alter its own credentials.
+
+To alter the current process's credentials, a function should first prepare a
+new set of credentials by calling::
+
+	struct cred *prepare_creds(void);
+
+this locks current->cred_replace_mutex and then allocates and constructs a
+duplicate of the current process's credentials, returning with the mutex still
+held if successful.  It returns NULL if not successful (out of memory).
+
+The mutex prevents ``ptrace()`` from altering the ptrace state of a process
+whilst security checks on credentials construction and changing are taking place
+as the ptrace state may alter the outcome, particularly in the case of
+``execve()``.
+
+The new credentials set should be altered appropriately, and any security
+checks and hooks done.  Both the current and the proposed sets of credentials
+are available for this purpose as current_cred() will return the current set
+still at this point.
+
+
+When the credential set is ready, it should be committed to the current process
+by calling::
+
+	int commit_creds(struct cred *new);
+
+This will alter various aspects of the credentials and the process, giving the
+LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to
+actually commit the new credentials to ``current->cred``, it will release
+``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it
+will notify the scheduler and others of the changes.
+
+This function is guaranteed to return 0, so that it can be tail-called at the
+end of such functions as ``sys_setresuid()``.
+
+Note that this function consumes the caller's reference to the new credentials.
+The caller should _not_ call ``put_cred()`` on the new credentials afterwards.
+
+Furthermore, once this function has been called on a new set of credentials,
+those credentials may _not_ be changed further.
+
+
+Should the security checks fail or some other error occur after
+``prepare_creds()`` has been called, then the following function should be
+invoked::
+
+	void abort_creds(struct cred *new);
+
+This releases the lock on ``current->cred_replace_mutex`` that
+``prepare_creds()`` got and then releases the new credentials.
+
+
+A typical credentials alteration function would look something like this::
+
+	int alter_suid(uid_t suid)
+	{
+		struct cred *new;
+		int ret;
+
+		new = prepare_creds();
+		if (!new)
+			return -ENOMEM;
+
+		new->suid = suid;
+		ret = security_alter_suid(new);
+		if (ret < 0) {
+			abort_creds(new);
+			return ret;
+		}
+
+		return commit_creds(new);
+	}
+
+
+Managing Credentials
+--------------------
+
+There are some functions to help manage credentials:
+
+ - ``void put_cred(const struct cred *cred);``
+
+     This releases a reference to the given set of credentials.  If the
+     reference count reaches zero, the credentials will be scheduled for
+     destruction by the RCU system.
+
+ - ``const struct cred *get_cred(const struct cred *cred);``
+
+     This gets a reference on a live set of credentials, returning a pointer to
+     that set of credentials.
+
+ - ``struct cred *get_new_cred(struct cred *cred);``
+
+     This gets a reference on a set of credentials that is under construction
+     and is thus still mutable, returning a pointer to that set of credentials.
+
+
+Open File Credentials
+=====================
+
+When a new file is opened, a reference is obtained on the opening task's
+credentials and this is attached to the file struct as ``f_cred`` in place of
+``f_uid`` and ``f_gid``.  Code that used to access ``file->f_uid`` and
+``file->f_gid`` should now access ``file->f_cred->fsuid`` and
+``file->f_cred->fsgid``.
+
+It is safe to access ``f_cred`` without the use of RCU or locking because the
+pointer will not change over the lifetime of the file struct, and nor will the
+contents of the cred struct pointed to, barring the exceptions listed above
+(see the Task Credentials section).
+
+
+Overriding the VFS's Use of Credentials
+=======================================
+
+Under some circumstances it is desirable to override the credentials used by
+the VFS, which can be done by calling functions such as ``vfs_mkdir()`` with a
+different set of credentials.  This is done in the following places:
+
+ * ``sys_faccessat()``.
+ * ``do_coredump()``.
+ * nfs4recover.c.
diff --git a/Documentation/security/credentials.txt b/Documentation/security/credentials.txt
deleted file mode 100644
index 86257052e31a..000000000000
--- a/Documentation/security/credentials.txt
+++ /dev/null
@@ -1,581 +0,0 @@
-			     ====================
-			     CREDENTIALS IN LINUX
-			     ====================
-
-By: David Howells <dhowells@redhat.com>
-
-Contents:
-
- (*) Overview.
-
- (*) Types of credentials.
-
- (*) File markings.
-
- (*) Task credentials.
-
-     - Immutable credentials.
-     - Accessing task credentials.
-     - Accessing another task's credentials.
-     - Altering credentials.
-     - Managing credentials.
-
- (*) Open file credentials.
-
- (*) Overriding the VFS's use of credentials.
-
-
-========
-OVERVIEW
-========
-
-There are several parts to the security check performed by Linux when one
-object acts upon another:
-
- (1) Objects.
-
-     Objects are things in the system that may be acted upon directly by
-     userspace programs.  Linux has a variety of actionable objects, including:
-
-	- Tasks
-	- Files/inodes
-	- Sockets
-	- Message queues
-	- Shared memory segments
-	- Semaphores
-	- Keys
-
-     As a part of the description of all these objects there is a set of
-     credentials.  What's in the set depends on the type of object.
-
- (2) Object ownership.
-
-     Amongst the credentials of most objects, there will be a subset that
-     indicates the ownership of that object.  This is used for resource
-     accounting and limitation (disk quotas and task rlimits for example).
-
-     In a standard UNIX filesystem, for instance, this will be defined by the
-     UID marked on the inode.
-
- (3) The objective context.
-
-     Also amongst the credentials of those objects, there will be a subset that
-     indicates the 'objective context' of that object.  This may or may not be
-     the same set as in (2) - in standard UNIX files, for instance, this is the
-     defined by the UID and the GID marked on the inode.
-
-     The objective context is used as part of the security calculation that is
-     carried out when an object is acted upon.
-
- (4) Subjects.
-
-     A subject is an object that is acting upon another object.
-
-     Most of the objects in the system are inactive: they don't act on other
-     objects within the system.  Processes/tasks are the obvious exception:
-     they do stuff; they access and manipulate things.
-
-     Objects other than tasks may under some circumstances also be subjects.
-     For instance an open file may send SIGIO to a task using the UID and EUID
-     given to it by a task that called fcntl(F_SETOWN) upon it.  In this case,
-     the file struct will have a subjective context too.
-
- (5) The subjective context.
-
-     A subject has an additional interpretation of its credentials.  A subset
-     of its credentials forms the 'subjective context'.  The subjective context
-     is used as part of the security calculation that is carried out when a
-     subject acts.
-
-     A Linux task, for example, has the FSUID, FSGID and the supplementary
-     group list for when it is acting upon a file - which are quite separate
-     from the real UID and GID that normally form the objective context of the
-     task.
-
- (6) Actions.
-
-     Linux has a number of actions available that a subject may perform upon an
-     object.  The set of actions available depends on the nature of the subject
-     and the object.
-
-     Actions include reading, writing, creating and deleting files; forking or
-     signalling and tracing tasks.
-
- (7) Rules, access control lists and security calculations.
-
-     When a subject acts upon an object, a security calculation is made.  This
-     involves taking the subjective context, the objective context and the
-     action, and searching one or more sets of rules to see whether the subject
-     is granted or denied permission to act in the desired manner on the
-     object, given those contexts.
-
-     There are two main sources of rules:
-
-     (a) Discretionary access control (DAC):
-
-	 Sometimes the object will include sets of rules as part of its
-	 description.  This is an 'Access Control List' or 'ACL'.  A Linux
-	 file may supply more than one ACL.
-
-	 A traditional UNIX file, for example, includes a permissions mask that
-	 is an abbreviated ACL with three fixed classes of subject ('user',
-	 'group' and 'other'), each of which may be granted certain privileges
-	 ('read', 'write' and 'execute' - whatever those map to for the object
-	 in question).  UNIX file permissions do not allow the arbitrary
-	 specification of subjects, however, and so are of limited use.
-
-	 A Linux file might also sport a POSIX ACL.  This is a list of rules
-	 that grants various permissions to arbitrary subjects.
-
-     (b) Mandatory access control (MAC):
-
-	 The system as a whole may have one or more sets of rules that get
-	 applied to all subjects and objects, regardless of their source.
-	 SELinux and Smack are examples of this.
-
-	 In the case of SELinux and Smack, each object is given a label as part
-	 of its credentials.  When an action is requested, they take the
-	 subject label, the object label and the action and look for a rule
-	 that says that this action is either granted or denied.
-
-
-====================
-TYPES OF CREDENTIALS
-====================
-
-The Linux kernel supports the following types of credentials:
-
- (1) Traditional UNIX credentials.
-
-	Real User ID
-	Real Group ID
-
-     The UID and GID are carried by most, if not all, Linux objects, even if in
-     some cases it has to be invented (FAT or CIFS files for example, which are
-     derived from Windows).  These (mostly) define the objective context of
-     that object, with tasks being slightly different in some cases.
-
-	Effective, Saved and FS User ID
-	Effective, Saved and FS Group ID
-	Supplementary groups
-
-     These are additional credentials used by tasks only.  Usually, an
-     EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID
-     will be used as the objective.  For tasks, it should be noted that this is
-     not always true.
-
- (2) Capabilities.
-
-	Set of permitted capabilities
-	Set of inheritable capabilities
-	Set of effective capabilities
-	Capability bounding set
-
-     These are only carried by tasks.  They indicate superior capabilities
-     granted piecemeal to a task that an ordinary task wouldn't otherwise have.
-     These are manipulated implicitly by changes to the traditional UNIX
-     credentials, but can also be manipulated directly by the capset() system
-     call.
-
-     The permitted capabilities are those caps that the process might grant
-     itself to its effective or permitted sets through capset().  This
-     inheritable set might also be so constrained.
-
-     The effective capabilities are the ones that a task is actually allowed to
-     make use of itself.
-
-     The inheritable capabilities are the ones that may get passed across
-     execve().
-
-     The bounding set limits the capabilities that may be inherited across
-     execve(), especially when a binary is executed that will execute as UID 0.
-
- (3) Secure management flags (securebits).
-
-     These are only carried by tasks.  These govern the way the above
-     credentials are manipulated and inherited over certain operations such as
-     execve().  They aren't used directly as objective or subjective
-     credentials.
-
- (4) Keys and keyrings.
-
-     These are only carried by tasks.  They carry and cache security tokens
-     that don't fit into the other standard UNIX credentials.  They are for
-     making such things as network filesystem keys available to the file
-     accesses performed by processes, without the necessity of ordinary
-     programs having to know about security details involved.
-
-     Keyrings are a special type of key.  They carry sets of other keys and can
-     be searched for the desired key.  Each process may subscribe to a number
-     of keyrings:
-
-	Per-thread keying
-	Per-process keyring
-	Per-session keyring
-
-     When a process accesses a key, if not already present, it will normally be
-     cached on one of these keyrings for future accesses to find.
-
-     For more information on using keys, see Documentation/security/keys.txt.
-
- (5) LSM
-
-     The Linux Security Module allows extra controls to be placed over the
-     operations that a task may do.  Currently Linux supports several LSM
-     options.
-
-     Some work by labelling the objects in a system and then applying sets of
-     rules (policies) that say what operations a task with one label may do to
-     an object with another label.
-
- (6) AF_KEY
-
-     This is a socket-based approach to credential management for networking
-     stacks [RFC 2367].  It isn't discussed by this document as it doesn't
-     interact directly with task and file credentials; rather it keeps system
-     level credentials.
-
-
-When a file is opened, part of the opening task's subjective context is
-recorded in the file struct created.  This allows operations using that file
-struct to use those credentials instead of the subjective context of the task
-that issued the operation.  An example of this would be a file opened on a
-network filesystem where the credentials of the opened file should be presented
-to the server, regardless of who is actually doing a read or a write upon it.
-
-
-=============
-FILE MARKINGS
-=============
-
-Files on disk or obtained over the network may have annotations that form the
-objective security context of that file.  Depending on the type of filesystem,
-this may include one or more of the following:
-
- (*) UNIX UID, GID, mode;
-
- (*) Windows user ID;
-
- (*) Access control list;
-
- (*) LSM security label;
-
- (*) UNIX exec privilege escalation bits (SUID/SGID);
-
- (*) File capabilities exec privilege escalation bits.
-
-These are compared to the task's subjective security context, and certain
-operations allowed or disallowed as a result.  In the case of execve(), the
-privilege escalation bits come into play, and may allow the resulting process
-extra privileges, based on the annotations on the executable file.
-
-
-================
-TASK CREDENTIALS
-================
-
-In Linux, all of a task's credentials are held in (uid, gid) or through
-(groups, keys, LSM security) a refcounted structure of type 'struct cred'.
-Each task points to its credentials by a pointer called 'cred' in its
-task_struct.
-
-Once a set of credentials has been prepared and committed, it may not be
-changed, barring the following exceptions:
-
- (1) its reference count may be changed;
-
- (2) the reference count on the group_info struct it points to may be changed;
-
- (3) the reference count on the security data it points to may be changed;
-
- (4) the reference count on any keyrings it points to may be changed;
-
- (5) any keyrings it points to may be revoked, expired or have their security
-     attributes changed; and
-
- (6) the contents of any keyrings to which it points may be changed (the whole
-     point of keyrings being a shared set of credentials, modifiable by anyone
-     with appropriate access).
-
-To alter anything in the cred struct, the copy-and-replace principle must be
-adhered to.  First take a copy, then alter the copy and then use RCU to change
-the task pointer to make it point to the new copy.  There are wrappers to aid
-with this (see below).
-
-A task may only alter its _own_ credentials; it is no longer permitted for a
-task to alter another's credentials.  This means the capset() system call is no
-longer permitted to take any PID other than the one of the current process.
-Also keyctl_instantiate() and keyctl_negate() functions no longer permit
-attachment to process-specific keyrings in the requesting process as the
-instantiating process may need to create them.
-
-
-IMMUTABLE CREDENTIALS
----------------------
-
-Once a set of credentials has been made public (by calling commit_creds() for
-example), it must be considered immutable, barring two exceptions:
-
- (1) The reference count may be altered.
-
- (2) Whilst the keyring subscriptions of a set of credentials may not be
-     changed, the keyrings subscribed to may have their contents altered.
-
-To catch accidental credential alteration at compile time, struct task_struct
-has _const_ pointers to its credential sets, as does struct file.  Furthermore,
-certain functions such as get_cred() and put_cred() operate on const pointers,
-thus rendering casts unnecessary, but require to temporarily ditch the const
-qualification to be able to alter the reference count.
-
-
-ACCESSING TASK CREDENTIALS
---------------------------
-
-A task being able to alter only its own credentials permits the current process
-to read or replace its own credentials without the need for any form of locking
-- which simplifies things greatly.  It can just call:
-
-	const struct cred *current_cred()
-
-to get a pointer to its credentials structure, and it doesn't have to release
-it afterwards.
-
-There are convenience wrappers for retrieving specific aspects of a task's
-credentials (the value is simply returned in each case):
-
-	uid_t current_uid(void)		Current's real UID
-	gid_t current_gid(void)		Current's real GID
-	uid_t current_euid(void)	Current's effective UID
-	gid_t current_egid(void)	Current's effective GID
-	uid_t current_fsuid(void)	Current's file access UID
-	gid_t current_fsgid(void)	Current's file access GID
-	kernel_cap_t current_cap(void)	Current's effective capabilities
-	void *current_security(void)	Current's LSM security pointer
-	struct user_struct *current_user(void)  Current's user account
-
-There are also convenience wrappers for retrieving specific associated pairs of
-a task's credentials:
-
-	void current_uid_gid(uid_t *, gid_t *);
-	void current_euid_egid(uid_t *, gid_t *);
-	void current_fsuid_fsgid(uid_t *, gid_t *);
-
-which return these pairs of values through their arguments after retrieving
-them from the current task's credentials.
-
-
-In addition, there is a function for obtaining a reference on the current
-process's current set of credentials:
-
-	const struct cred *get_current_cred(void);
-
-and functions for getting references to one of the credentials that don't
-actually live in struct cred:
-
-	struct user_struct *get_current_user(void);
-	struct group_info *get_current_groups(void);
-
-which get references to the current process's user accounting structure and
-supplementary groups list respectively.
-
-Once a reference has been obtained, it must be released with put_cred(),
-free_uid() or put_group_info() as appropriate.
-
-
-ACCESSING ANOTHER TASK'S CREDENTIALS
-------------------------------------
-
-Whilst a task may access its own credentials without the need for locking, the
-same is not true of a task wanting to access another task's credentials.  It
-must use the RCU read lock and rcu_dereference().
-
-The rcu_dereference() is wrapped by:
-
-	const struct cred *__task_cred(struct task_struct *task);
-
-This should be used inside the RCU read lock, as in the following example:
-
-	void foo(struct task_struct *t, struct foo_data *f)
-	{
-		const struct cred *tcred;
-		...
-		rcu_read_lock();
-		tcred = __task_cred(t);
-		f->uid = tcred->uid;
-		f->gid = tcred->gid;
-		f->groups = get_group_info(tcred->groups);
-		rcu_read_unlock();
-		...
-	}
-
-Should it be necessary to hold another task's credentials for a long period of
-time, and possibly to sleep whilst doing so, then the caller should get a
-reference on them using:
-
-	const struct cred *get_task_cred(struct task_struct *task);
-
-This does all the RCU magic inside of it.  The caller must call put_cred() on
-the credentials so obtained when they're finished with.
-
- [*] Note: The result of __task_cred() should not be passed directly to
-     get_cred() as this may race with commit_cred().
-
-There are a couple of convenience functions to access bits of another task's
-credentials, hiding the RCU magic from the caller:
-
-	uid_t task_uid(task)		Task's real UID
-	uid_t task_euid(task)		Task's effective UID
-
-If the caller is holding the RCU read lock at the time anyway, then:
-
-	__task_cred(task)->uid
-	__task_cred(task)->euid
-
-should be used instead.  Similarly, if multiple aspects of a task's credentials
-need to be accessed, RCU read lock should be used, __task_cred() called, the
-result stored in a temporary pointer and then the credential aspects called
-from that before dropping the lock.  This prevents the potentially expensive
-RCU magic from being invoked multiple times.
-
-Should some other single aspect of another task's credentials need to be
-accessed, then this can be used:
-
-	task_cred_xxx(task, member)
-
-where 'member' is a non-pointer member of the cred struct.  For instance:
-
-	uid_t task_cred_xxx(task, suid);
-
-will retrieve 'struct cred::suid' from the task, doing the appropriate RCU
-magic.  This may not be used for pointer members as what they point to may
-disappear the moment the RCU read lock is dropped.
-
-
-ALTERING CREDENTIALS
---------------------
-
-As previously mentioned, a task may only alter its own credentials, and may not
-alter those of another task.  This means that it doesn't need to use any
-locking to alter its own credentials.
-
-To alter the current process's credentials, a function should first prepare a
-new set of credentials by calling:
-
-	struct cred *prepare_creds(void);
-
-this locks current->cred_replace_mutex and then allocates and constructs a
-duplicate of the current process's credentials, returning with the mutex still
-held if successful.  It returns NULL if not successful (out of memory).
-
-The mutex prevents ptrace() from altering the ptrace state of a process whilst
-security checks on credentials construction and changing is taking place as
-the ptrace state may alter the outcome, particularly in the case of execve().
-
-The new credentials set should be altered appropriately, and any security
-checks and hooks done.  Both the current and the proposed sets of credentials
-are available for this purpose as current_cred() will return the current set
-still at this point.
-
-
-When the credential set is ready, it should be committed to the current process
-by calling:
-
-	int commit_creds(struct cred *new);
-
-This will alter various aspects of the credentials and the process, giving the
-LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually
-commit the new credentials to current->cred, it will release
-current->cred_replace_mutex to allow ptrace() to take place, and it will notify
-the scheduler and others of the changes.
-
-This function is guaranteed to return 0, so that it can be tail-called at the
-end of such functions as sys_setresuid().
-
-Note that this function consumes the caller's reference to the new credentials.
-The caller should _not_ call put_cred() on the new credentials afterwards.
-
-Furthermore, once this function has been called on a new set of credentials,
-those credentials may _not_ be changed further.
-
-
-Should the security checks fail or some other error occur after prepare_creds()
-has been called, then the following function should be invoked:
-
-	void abort_creds(struct cred *new);
-
-This releases the lock on current->cred_replace_mutex that prepare_creds() got
-and then releases the new credentials.
-
-
-A typical credentials alteration function would look something like this:
-
-	int alter_suid(uid_t suid)
-	{
-		struct cred *new;
-		int ret;
-
-		new = prepare_creds();
-		if (!new)
-			return -ENOMEM;
-
-		new->suid = suid;
-		ret = security_alter_suid(new);
-		if (ret < 0) {
-			abort_creds(new);
-			return ret;
-		}
-
-		return commit_creds(new);
-	}
-
-
-MANAGING CREDENTIALS
---------------------
-
-There are some functions to help manage credentials:
-
- (*) void put_cred(const struct cred *cred);
-
-     This releases a reference to the given set of credentials.  If the
-     reference count reaches zero, the credentials will be scheduled for
-     destruction by the RCU system.
-
- (*) const struct cred *get_cred(const struct cred *cred);
-
-     This gets a reference on a live set of credentials, returning a pointer to
-     that set of credentials.
-
- (*) struct cred *get_new_cred(struct cred *cred);
-
-     This gets a reference on a set of credentials that is under construction
-     and is thus still mutable, returning a pointer to that set of credentials.
-
-
-=====================
-OPEN FILE CREDENTIALS
-=====================
-
-When a new file is opened, a reference is obtained on the opening task's
-credentials and this is attached to the file struct as 'f_cred' in place of
-'f_uid' and 'f_gid'.  Code that used to access file->f_uid and file->f_gid
-should now access file->f_cred->fsuid and file->f_cred->fsgid.
-
-It is safe to access f_cred without the use of RCU or locking because the
-pointer will not change over the lifetime of the file struct, and nor will the
-contents of the cred struct pointed to, barring the exceptions listed above
-(see the Task Credentials section).
-
-
-=======================================
-OVERRIDING THE VFS'S USE OF CREDENTIALS
-=======================================
-
-Under some circumstances it is desirable to override the credentials used by
-the VFS, and that can be done by calling into such as vfs_mkdir() with a
-different set of credentials.  This is done in the following places:
-
- (*) sys_faccessat().
-
- (*) do_coredump().
-
- (*) nfs4recover.c.
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index 07335659ce8d..415be8e0b013 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -5,5 +5,6 @@ Security Documentation
 .. toctree::
    :maxdepth: 1
 
+   credentials
    IMA-templates
    tpm/index
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b03e7d049a64..c728d515e5e2 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -1,4 +1,4 @@
-/* Credentials management - see Documentation/security/credentials.txt
+/* Credentials management - see Documentation/security/credentials.rst
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/cred.c b/kernel/cred.c
index 2bc66075740f..ecf03657e71c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/security/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.rst
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
-- 
cgit v1.3-14-g43fede


From 719f6a7040f1bdaf96fcc709d272548facb88e90 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Thu, 20 Apr 2017 10:52:31 +0200
Subject: printk: Use the main logbuf in NMI when logbuf_lock is available

The commit 42a0bb3f71383b457a7d ("printk/nmi: generic solution for safe
printk in NMI") made printk store messages in a temporary buffer when
it is called in NMI context.

The buffer is per-CPU and therefore the size is rather limited.
It works quite well for NMI backtraces. But there are longer logs
that might get printed in NMI context, for example, lockdep
warnings, ftrace_dump_on_oops.

The temporary buffer is used to avoid deadlocks caused by
logbuf_lock. Also it is needed to avoid races with the other
temporary buffer that is used when PRINTK_SAFE_CONTEXT is entered.
But the main buffer can be used in NMI if the lock is available
and we did not interrupt PRINTK_SAFE_CONTEXT.

The lock is checked using raw_spin_is_locked(). It might cause
false negatives when the lock is taken on another CPU and
this CPU is in the safe context for other reasons. Note that
the safe context is also used to get the console semaphore or when
calling console drivers. For this reason, we do the check in
printk_nmi_enter(). It makes the handling consistent for
the entire NMI handler and avoids reshuffling of the messages.

The patch also defines a special printk context that allows
printk_deferred() to be used in NMI. Note that we cannot flush
the messages to the consoles because console drivers might use
many other internal locks.

The newly created vprintk_deferred() disables preemption
only around the irq work handling. It is needed there to keep
the two per-CPU variables consistent with each other. But there
is no reason to disable preemption around vprintk_emit().

Finally, the patch puts back explicit serialization of the NMI
backtraces from different CPUs. It was removed by the
commit a9edc88093287183ac934b ("x86/nmi: Perform a safe
NMI stack trace on all CPUs"). It was not needed because
the flushing of the temporary per-CPU buffers was serialized.

Link: http://lkml.kernel.org/r/1493912763-24873-1-git-send-email-pmladek@suse.com
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <rack+kernel@arm.linux.org.uk>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: x86@kernel.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Suggested-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/internal.h    |  6 ++++--
 kernel/printk/printk.c      | 19 ++++++++++++++-----
 kernel/printk/printk_safe.c | 26 ++++++++++++++++++++++++--
 lib/nmi_backtrace.c         |  3 +++
 4 files changed, 45 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 1db044f808b7..2a7d04049af4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -18,12 +18,14 @@
 
 #ifdef CONFIG_PRINTK
 
-#define PRINTK_SAFE_CONTEXT_MASK	0x7fffffff
-#define PRINTK_NMI_CONTEXT_MASK	0x80000000
+#define PRINTK_SAFE_CONTEXT_MASK	 0x3fffffff
+#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_CONTEXT_MASK		 0x80000000
 
 extern raw_spinlock_t logbuf_lock;
 
 __printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
 __printf(1, 0) int vprintk_func(const char *fmt, va_list args);
 void __printk_safe_enter(void);
 void __printk_safe_exit(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 779479ac9f57..e5278e7d1922 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2722,16 +2722,13 @@ void wake_up_klogd(void)
 	preempt_enable();
 }
 
-int printk_deferred(const char *fmt, ...)
+int vprintk_deferred(const char *fmt, va_list args)
 {
-	va_list args;
 	int r;
 
-	preempt_disable();
-	va_start(args, fmt);
 	r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
-	va_end(args);
 
+	preempt_disable();
 	__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
 	irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	preempt_enable();
@@ -2739,6 +2736,18 @@ int printk_deferred(const char *fmt, ...)
 	return r;
 }
 
+int printk_deferred(const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = vprintk_deferred(fmt, args);
+	va_end(args);
+
+	return r;
+}
+
 /*
  * printk rate limiting, lifted from the networking subsystem.
  *
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 033e50a7d706..03a42a539b20 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -308,12 +308,24 @@ static int vprintk_nmi(const char *fmt, va_list args)
 
 void printk_nmi_enter(void)
 {
-	this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+	/*
+	 * The size of the extra per-CPU buffer is limited. Use it only when
+	 * the main one is locked. If this CPU is not in the safe context,
+	 * the lock must be taken on another CPU and we could wait for it.
+	 */
+	if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
+	    raw_spin_is_locked(&logbuf_lock)) {
+		this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+	} else {
+		this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
+	}
 }
 
 void printk_nmi_exit(void)
 {
-	this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+	this_cpu_and(printk_context,
+		     ~(PRINTK_NMI_CONTEXT_MASK |
+		       PRINTK_NMI_DEFERRED_CONTEXT_MASK));
 }
 
 #else
@@ -351,12 +363,22 @@ void __printk_safe_exit(void)
 
 __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
+	/* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
 	if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
 		return vprintk_nmi(fmt, args);
 
+	/* Use extra buffer to prevent a recursion deadlock in safe mode. */
 	if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
 		return vprintk_safe(fmt, args);
 
+	/*
+	 * Use the main logbuf when logbuf_lock is available in NMI.
+	 * But avoid calling console drivers that might have their own locks.
+	 */
+	if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
+		return vprintk_deferred(fmt, args);
+
+	/* No obstacles. */
 	return vprintk_default(fmt, args);
 }
 
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 4e8a30d1c22f..0bc0a3535a8a 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -86,9 +86,11 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
 
 bool nmi_cpu_backtrace(struct pt_regs *regs)
 {
+	static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+		arch_spin_lock(&lock);
 		if (regs && cpu_in_idle(instruction_pointer(regs))) {
 			pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n",
 				cpu, instruction_pointer(regs));
@@ -99,6 +101,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
 			else
 				dump_stack();
 		}
+		arch_spin_unlock(&lock);
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
-- 
cgit v1.3-14-g43fede


From 7e95a225901a5d2fd140f14b4302805cecc22da7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 19:52:01 -0400
Subject: move compat wait4 and waitid next to native variants

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c | 66 ------------------------------------------------------
 kernel/exit.c   | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 933bcb31ae10..b4cdba6bbd02 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -543,72 +543,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE4(wait4,
-	compat_pid_t, pid,
-	compat_uint_t __user *, stat_addr,
-	int, options,
-	struct compat_rusage __user *, ru)
-{
-	if (!ru) {
-		return sys_wait4(pid, stat_addr, options, NULL);
-	} else {
-		struct rusage r;
-		int ret;
-		unsigned int status;
-		mm_segment_t old_fs = get_fs();
-
-		set_fs (KERNEL_DS);
-		ret = sys_wait4(pid,
-				(stat_addr ?
-				 (unsigned int __user *) &status : NULL),
-				options, (struct rusage __user *) &r);
-		set_fs (old_fs);
-
-		if (ret > 0) {
-			if (put_compat_rusage(&r, ru))
-				return -EFAULT;
-			if (stat_addr && put_user(status, stat_addr))
-				return -EFAULT;
-		}
-		return ret;
-	}
-}
-
-COMPAT_SYSCALL_DEFINE5(waitid,
-		int, which, compat_pid_t, pid,
-		struct compat_siginfo __user *, uinfo, int, options,
-		struct compat_rusage __user *, uru)
-{
-	siginfo_t info;
-	struct rusage ru;
-	long ret;
-	mm_segment_t old_fs = get_fs();
-
-	memset(&info, 0, sizeof(info));
-
-	set_fs(KERNEL_DS);
-	ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
-			 uru ? (struct rusage __user *)&ru : NULL);
-	set_fs(old_fs);
-
-	if ((ret < 0) || (info.si_signo == 0))
-		return ret;
-
-	if (uru) {
-		/* sys_waitid() overwrites everything in ru */
-		if (COMPAT_USE_64BIT_TIME)
-			ret = copy_to_user(uru, &ru, sizeof(ru));
-		else
-			ret = put_compat_rusage(&ru, uru);
-		if (ret)
-			return -EFAULT;
-	}
-
-	BUG_ON(info.si_code & __SI_MASK);
-	info.si_code |= __SI_CHLD;
-	return copy_siginfo_to_user32(uinfo, &info);
-}
-
 static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
 				    unsigned len, struct cpumask *new_mask)
 {
diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb0e0ec..f98782bd27b6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/kcov.h>
 #include <linux/random.h>
 #include <linux/rcuwait.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -1735,3 +1736,71 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
 }
 
 #endif
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(wait4,
+	compat_pid_t, pid,
+	compat_uint_t __user *, stat_addr,
+	int, options,
+	struct compat_rusage __user *, ru)
+{
+	if (!ru) {
+		return sys_wait4(pid, stat_addr, options, NULL);
+	} else {
+		struct rusage r;
+		int ret;
+		unsigned int status;
+		mm_segment_t old_fs = get_fs();
+
+		set_fs (KERNEL_DS);
+		ret = sys_wait4(pid,
+				(stat_addr ?
+				 (unsigned int __user *) &status : NULL),
+				options, (struct rusage __user *) &r);
+		set_fs (old_fs);
+
+		if (ret > 0) {
+			if (put_compat_rusage(&r, ru))
+				return -EFAULT;
+			if (stat_addr && put_user(status, stat_addr))
+				return -EFAULT;
+		}
+		return ret;
+	}
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+		int, which, compat_pid_t, pid,
+		struct compat_siginfo __user *, infop, int, options,
+		struct compat_rusage __user *, uru)
+{
+	siginfo_t info;
+	struct rusage ru;
+	long ret;
+	mm_segment_t old_fs = get_fs();
+
+	memset(&info, 0, sizeof(info));
+
+	set_fs(KERNEL_DS);
+	ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
+			 uru ? (struct rusage __user *)&ru : NULL);
+	set_fs(old_fs);
+
+	if ((ret < 0) || (info.si_signo == 0))
+		return ret;
+
+	if (uru) {
+		/* sys_waitid() overwrites everything in ru */
+		if (COMPAT_USE_64BIT_TIME)
+			ret = copy_to_user(uru, &ru, sizeof(ru));
+		else
+			ret = put_compat_rusage(&ru, uru);
+		if (ret)
+			return -EFAULT;
+	}
+
+	BUG_ON(info.si_code & __SI_MASK);
+	info.si_code |= __SI_CHLD;
+	return copy_siginfo_to_user32(infop, &info);
+}
+#endif
-- 
cgit v1.3-14-g43fede


From ce72a16fa705f960ca2352e95a7c5f4801475e75 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 20:25:02 -0400
Subject: wait4(2)/waitid(2): separate copying rusage to userland

New helpers: kernel_waitid() and kernel_wait4().  sys_waitid(),
sys_wait4() and their compat variants are switched to those.  Copying
struct rusage to userland is left to the syscall itself.  For
compat_sys_wait4() that eliminates the use of set_fs() completely.
For compat_sys_waitid() it's still needed (for siginfo handling);
that will change shortly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/resource.h |  2 +-
 kernel/exit.c            | 89 ++++++++++++++++++++++++++++--------------------
 kernel/sys.c             | 16 ++++-----
 3 files changed, 59 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/resource.h b/include/linux/resource.h
index 5bc3116e649c..277afdad6589 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -6,7 +6,7 @@
 
 struct task_struct;
 
-int getrusage(struct task_struct *p, int who, struct rusage __user *ru);
+void getrusage(struct task_struct *p, int who, struct rusage *ru);
 int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		struct rlimit *new_rlim, struct rlimit *old_rlim);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f98782bd27b6..d44f12948c5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1003,7 +1003,7 @@ struct wait_opts {
 
 	struct siginfo __user	*wo_info;
 	int __user		*wo_stat;
-	struct rusage __user	*wo_rusage;
+	struct rusage		*wo_rusage;
 
 	wait_queue_t		child_wait;
 	int			notask_error;
@@ -1054,8 +1054,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 				pid_t pid, uid_t uid, int why, int status)
 {
 	struct siginfo __user *infop;
-	int retval = wo->wo_rusage
-		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	int retval = 0;
+
+	if (wo->wo_rusage)
+		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 
 	put_task_struct(p);
 	infop = wo->wo_info;
@@ -1182,8 +1184,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		spin_unlock_irq(&current->sighand->siglock);
 	}
 
-	retval = wo->wo_rusage
-		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	if (wo->wo_rusage)
+		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+	retval = 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
 	if (!retval && wo->wo_stat)
@@ -1316,8 +1319,9 @@ unlock_sig:
 	if (unlikely(wo->wo_flags & WNOWAIT))
 		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
 
-	retval = wo->wo_rusage
-		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	if (wo->wo_rusage)
+		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+	retval = 0;
 	if (!retval && wo->wo_stat)
 		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
 
@@ -1377,8 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	sched_annotate_sleep();
 
 	if (!wo->wo_info) {
-		retval = wo->wo_rusage
-			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+		if (wo->wo_rusage)
+			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+		retval = 0;
 		put_task_struct(p);
 		if (!retval && wo->wo_stat)
 			retval = put_user(0xffff, wo->wo_stat);
@@ -1618,8 +1623,8 @@ end:
 	return retval;
 }
 
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
-		infop, int, options, struct rusage __user *, ru)
+static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop,
+			  int options, struct rusage *ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
@@ -1687,8 +1692,21 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	return ret;
 }
 
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
-		int, options, struct rusage __user *, ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+		infop, int, options, struct rusage __user *, ru)
+{
+	struct rusage r;
+	long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL);
+
+	if (!err) {
+		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+			return -EFAULT;
+	}
+	return err;
+}
+
+static long kernel_wait4(pid_t upid, int __user *stat_addr,
+			int options, struct rusage *ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
@@ -1724,6 +1742,19 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 	return ret;
 }
 
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+		int, options, struct rusage __user *, ru)
+{
+	struct rusage r;
+	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+	if (err > 0) {
+		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+			return -EFAULT;
+	}
+	return err;
+}
+
 #ifdef __ARCH_WANT_SYS_WAITPID
 
 /*
@@ -1744,29 +1775,13 @@ COMPAT_SYSCALL_DEFINE4(wait4,
 	int, options,
 	struct compat_rusage __user *, ru)
 {
-	if (!ru) {
-		return sys_wait4(pid, stat_addr, options, NULL);
-	} else {
-		struct rusage r;
-		int ret;
-		unsigned int status;
-		mm_segment_t old_fs = get_fs();
-
-		set_fs (KERNEL_DS);
-		ret = sys_wait4(pid,
-				(stat_addr ?
-				 (unsigned int __user *) &status : NULL),
-				options, (struct rusage __user *) &r);
-		set_fs (old_fs);
-
-		if (ret > 0) {
-			if (put_compat_rusage(&r, ru))
-				return -EFAULT;
-			if (stat_addr && put_user(status, stat_addr))
-				return -EFAULT;
-		}
-		return ret;
+	struct rusage r;
+	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+	if (err > 0) {
+		if (ru && put_compat_rusage(&r, ru))
+			return -EFAULT;
 	}
+	return err;
 }
 
 COMPAT_SYSCALL_DEFINE5(waitid,
@@ -1782,8 +1797,8 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	memset(&info, 0, sizeof(info));
 
 	set_fs(KERNEL_DS);
-	ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
-			 uru ? (struct rusage __user *)&ru : NULL);
+	ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options,
+			 uru ? &ru : NULL);
 	set_fs(old_fs);
 
 	if ((ret < 0) || (info.si_signo == 0))
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..dab1a0658a92 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1552,7 +1552,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
 	r->ru_oublock += task_io_get_oublock(t);
 }
 
-static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+void getrusage(struct task_struct *p, int who, struct rusage *r)
 {
 	struct task_struct *t;
 	unsigned long flags;
@@ -1626,20 +1626,16 @@ out:
 	r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
 }
 
-int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
 {
 	struct rusage r;
 
-	k_getrusage(p, who, &r);
-	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-}
-
-SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
-{
 	if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
 	    who != RUSAGE_THREAD)
 		return -EINVAL;
-	return getrusage(current, who, ru);
+
+	getrusage(current, who, &r);
+	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1651,7 +1647,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
 	    who != RUSAGE_THREAD)
 		return -EINVAL;
 
-	k_getrusage(current, who, &r);
+	getrusage(current, who, &r);
 	return put_compat_rusage(&r, ru);
 }
 #endif
-- 
cgit v1.3-14-g43fede


From 359566faefa850504d146839d74496f0cf12d3b9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 20:39:39 -0400
Subject: kernel_wait4()/kernel_waitid(): delay copying status to userland

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index d44f12948c5f..94cdccf8e7e7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1002,7 +1002,7 @@ struct wait_opts {
 	struct pid		*wo_pid;
 
 	struct siginfo __user	*wo_info;
-	int __user		*wo_stat;
+	int			wo_stat;
 	struct rusage		*wo_rusage;
 
 	wait_queue_t		child_wait;
@@ -1189,8 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	retval = 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-	if (!retval && wo->wo_stat)
-		retval = put_user(status, wo->wo_stat);
+	wo->wo_stat = status;
 
 	infop = wo->wo_info;
 	if (!retval && infop)
@@ -1322,8 +1321,7 @@ unlock_sig:
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 	retval = 0;
-	if (!retval && wo->wo_stat)
-		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+	wo->wo_stat = (exit_code << 8) | 0x7f;
 
 	infop = wo->wo_info;
 	if (!retval && infop)
@@ -1383,12 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	if (!wo->wo_info) {
 		if (wo->wo_rusage)
 			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
-		retval = 0;
 		put_task_struct(p);
-		if (!retval && wo->wo_stat)
-			retval = put_user(0xffff, wo->wo_stat);
-		if (!retval)
-			retval = pid;
+		wo->wo_stat = 0xffff;
+		retval = pid;
 	} else {
 		retval = wait_noreap_copyout(wo, p, pid, uid,
 					     CLD_CONTINUED, SIGCONT);
@@ -1662,7 +1657,6 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop,
 	wo.wo_pid	= pid;
 	wo.wo_flags	= options;
 	wo.wo_info	= infop;
-	wo.wo_stat	= NULL;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 
@@ -1734,10 +1728,12 @@ static long kernel_wait4(pid_t upid, int __user *stat_addr,
 	wo.wo_pid	= pid;
 	wo.wo_flags	= options | WEXITED;
 	wo.wo_info	= NULL;
-	wo.wo_stat	= stat_addr;
+	wo.wo_stat	= 0;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 	put_pid(pid);
+	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
+		ret = -EFAULT;
 
 	return ret;
 }
-- 
cgit v1.3-14-g43fede


From 67d7ddded322db99f451a7959d56ed6c70a6c4aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 20:53:13 -0400
Subject: waitid(2): leave copyout of siginfo to syscall itself

have kernel_waitid() collect the information needed for siginfo into
a small structure (waitid_info) passed to it; deal with copyout in
sys_waitid()/compat_sys_waitid().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 168 ++++++++++++++++++++++------------------------------------
 1 file changed, 64 insertions(+), 104 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 94cdccf8e7e7..42f26480b3cc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -996,12 +996,19 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
 	return 0;
 }
 
+struct waitid_info {
+	pid_t pid;
+	uid_t uid;
+	int status;
+	int cause;
+};
+
 struct wait_opts {
 	enum pid_type		wo_type;
 	int			wo_flags;
 	struct pid		*wo_pid;
 
-	struct siginfo __user	*wo_info;
+	struct waitid_info	*wo_info;
 	int			wo_stat;
 	struct rusage		*wo_rusage;
 
@@ -1053,8 +1060,7 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 				pid_t pid, uid_t uid, int why, int status)
 {
-	struct siginfo __user *infop;
-	int retval = 0;
+	struct waitid_info *infop;
 
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
@@ -1062,22 +1068,12 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 	put_task_struct(p);
 	infop = wo->wo_info;
 	if (infop) {
-		if (!retval)
-			retval = put_user(SIGCHLD, &infop->si_signo);
-		if (!retval)
-			retval = put_user(0, &infop->si_errno);
-		if (!retval)
-			retval = put_user((short)why, &infop->si_code);
-		if (!retval)
-			retval = put_user(pid, &infop->si_pid);
-		if (!retval)
-			retval = put_user(uid, &infop->si_uid);
-		if (!retval)
-			retval = put_user(status, &infop->si_status);
+		infop->cause = why;
+		infop->pid = pid;
+		infop->uid = uid;
+		infop->status = status;
 	}
-	if (!retval)
-		retval = pid;
-	return retval;
+	return pid;
 }
 
 /*
@@ -1088,10 +1084,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
  */
 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 {
-	int state, retval, status;
+	int state, status;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
-	struct siginfo __user *infop;
+	struct waitid_info *infop;
 
 	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
@@ -1186,36 +1182,22 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
-	retval = 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
 	wo->wo_stat = status;
 
 	infop = wo->wo_info;
-	if (!retval && infop)
-		retval = put_user(SIGCHLD, &infop->si_signo);
-	if (!retval && infop)
-		retval = put_user(0, &infop->si_errno);
-	if (!retval && infop) {
-		int why;
-
+	if (infop) {
 		if ((status & 0x7f) == 0) {
-			why = CLD_EXITED;
-			status >>= 8;
+			infop->cause = CLD_EXITED;
+			infop->status = status >> 8;
 		} else {
-			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
-			status &= 0x7f;
+			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			infop->status = status & 0x7f;
 		}
-		retval = put_user((short)why, &infop->si_code);
-		if (!retval)
-			retval = put_user(status, &infop->si_status);
+		infop->pid = pid;
+		infop->uid = uid;
 	}
-	if (!retval && infop)
-		retval = put_user(pid, &infop->si_pid);
-	if (!retval && infop)
-		retval = put_user(uid, &infop->si_uid);
-	if (!retval)
-		retval = pid;
 
 	if (state == EXIT_TRACE) {
 		write_lock_irq(&tasklist_lock);
@@ -1232,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (state == EXIT_DEAD)
 		release_task(p);
 
-	return retval;
+	return pid;
 }
 
 static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1268,8 +1250,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
 static int wait_task_stopped(struct wait_opts *wo,
 				int ptrace, struct task_struct *p)
 {
-	struct siginfo __user *infop;
-	int retval, exit_code, *p_code, why;
+	struct waitid_info *infop;
+	int exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
@@ -1320,28 +1302,19 @@ unlock_sig:
 
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
-	retval = 0;
 	wo->wo_stat = (exit_code << 8) | 0x7f;
 
 	infop = wo->wo_info;
-	if (!retval && infop)
-		retval = put_user(SIGCHLD, &infop->si_signo);
-	if (!retval && infop)
-		retval = put_user(0, &infop->si_errno);
-	if (!retval && infop)
-		retval = put_user((short)why, &infop->si_code);
-	if (!retval && infop)
-		retval = put_user(exit_code, &infop->si_status);
-	if (!retval && infop)
-		retval = put_user(pid, &infop->si_pid);
-	if (!retval && infop)
-		retval = put_user(uid, &infop->si_uid);
-	if (!retval)
-		retval = pid;
+	if (infop) {
+		infop->cause = why;
+		infop->status = exit_code;
+		infop->pid = pid;
+		infop->uid = uid;
+	}
 	put_task_struct(p);
 
-	BUG_ON(!retval);
-	return retval;
+	BUG_ON(!pid);
+	return pid;
 }
 
 /*
@@ -1618,7 +1591,7 @@ end:
 	return retval;
 }
 
-static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop,
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 			  int options, struct rusage *ru)
 {
 	struct wait_opts wo;
@@ -1660,27 +1633,8 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop,
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 
-	if (ret > 0) {
+	if (ret > 0)
 		ret = 0;
-	} else if (infop) {
-		/*
-		 * For a WNOHANG return, clear out all the fields
-		 * we would set so the user can easily tell the
-		 * difference.
-		 */
-		if (!ret)
-			ret = put_user(0, &infop->si_signo);
-		if (!ret)
-			ret = put_user(0, &infop->si_errno);
-		if (!ret)
-			ret = put_user(0, &infop->si_code);
-		if (!ret)
-			ret = put_user(0, &infop->si_pid);
-		if (!ret)
-			ret = put_user(0, &infop->si_uid);
-		if (!ret)
-			ret = put_user(0, &infop->si_status);
-	}
 
 	put_pid(pid);
 	return ret;
@@ -1690,12 +1644,24 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		infop, int, options, struct rusage __user *, ru)
 {
 	struct rusage r;
-	long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL);
+	struct waitid_info info = {.status = 0};
+	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
 
 	if (!err) {
 		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
 			return -EFAULT;
 	}
+	if (!infop)
+		return err;
+
+	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
+	    put_user(0, &infop->si_errno) ||
+	    put_user((short)info.cause, &infop->si_code) ||
+	    put_user(info.pid, &infop->si_pid) ||
+	    put_user(info.uid, &infop->si_uid) ||
+	    put_user(info.status, &infop->si_status))
+		err = -EFAULT;
+
 	return err;
 }
 
@@ -1785,33 +1751,27 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 		struct compat_siginfo __user *, infop, int, options,
 		struct compat_rusage __user *, uru)
 {
-	siginfo_t info;
 	struct rusage ru;
-	long ret;
-	mm_segment_t old_fs = get_fs();
-
-	memset(&info, 0, sizeof(info));
+	struct waitid_info info = {.status = 0};
+	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
 
-	set_fs(KERNEL_DS);
-	ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options,
-			 uru ? &ru : NULL);
-	set_fs(old_fs);
-
-	if ((ret < 0) || (info.si_signo == 0))
-		return ret;
-
-	if (uru) {
-		/* sys_waitid() overwrites everything in ru */
+	if (!err && uru) {
+		/* kernel_waitid() overwrites everything in ru */
 		if (COMPAT_USE_64BIT_TIME)
-			ret = copy_to_user(uru, &ru, sizeof(ru));
+			err = copy_to_user(uru, &ru, sizeof(ru));
 		else
-			ret = put_compat_rusage(&ru, uru);
-		if (ret)
+			err = put_compat_rusage(&ru, uru);
+		if (err)
 			return -EFAULT;
 	}
 
-	BUG_ON(info.si_code & __SI_MASK);
-	info.si_code |= __SI_CHLD;
-	return copy_siginfo_to_user32(infop, &info);
+	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
+	    put_user(0, &infop->si_errno) ||
+	    put_user((short)info.cause, &infop->si_code) ||
+	    put_user(info.pid, &infop->si_pid) ||
+	    put_user(info.uid, &infop->si_uid) ||
+	    put_user(info.status, &infop->si_status))
+		err = -EFAULT;
+	return err;
 }
 #endif
-- 
cgit v1.3-14-g43fede


From e61a250229fbf0f003e93676bf4d8a555a8c9eec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 21:25:03 -0400
Subject: lift getrusage() from wait_noreap_copyout()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 42f26480b3cc..d4f5097da85a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1062,9 +1062,6 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 {
 	struct waitid_info *infop;
 
-	if (wo->wo_rusage)
-		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
-
 	put_task_struct(p);
 	infop = wo->wo_info;
 	if (infop) {
@@ -1099,6 +1096,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		get_task_struct(p);
 		read_unlock(&tasklist_lock);
 		sched_annotate_sleep();
+		if (wo->wo_rusage)
+			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 
 		if ((exit_code & 0x7f) == 0) {
 			why = CLD_EXITED;
@@ -1296,12 +1295,12 @@ unlock_sig:
 	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 	sched_annotate_sleep();
+	if (wo->wo_rusage)
+		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 
 	if (unlikely(wo->wo_flags & WNOWAIT))
 		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
 
-	if (wo->wo_rusage)
-		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 	wo->wo_stat = (exit_code << 8) | 0x7f;
 
 	infop = wo->wo_info;
@@ -1350,10 +1349,10 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 	sched_annotate_sleep();
+	if (wo->wo_rusage)
+		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 
 	if (!wo->wo_info) {
-		if (wo->wo_rusage)
-			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 		put_task_struct(p);
 		wo->wo_stat = 0xffff;
 		retval = pid;
-- 
cgit v1.3-14-g43fede


From bb380ec33a7d8ee048e722889627869d21a5d527 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 21:33:21 -0400
Subject: kill wait_noreap_copyout()

folds into callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 65 +++++++++++++++++++++++------------------------------------
 1 file changed, 25 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index d4f5097da85a..f01ebaab978a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1057,22 +1057,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 	return 1;
 }
 
-static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
-				pid_t pid, uid_t uid, int why, int status)
-{
-	struct waitid_info *infop;
-
-	put_task_struct(p);
-	infop = wo->wo_info;
-	if (infop) {
-		infop->cause = why;
-		infop->pid = pid;
-		infop->uid = uid;
-		infop->status = status;
-	}
-	return pid;
-}
-
 /*
  * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
  * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
@@ -1091,22 +1075,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 
 	if (unlikely(wo->wo_flags & WNOWAIT)) {
 		int exit_code = p->exit_code;
-		int why;
 
 		get_task_struct(p);
 		read_unlock(&tasklist_lock);
 		sched_annotate_sleep();
 		if (wo->wo_rusage)
 			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+		put_task_struct(p);
 
-		if ((exit_code & 0x7f) == 0) {
-			why = CLD_EXITED;
-			status = exit_code >> 8;
-		} else {
-			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
-			status = exit_code & 0x7f;
+		infop = wo->wo_info;
+		if (infop) {
+			if ((exit_code & 0x7f) == 0) {
+				infop->cause = CLD_EXITED;
+				infop->status = exit_code >> 8;
+			} else {
+				infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+				infop->status = exit_code & 0x7f;
+			}
+			infop->pid = pid;
+			infop->uid = uid;
 		}
-		return wait_noreap_copyout(wo, p, pid, uid, why, status);
+		return pid;
 	}
 	/*
 	 * Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1297,11 +1286,10 @@ unlock_sig:
 	sched_annotate_sleep();
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+	put_task_struct(p);
 
-	if (unlikely(wo->wo_flags & WNOWAIT))
-		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
-
-	wo->wo_stat = (exit_code << 8) | 0x7f;
+	if (likely(!(wo->wo_flags & WNOWAIT)))
+		wo->wo_stat = (exit_code << 8) | 0x7f;
 
 	infop = wo->wo_info;
 	if (infop) {
@@ -1310,9 +1298,6 @@ unlock_sig:
 		infop->pid = pid;
 		infop->uid = uid;
 	}
-	put_task_struct(p);
-
-	BUG_ON(!pid);
 	return pid;
 }
 
@@ -1324,7 +1309,7 @@ unlock_sig:
  */
 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 {
-	int retval;
+	struct waitid_info *infop;
 	pid_t pid;
 	uid_t uid;
 
@@ -1351,18 +1336,18 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 	sched_annotate_sleep();
 	if (wo->wo_rusage)
 		getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+	put_task_struct(p);
 
-	if (!wo->wo_info) {
-		put_task_struct(p);
+	infop = wo->wo_info;
+	if (!infop) {
 		wo->wo_stat = 0xffff;
-		retval = pid;
 	} else {
-		retval = wait_noreap_copyout(wo, p, pid, uid,
-					     CLD_CONTINUED, SIGCONT);
-		BUG_ON(retval == 0);
+		infop->cause = CLD_CONTINUED;
+		infop->pid = pid;
+		infop->uid = uid;
+		infop->status = SIGCONT;
 	}
-
-	return retval;
+	return pid;
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 76d9871e1122aabc086e7aade5251b1e5124cbb9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 21:38:26 -0400
Subject: wait_task_zombie: consolidate info logics

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index f01ebaab978a..97db9ee03f90 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1074,28 +1074,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		return 0;
 
 	if (unlikely(wo->wo_flags & WNOWAIT)) {
-		int exit_code = p->exit_code;
-
+		status = p->exit_code;
 		get_task_struct(p);
 		read_unlock(&tasklist_lock);
 		sched_annotate_sleep();
 		if (wo->wo_rusage)
 			getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 		put_task_struct(p);
-
-		infop = wo->wo_info;
-		if (infop) {
-			if ((exit_code & 0x7f) == 0) {
-				infop->cause = CLD_EXITED;
-				infop->status = exit_code >> 8;
-			} else {
-				infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
-				infop->status = exit_code & 0x7f;
-			}
-			infop->pid = pid;
-			infop->uid = uid;
-		}
-		return pid;
+		goto out_info;
 	}
 	/*
 	 * Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1174,19 +1160,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		? p->signal->group_exit_code : p->exit_code;
 	wo->wo_stat = status;
 
-	infop = wo->wo_info;
-	if (infop) {
-		if ((status & 0x7f) == 0) {
-			infop->cause = CLD_EXITED;
-			infop->status = status >> 8;
-		} else {
-			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
-			infop->status = status & 0x7f;
-		}
-		infop->pid = pid;
-		infop->uid = uid;
-	}
-
 	if (state == EXIT_TRACE) {
 		write_lock_irq(&tasklist_lock);
 		/* We dropped tasklist, ptracer could die and untrace */
@@ -1202,6 +1175,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 	if (state == EXIT_DEAD)
 		release_task(p);
 
+out_info:
+	infop = wo->wo_info;
+	if (infop) {
+		if ((status & 0x7f) == 0) {
+			infop->cause = CLD_EXITED;
+			infop->status = status >> 8;
+		} else {
+			infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			infop->status = status & 0x7f;
+		}
+		infop->pid = pid;
+		infop->uid = uid;
+	}
+
 	return pid;
 }
 
-- 
cgit v1.3-14-g43fede


From 4c48abe91be03d191d0c20cc755877da2cb35622 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 19:27:32 -0400
Subject: waitid(): switch copyout of siginfo to unsafe_put_user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 97db9ee03f90..f3b8c3a87bc1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1625,15 +1625,18 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
-	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
-	    put_user(0, &infop->si_errno) ||
-	    put_user((short)info.cause, &infop->si_code) ||
-	    put_user(info.pid, &infop->si_pid) ||
-	    put_user(info.uid, &infop->si_uid) ||
-	    put_user(info.status, &infop->si_status))
-		err = -EFAULT;
-
+	user_access_begin();
+	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
 	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 
 static long kernel_wait4(pid_t upid, int __user *stat_addr,
@@ -1736,13 +1739,20 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 			return -EFAULT;
 	}
 
-	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
-	    put_user(0, &infop->si_errno) ||
-	    put_user((short)info.cause, &infop->si_code) ||
-	    put_user(info.pid, &infop->si_pid) ||
-	    put_user(info.uid, &infop->si_uid) ||
-	    put_user(info.status, &infop->si_status))
-		err = -EFAULT;
+	if (!infop)
+		return err;
+
+	user_access_begin();
+	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
 	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 #endif
-- 
cgit v1.3-14-g43fede


From 92ebce5ac55dba258c608248dddf59eca3f7f514 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 May 2017 23:54:33 -0400
Subject: osf_wait4: switch to kernel_wait4()

... and sanitize copying rusage to userland

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c | 53 ++++++++++++---------------------------------
 include/linux/sched/task.h  |  2 ++
 kernel/exit.c               |  4 ++--
 3 files changed, 18 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index ce93124a850b..b23d6fbbb225 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1183,48 +1183,23 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru)
 SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options,
 		struct rusage32 __user *, ur)
 {
-	struct rusage r;
-	long ret, err;
 	unsigned int status = 0;
-	mm_segment_t old_fs;
-
+	struct rusage r;
+	long err = kernel_wait4(pid, &status, options, &r);
+	if (err <= 0)
+		return err;
+	if (put_user(status, ustatus))
+		return -EFAULT;
 	if (!ur)
-		return sys_wait4(pid, ustatus, options, NULL);
-
-	old_fs = get_fs();
-		
-	set_fs (KERNEL_DS);
-	ret = sys_wait4(pid, (unsigned int __user *) &status, options,
-			(struct rusage __user *) &r);
-	set_fs (old_fs);
-
-	if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur)))
+		return err;
+	if (put_tv32(&ur->ru_utime, &r.ru_utime))
 		return -EFAULT;
-
-	err = put_user(status, ustatus);
-	if (ret < 0)
-		return err ? err : ret;
-
-	err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec);
-	err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec);
-	err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec);
-	err |= __put_user(r.ru_stime.tv_usec, &ur->ru_stime.tv_usec);
-	err |= __put_user(r.ru_maxrss, &ur->ru_maxrss);
-	err |= __put_user(r.ru_ixrss, &ur->ru_ixrss);
-	err |= __put_user(r.ru_idrss, &ur->ru_idrss);
-	err |= __put_user(r.ru_isrss, &ur->ru_isrss);
-	err |= __put_user(r.ru_minflt, &ur->ru_minflt);
-	err |= __put_user(r.ru_majflt, &ur->ru_majflt);
-	err |= __put_user(r.ru_nswap, &ur->ru_nswap);
-	err |= __put_user(r.ru_inblock, &ur->ru_inblock);
-	err |= __put_user(r.ru_oublock, &ur->ru_oublock);
-	err |= __put_user(r.ru_msgsnd, &ur->ru_msgsnd);
-	err |= __put_user(r.ru_msgrcv, &ur->ru_msgrcv);
-	err |= __put_user(r.ru_nsignals, &ur->ru_nsignals);
-	err |= __put_user(r.ru_nvcsw, &ur->ru_nvcsw);
-	err |= __put_user(r.ru_nivcsw, &ur->ru_nivcsw);
-
-	return err ? err : ret;
+	if (put_tv32(&ur->ru_stime, &r.ru_stime))
+		return -EFAULT;
+	if (copy_to_user(&ur->ru_maxrss, &r.ru_maxrss,
+	      sizeof(struct rusage32) - offsetof(struct rusage32, ru_maxrss)))
+		return -EFAULT;
+	return err;
 }
 
 /*
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a978d7189cfd..6b830fd9d809 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 
 struct task_struct;
+struct rusage;
 union thread_union;
 
 /*
@@ -74,6 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+extern long kernel_wait4(pid_t, int *, int, struct rusage *);
 
 extern void free_task(struct task_struct *tsk);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f3b8c3a87bc1..462fc25eec6e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1639,8 +1639,8 @@ Efault:
 	return -EFAULT;
 }
 
-static long kernel_wait4(pid_t upid, int __user *stat_addr,
-			int options, struct rusage *ru)
+long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
+		  struct rusage *ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
-- 
cgit v1.3-14-g43fede


From 6f9a22bc5775d231ab8fbe2c2f3c88e45e3e7c28 Mon Sep 17 00:00:00 2001
From: Michael Hernandez <michael.hernandez@cavium.com>
Date: Thu, 18 May 2017 10:47:47 -0700
Subject: PCI/MSI: Ignore affinity if pre/post vector count is more than
 min_vecs

min_vecs is the minimum number of vectors needed to operate in MSI-X mode
which may just include the vectors that don't need affinity.

Disabling affinity settings causes the qla2xxx driver scsi_add_host() to fail
when blk_mq is enabled as the blk_mq_pci_map_queues() expects affinity masks
on each vector.

Fixes: dfef358bd1be ("PCI/MSI: Don't apply affinity if there aren't enough vectors left")
Signed-off-by: Michael Hernandez <michael.hernandez@cavium.com>
Signed-off-by: Himanshu Madhani <himanshu.madhani@cavium.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org	# v4.10+
---
 drivers/pci/msi.c         | 14 ++------------
 include/linux/interrupt.h |  4 ++--
 kernel/irq/affinity.c     | 13 ++++++++++++-
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index ba44fdfda66b..9e1569107cd6 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1058,7 +1058,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 
 	for (;;) {
 		if (affd) {
-			nvec = irq_calc_affinity_vectors(nvec, affd);
+			nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
 			if (nvec < minvec)
 				return -ENOSPC;
 		}
@@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 
 	for (;;) {
 		if (affd) {
-			nvec = irq_calc_affinity_vectors(nvec, affd);
+			nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
 			if (nvec < minvec)
 				return -ENOSPC;
 		}
@@ -1165,16 +1165,6 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 	if (flags & PCI_IRQ_AFFINITY) {
 		if (!affd)
 			affd = &msi_default_affd;
-
-		if (affd->pre_vectors + affd->post_vectors > min_vecs)
-			return -EINVAL;
-
-		/*
-		 * If there aren't any vectors left after applying the pre/post
-		 * vectors don't bother with assigning affinity.
-		 */
-		if (affd->pre_vectors + affd->post_vectors == min_vecs)
-			affd = NULL;
 	} else {
 		if (WARN_ON(affd))
 			affd = NULL;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a6fba4804672..0991f973f8ca 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -291,7 +291,7 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
-int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd);
+int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
 
@@ -331,7 +331,7 @@ irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
 }
 
 static inline int
-irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
 {
 	return maxvec;
 }
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e2d356dd7581..9b71406d2eec 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -66,6 +66,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	struct cpumask *masks;
 	cpumask_var_t nmsk;
 
+	/*
+	 * If there aren't any vectors left after applying the pre/post
+	 * vectors don't bother with assigning affinity.
+	 */
+	if (!affv)
+		return NULL;
+
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
 		return NULL;
 
@@ -140,15 +147,19 @@ out:
 
 /**
  * irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec:	The minimum number of vectors available
  * @maxvec:	The maximum number of vectors available
  * @affd:	Description of the affinity requirements
  */
-int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
 {
 	int resv = affd->pre_vectors + affd->post_vectors;
 	int vecs = maxvec - resv;
 	int cpus;
 
+	if (resv > minvec)
+		return 0;
+
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	cpus = cpumask_weight(cpu_online_mask);
-- 
cgit v1.3-14-g43fede


From fe17a42e704a64477b15bb2cf8366fe3e5119aff Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 12 May 2017 12:55:35 +0100
Subject: irqdomain: Let irq_domain_mapping display hierarchical domains

Hierarchical domains seem to be hard to grasp, and a number of
aspiring kernel hackers find them utterly discombobulating.

In order to ease their pain, let's make them appear in
/sys/kernel/debug/irq_domain_mapping, such as the following:

   96  0x81808  MSI    0x          (null) RADIX   MSI
   96+ 0x00063  GICv2m 0xffff8003ee116980 RADIX   GICv2m
   96+ 0x00063  GICv2  0xffff00000916bfd8 LINEAR  GICv2

[output compressed to fit in a commit log]

This shows that IRQ96 is implemented by a stack of three domains,
the + sign indicating the stacking.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/20170512115538.10767-2-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 68 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 31805f237396..1f6cd2cacf74 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 EXPORT_SYMBOL_GPL(irq_find_mapping);
 
 #ifdef CONFIG_IRQ_DOMAIN_DEBUG
+static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
+{
+	struct irq_domain *domain;
+	struct irq_data *data;
+
+	domain = desc->irq_data.domain;
+	data = &desc->irq_data;
+
+	while (domain) {
+		unsigned int irq = data->irq;
+		unsigned long hwirq = data->hwirq;
+		struct irq_chip *chip;
+		bool direct;
+
+		if (data == &desc->irq_data)
+			seq_printf(m, "%5d  ", irq);
+		else
+			seq_printf(m, "%5d+ ", irq);
+		seq_printf(m, "0x%05lx  ", hwirq);
+
+		chip = irq_data_get_irq_chip(data);
+		seq_printf(m, "%-15s  ", (chip && chip->name) ? chip->name : "none");
+
+		seq_printf(m, data ? "0x%p  " : "  %p  ",
+			   irq_data_get_irq_chip_data(data));
+
+		seq_printf(m, "   %c    ", (desc->action && desc->action->handler) ? '*' : ' ');
+		direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
+		seq_printf(m, "%6s%-8s  ",
+			   (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
+			   direct ? "(DIRECT)" : "");
+		seq_printf(m, "%s\n", domain->name);
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+		domain = domain->parent;
+		data = data->parent_data;
+#else
+		domain = NULL;
+#endif
+	}
+}
+
 static int virq_debug_show(struct seq_file *m, void *private)
 {
 	unsigned long flags;
 	struct irq_desc *desc;
 	struct irq_domain *domain;
 	struct radix_tree_iter iter;
-	void *data, **slot;
+	void **slot;
 	int i;
 
 	seq_printf(m, " %-16s  %-6s  %-10s  %-10s  %s\n",
@@ -782,30 +823,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
 			continue;
 
 		raw_spin_lock_irqsave(&desc->lock, flags);
-		domain = desc->irq_data.domain;
-
-		if (domain) {
-			struct irq_chip *chip;
-			int hwirq = desc->irq_data.hwirq;
-			bool direct;
-
-			seq_printf(m, "%5d  ", i);
-			seq_printf(m, "0x%05x  ", hwirq);
-
-			chip = irq_desc_get_chip(desc);
-			seq_printf(m, "%-15s  ", (chip && chip->name) ? chip->name : "none");
-
-			data = irq_desc_get_chip_data(desc);
-			seq_printf(m, data ? "0x%p  " : "  %p  ", data);
-
-			seq_printf(m, "   %c    ", (desc->action && desc->action->handler) ? '*' : ' ');
-			direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
-			seq_printf(m, "%6s%-8s  ",
-				   (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
-				   direct ? "(DIRECT)" : "");
-			seq_printf(m, "%s\n", desc->irq_data.domain->name);
-		}
-
+		virq_debug_show_one(m, desc);
 		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
 
-- 
cgit v1.3-14-g43fede


From 2370c00dc7232d0c4af224e7730b4de031f3b1a0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 12 May 2017 12:55:36 +0100
Subject: irqdomain: Let irq_domain_mapping display ACPI fwnode attributes

If the system is using ACPI, there is no of_node to display. But ACPI can
use a struct irqchip_fwid as a domain identifier, and it can be used to
display the name contained in that structure.

The output on such a system will look like this:

 pMSI      0           0           0  irqchip@00000000e1180000
 MSI      37           0           0  irqchip@00000000e1180000
 GICv2m   37           0           0  irqchip@00000000e1180000
 GICv2   448         448           0  irqchip@ffff000008003000

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/20170512115538.10767-3-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f6cd2cacf74..70b9da72018b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -801,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private)
 	mutex_lock(&irq_domain_mutex);
 	list_for_each_entry(domain, &irq_domain_list, link) {
 		struct device_node *of_node;
+		const char *name;
+
 		int count = 0;
+
 		of_node = irq_domain_get_of_node(domain);
+		if (of_node)
+			name = of_node_full_name(of_node);
+		else if (is_fwnode_irqchip(domain->fwnode))
+			name = container_of(domain->fwnode, struct irqchip_fwid,
+					    fwnode)->name;
+		else
+			name = "";
+
 		radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
 			count++;
 		seq_printf(m, "%c%-16s  %6u  %10u  %10u  %s\n",
 			   domain == irq_default_domain ? '*' : ' ', domain->name,
 			   domain->revmap_size + count, domain->revmap_size,
 			   domain->revmap_direct_max_irq,
-			   of_node ? of_node_full_name(of_node) : "");
+			   name);
 	}
 	mutex_unlock(&irq_domain_mutex);
 
-- 
cgit v1.3-14-g43fede


From a97b852b4d4c2f8c50cab13c71566639f9a1a990 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 12 May 2017 12:55:37 +0100
Subject: genirq/msi: Populate the domain name if provided by the irqchip

In order to ease debug, let's populate the domain name upfront, before any
MSI gets requested. This allows the domain to appear in the
irq_domain_mapping, and the user to easily find the expected data.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/20170512115538.10767-4-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/msi.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index ddc2f5427f75..fe4d48ec5bc4 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
 					 struct msi_domain_info *info,
 					 struct irq_domain *parent)
 {
+	struct irq_domain *domain;
+
 	if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
 		msi_domain_update_dom_ops(info);
 	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
 		msi_domain_update_chip_ops(info);
 
-	return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
-					   fwnode, &msi_domain_ops, info);
+	domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
+					     fwnode, &msi_domain_ops, info);
+	if (domain && info->chip && info->chip->name)
+		domain->name = info->chip->name;
+
+	return domain;
 }
 
 int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
-- 
cgit v1.3-14-g43fede


From 85c617abc786d7da9e95c0b4174159864dd3f85c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 22 May 2017 12:03:49 +0300
Subject: perf/core: Remove some dead code

perf_init_event() can't return NULL.  If it did, the error handling is
incomplete and we would crash.  I have removed this confusing dead code.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/20170522090348.5g7yyld5en3yeky4@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..0028efa0abc3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9172,7 +9172,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 
 static struct pmu *perf_init_event(struct perf_event *event)
 {
-	struct pmu *pmu = NULL;
+	struct pmu *pmu;
 	int idx;
 	int ret;
 
@@ -9456,9 +9456,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	}
 
 	pmu = perf_init_event(event);
-	if (!pmu)
-		goto err_ns;
-	else if (IS_ERR(pmu)) {
+	if (IS_ERR(pmu)) {
 		err = PTR_ERR(pmu);
 		goto err_ns;
 	}
-- 
cgit v1.3-14-g43fede


From 36cc2b9222b5106de34085c4dd8635ac67ef5cba Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 22 May 2017 12:04:18 +0300
Subject: perf/core: Fix error handling in perf_event_alloc()

We don't set an error code here which means that perf_event_alloc()
returns ERR_PTR(0) (in other words NULL).  The callers are not expecting
that and would Oops.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 375637bc5249 ("perf/core: Introduce address range filtering")
Link: http://lkml.kernel.org/r/20170522090418.hvs6icgpdo53wkn5@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0028efa0abc3..f8c27d3ef3a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9469,8 +9469,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
 						   sizeof(unsigned long),
 						   GFP_KERNEL);
-		if (!event->addr_filters_offs)
+		if (!event->addr_filters_offs) {
+			err = -ENOMEM;
 			goto err_per_task;
+		}
 
 		/* force hw sync on the address filters */
 		event->addr_filters_gen = 1;
-- 
cgit v1.3-14-g43fede


From 3fc5b3b6a80b2e08a0fec0056208c5dff757e547 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Fri, 19 May 2017 15:53:31 +0800
Subject: smp: Avoid sending needless IPI in smp_call_function_many()

An Inter-Processor Interrupt (IPI) is needed when a page is unmapped and the
process' mm_cpumask() shows the process has ever run on other CPUs. Page
migration and page reclaim both need IPIs. The number of IPIs that must be
sent to different CPUs is especially large for multi-threaded workloads since
mm_cpumask() is per process.

For smp_call_function_many(), whenever a CPU queues a CSD to a target
CPU, it sends an IPI to let the target CPU handle the work.
This isn't necessary - we only need to send an IPI when queueing a CSD
to an empty call_single_queue.

The reason:

flush_smp_call_function_queue() that is called upon a CPU receiving an
IPI will empty the queue and then handle all of the CSDs there. So if
the target CPU's call_single_queue is not empty, we know that:
i.  An IPI for the target CPU has already been sent by 'previous queuers';
ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet.
Thus, it's safe for us to just queue our CSD there without sending an
additional IPI. And for the 'previous queuers', we can limit it to the
first queuer.

To demonstrate the effect of this patch, a multi-thread workload that
spawns 80 threads to equally consume 100G memory is used. This is tested
on a 2 node broadwell-EP which has 44cores/88threads and 32G memory. So
after 32G memory is used up, page reclaiming starts to happen a lot.

With this patch, IPI number dropped 88% and throughput increased about
15% for the above workload.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/20170519075331.GE2084@aaronlu.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/smp.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index a817769b53c0..76d16fe3c427 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -30,6 +30,7 @@ enum {
 struct call_function_data {
 	struct call_single_data	__percpu *csd;
 	cpumask_var_t		cpumask;
+	cpumask_var_t		cpumask_ipi;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu)
 	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
 				     cpu_to_node(cpu)))
 		return -ENOMEM;
+	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+				     cpu_to_node(cpu))) {
+		free_cpumask_var(cfd->cpumask);
+		return -ENOMEM;
+	}
 	cfd->csd = alloc_percpu(struct call_single_data);
 	if (!cfd->csd) {
 		free_cpumask_var(cfd->cpumask);
+		free_cpumask_var(cfd->cpumask_ipi);
 		return -ENOMEM;
 	}
 
@@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
 	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
 
 	free_cpumask_var(cfd->cpumask);
+	free_cpumask_var(cfd->cpumask_ipi);
 	free_percpu(cfd->csd);
 	return 0;
 }
@@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	if (unlikely(!cpumask_weight(cfd->cpumask)))
 		return;
 
+	cpumask_clear(cfd->cpumask_ipi);
 	for_each_cpu(cpu, cfd->cpumask) {
 		struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
 
@@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask,
 			csd->flags |= CSD_FLAG_SYNCHRONOUS;
 		csd->func = func;
 		csd->info = info;
-		llist_add(&csd->llist, &per_cpu(call_single_queue, cpu));
+		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
+			cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(cfd->cpumask);
+	arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
 
 	if (wait) {
 		for_each_cpu(cpu, cfd->cpumask) {
-- 
cgit v1.3-14-g43fede


From 6c8557bdb28df3ae97476c5e2aed6373cd235aab Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 19 May 2017 12:58:25 +0200
Subject: smp, cpumask: Use non-atomic cpumask_{set,clear}_cpu()

The cpumasks in smp_call_function_many() are private and not subject
to concurrency, atomic bitops are pointless and expensive.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/cpumask.h | 11 +++++++++++
 kernel/smp.c            |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index a21b1fb9a968..4bf4479a3a80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -293,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
 	set_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
 
+static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
+{
+	__set_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
+
 /**
  * cpumask_clear_cpu - clear a cpu in a cpumask
  * @cpu: cpu number (< nr_cpu_ids)
@@ -303,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
 	clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
 
+static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
+{
+	__clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
+}
+
 /**
  * cpumask_test_cpu - test for a cpu in a cpumask
  * @cpu: cpu number (< nr_cpu_ids)
diff --git a/kernel/smp.c b/kernel/smp.c
index 76d16fe3c427..3061483cb3ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -436,7 +436,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	cfd = this_cpu_ptr(&cfd_data);
 
 	cpumask_and(cfd->cpumask, mask, cpu_online_mask);
-	cpumask_clear_cpu(this_cpu, cfd->cpumask);
+	__cpumask_clear_cpu(this_cpu, cfd->cpumask);
 
 	/* Some callers race with other cpus changing the passed mask */
 	if (unlikely(!cpumask_weight(cfd->cpumask)))
@@ -452,7 +452,7 @@ void smp_call_function_many(const struct cpumask *mask,
 		csd->func = func;
 		csd->info = info;
 		if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)))
-			cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+			__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 	}
 
 	/* Send a message to all CPUs in the map */
-- 
cgit v1.3-14-g43fede


From 73215849dfbf63421c1cafea5a1b6da9bb17831e Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Fri, 12 May 2017 09:39:44 +0900
Subject: sched/core: Use the new llist_for_each_entry_safe() primitive

Now that we've added llist_for_each_entry_safe(), use it to simplify
an open coded version of it in sched_ttwu_pending().

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <kernel-team@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1494549584-11730-1-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dde5d1e860f0..4a31239956da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *llist = llist_del_all(&rq->wake_list);
-	struct task_struct *p;
+	struct task_struct *p, *t;
 	struct rq_flags rf;
 
 	if (!llist)
@@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 
-	while (llist) {
-		int wake_flags = 0;
-
-		p = llist_entry(llist, struct task_struct, wake_entry);
-		llist = llist_next(llist);
-
-		if (p->sched_remote_wakeup)
-			wake_flags = WF_MIGRATED;
-
-		ttwu_do_activate(rq, p, wake_flags, &rf);
-	}
+	llist_for_each_entry_safe(p, t, llist, wake_entry)
+		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
 
 	rq_unlock_irqrestore(rq, &rf);
 }
-- 
cgit v1.3-14-g43fede


From de16b91effdbf5aeff8346b99bcd0991a5362db9 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Fri, 12 May 2017 10:05:43 +0900
Subject: sched/rt: Remove unnecessary condition in push_rt_task()

pick_next_pushable_task(rq) has BUG_ON(rq_cpu != task_cpu(task)) when
it returns a task other than NULL, which means that task_cpu(task) must
be rq->cpu. So if task == next_task, then task_cpu(next_task) must be
rq->cpu as well. Remove the redundant condition and make the code simpler.

This way one unnecessary branch and two LOAD operations can be avoided.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Juri Lelli <juri.lelli@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: <kernel-team@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1494551143-22219-1-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b7341008a..c18b50094fab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1819,7 +1819,7 @@ retry:
 		 * pushing.
 		 */
 		task = pick_next_pushable_task(rq);
-		if (task_cpu(next_task) == rq->cpu && task == next_task) {
+		if (task == next_task) {
 			/*
 			 * The task hasn't migrated, and is still the next
 			 * eligible task, but we failed to find a run-queue
-- 
cgit v1.3-14-g43fede


From a776b968e52895a350d636e6e7fdcb3b10846fa4 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Fri, 12 May 2017 10:05:59 +0900
Subject: sched/deadline: Remove unnecessary condition in push_dl_task()

pick_next_pushable_dl_task(rq) has BUG_ON(rq->cpu != task_cpu(task))
when it returns a task other than NULL, which means that task_cpu(task)
must be rq->cpu. So if task == next_task, then task_cpu(next_task) must
be rq->cpu as well. Remove the redundant condition and make the code simpler.

This way one unnecessary branch and two LOAD operations can be avoided.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Juri Lelli <juri.lelli@arm.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: <kernel-team@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1494551159-22367-1-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce59015642..df6c2912bd60 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1533,7 +1533,7 @@ retry:
 		 * then possible that next_task has migrated.
 		 */
 		task = pick_next_pushable_dl_task(rq);
-		if (task_cpu(next_task) == rq->cpu && task == next_task) {
+		if (task == next_task) {
 			/*
 			 * The task is still there. We don't try
 			 * again, some other cpu will pull it when ready.
-- 
cgit v1.3-14-g43fede


From 896bbb2522587e3b8eb2a0d204d43ccc1042a00d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 9 Mar 2017 10:18:42 -0500
Subject: sched/core: Allow __sched_setscheduler() in interrupts when PI is not
 used

When priority inheritance was added back in 2.6.18 to sched_setscheduler(), it
added a path to taking an rt-mutex wait_lock, which is not IRQ safe. As PI
is not a common occurrence, lockdep will likely never trigger if
sched_setscheduler was called from interrupt context. A BUG_ON() was added
to trigger if __sched_setscheduler() was ever called from interrupt context
because there was a possibility to take the wait_lock.

Today the wait_lock is irq safe, but the path to taking it in
sched_setscheduler() is the same as the path to taking it from normal
context. The wait_lock is taken with raw_spin_lock_irq() and released with
raw_spin_unlock_irq() which will indiscriminately enable interrupts,
which would be bad in interrupt context.

The problem is that normalize_rt_tasks(), which is called by triggering the
sysrq nice-all-RT-tasks, was changed to call __sched_setscheduler(), and this
is done from interrupt context!

Now __sched_setscheduler() takes a "pi" parameter that is used to know if
the priority inheritance should be called or not. As the BUG_ON() only cares
about calling the PI code, it should only bug if called from interrupt
context with the "pi" parameter set to true.

Reported-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: dbc7f069b93a ("sched: Use replace normalize_task() with __sched_setscheduler()")
Link: http://lkml.kernel.org/r/20170308124654.10e598f2@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4a31239956da..877241e9f2b0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4188,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p,
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	struct rq *rq;
 
-	/* May grab non-irq protected spin_locks: */
-	BUG_ON(in_interrupt());
+	/* The pi code expects interrupts enabled */
+	BUG_ON(pi && in_interrupt());
 recheck:
 	/* Double check policy once rq lock held: */
 	if (policy < 0) {
-- 
cgit v1.3-14-g43fede


From c249f255aab86b9b187ba319b9d2684841ac7c8d Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <dave.kleikamp@oracle.com>
Date: Mon, 15 May 2017 14:14:13 -0500
Subject: sched/rt: Minimize rq->lock contention in do_sched_rt_period_timer()

With CONFIG_RT_GROUP_SCHED=y, do_sched_rt_period_timer() sequentially
takes each CPU's rq->lock. On a large, busy system, the cumulative time it
takes to acquire each lock can be excessive, even triggering a watchdog
timeout.

If rt_rq->rt_time and rt_rq->rt_nr_running are both zero, this function does
nothing while holding the lock, so don't bother taking it at all.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a767637b-df85-912f-ba69-c90ee00a3fb6@oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c18b50094fab..581d5c7a5264 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 		struct rq *rq = rq_of_rt_rq(rt_rq);
+		int skip;
+
+		/*
+		 * When span == cpu_online_mask, taking each rq->lock
+		 * can be time-consuming. Try to avoid it when possible.
+		 */
+		raw_spin_lock(&rt_rq->rt_runtime_lock);
+		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
+		raw_spin_unlock(&rt_rq->rt_runtime_lock);
+		if (skip)
+			continue;
 
 		raw_spin_lock(&rq->lock);
 		if (rt_rq->rt_time) {
-- 
cgit v1.3-14-g43fede


From 8655d5497735b288f8a9b458bd22e7d1bf95bb61 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 15 May 2017 15:13:16 +0200
Subject: sched/numa: Use down_read_trylock() for the mmap_sem

A customer has reported a soft-lockup when running an intensive
memory stress test, where the trace on multiple CPUs looks like this:

 RIP: 0010:[<ffffffff810c53fe>]
  [<ffffffff810c53fe>] native_queued_spin_lock_slowpath+0x10e/0x190
...
 Call Trace:
  [<ffffffff81182d07>] queued_spin_lock_slowpath+0x7/0xa
  [<ffffffff811bc331>] change_protection_range+0x3b1/0x930
  [<ffffffff811d4be8>] change_prot_numa+0x18/0x30
  [<ffffffff810adefe>] task_numa_work+0x1fe/0x310
  [<ffffffff81098322>] task_work_run+0x72/0x90

Further investigation showed that the lock contention here is pmd_lock().

The task_numa_work() function makes sure that only one thread is allowed to perform
the work in a single scan period (via cmpxchg), but if there's a thread with
mmap_sem locked for writing for several periods, multiple threads in
task_numa_work() can build up a convoy waiting for mmap_sem for read and then
all get unblocked at once.

This patch changes the down_read() to the trylock version, which prevents the
build up. For a workload experiencing mmap_sem contention, it's probably better
to postpone the NUMA balancing work anyway. This seems to have fixed the soft
lockups involving pmd_lock(), which is in line with the convoy theory.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170515131316.21909-1-vbabka@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 219fe58e3023..47a0c552c77b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2470,7 +2470,8 @@ void task_numa_work(struct callback_head *work)
 		return;
 
 
-	down_read(&mm->mmap_sem);
+	if (!down_read_trylock(&mm->mmap_sem))
+		return;
 	vma = find_vma(mm, start);
 	if (!vma) {
 		reset_ptenuma_scan(p);
-- 
cgit v1.3-14-g43fede


From b4def42724594cd399cfee365221f5b38639711d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:43 +0200
Subject: async: Adjust system_state checks

To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state check in async_run_entry_fn() and
async_synchronize_cookie_domain() to handle the extra states.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184735.865155020@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/async.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index d2edd6efec56..2cbd3dd5940d 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work)
 	ktime_t uninitialized_var(calltime), delta, rettime;
 
 	/* 1) run (and print duration) */
-	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		pr_debug("calling  %lli_%pF @ %i\n",
 			(long long)entry->cookie,
 			entry->func, task_pid_nr(current));
 		calltime = ktime_get();
 	}
 	entry->func(entry->data, entry->cookie);
-	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		rettime = ktime_get();
 		delta = ktime_sub(rettime, calltime);
 		pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
@@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
 {
 	ktime_t uninitialized_var(starttime), delta, endtime;
 
-	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		pr_debug("async_waiting @ %i\n", task_pid_nr(current));
 		starttime = ktime_get();
 	}
 
 	wait_event(async_done, lowest_in_progress(domain) >= cookie);
 
-	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+	if (initcall_debug && system_state < SYSTEM_RUNNING) {
 		endtime = ktime_get();
 		delta = ktime_sub(endtime, starttime);
 
-- 
cgit v1.3-14-g43fede


From 0594729c24d846889408a07057b5cc9e8d931419 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:44 +0200
Subject: extable: Adjust system_state checks

To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state check in core_kernel_text() to handle the extra
states, i.e. to cover init text up to the point where the system switches
to state RUNNING.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184735.949992741@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/extable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 2676d7f8baf6..0fbdd8582f08 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr)
 	    addr < (unsigned long)_etext)
 		return 1;
 
-	if (system_state == SYSTEM_BOOTING &&
+	if (system_state < SYSTEM_RUNNING &&
 	    init_kernel_text(addr))
 		return 1;
 	return 0;
-- 
cgit v1.3-14-g43fede


From ff48cd26fc4889b9deb5f9333d3c61746e450b7f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:45 +0200
Subject: printk: Adjust system_state checks

To enable smp_processor_id() and might_sleep() debug checks earlier, it's
required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING.

Adjust the system_state check in boot_delay_msec() to handle the extra
states.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170516184736.027534895@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a1aecf44ab07..32fac391ac2a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level)
 	unsigned long long k;
 	unsigned long timeout;
 
-	if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
+	if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING)
 		|| suppress_message_printing(level)) {
 		return;
 	}
-- 
cgit v1.3-14-g43fede


From 1c3c5eab171590f86edd8d31389d61dd1efe3037 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 May 2017 20:42:48 +0200
Subject: sched/core: Enable might_sleep() and smp_processor_id() checks early

might_sleep() and smp_processor_id() checks are enabled after the boot
process is done. That hides bugs in the SMP bringup and driver
initialization code.

Enable it right when the scheduler starts working, i.e. when init task and
kthreadd have been created and right before the idle task enables
preemption.

Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170516184736.272225698@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/main.c            | 10 ++++++++++
 kernel/sched/core.c    |  4 +++-
 lib/smp_processor_id.c |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/init/main.c b/init/main.c
index badae3bf08f1..df58a416dd1d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -414,6 +414,16 @@ static noinline void __ref rest_init(void)
 	rcu_read_lock();
 	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
 	rcu_read_unlock();
+
+	/*
+	 * Enable might_sleep() and smp_processor_id() checks.
+	 * They cannot be enabled earlier because with CONFIG_PREEMPT=y
+	 * kernel_thread() would trigger might_sleep() splats. With
+	 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
+	 * already, but it's stuck on the kthreadd_done completion.
+	 */
+	system_state = SYSTEM_SCHEDULING;
+
 	complete(&kthreadd_done);
 
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 877241e9f2b0..c3e50cada84d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6238,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
 	     !is_idle_task(current)) ||
-	    system_state != SYSTEM_RUNNING || oops_in_progress)
+	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+	    oops_in_progress)
 		return;
+
 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 		return;
 	prev_jiffy = jiffies;
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 690d75b132fa..2fb007be0212 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1,
 	/*
 	 * It is valid to assume CPU-locality during early bootup:
 	 */
-	if (system_state != SYSTEM_RUNNING)
+	if (system_state < SYSTEM_SCHEDULING)
 		goto out;
 
 	/*
-- 
cgit v1.3-14-g43fede


From 4b3e4ed6b0d958d7fb2f160bb8ebfb4f0db19382 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 20 Apr 2017 13:07:30 -0400
Subject: audit: unswing cap_* fields in PATH records

The cap_* fields swing in and out of PATH records.
If no capabilities are set, the cap_* fields are completely missing and when
one of the cap_fi or cap_fp values is empty, that field is omitted.

Original:
type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL
type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fe=1 cap_fver=2

Normalize the PATH record by always printing all 4 cap_* fields.

Fixed:
type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL cap_fp=none cap_fi=none cap_fe=0 cap_fver=0
type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fi=none cap_fe=1 cap_fver=2

See: https://github.com/linux-audit/audit-kernel/issues/42

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index a7c6a50477aa..b2e877100242 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1999,22 +1999,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
 
 static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
 {
-	kernel_cap_t *perm = &name->fcap.permitted;
-	kernel_cap_t *inh = &name->fcap.inheritable;
-	int log = 0;
-
-	if (!cap_isclear(*perm)) {
-		audit_log_cap(ab, "cap_fp", perm);
-		log = 1;
-	}
-	if (!cap_isclear(*inh)) {
-		audit_log_cap(ab, "cap_fi", inh);
-		log = 1;
-	}
-
-	if (log)
-		audit_log_format(ab, " cap_fe=%d cap_fver=%x",
-				 name->fcap.fE, name->fcap_ver);
+	audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+	audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+	audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+			 name->fcap.fE, name->fcap_ver);
 }
 
 static inline int audit_copy_fcaps(struct audit_names *name,
-- 
cgit v1.3-14-g43fede


From 490194269665d6d4915a4a5774f002885c5a2d8f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 21 Apr 2017 15:35:26 -0700
Subject: module: Pass struct load_info into symbol checks

Since we're already using values from struct load_info, just pass this
pointer in directly and use what's needed as we need it. This allows us
to access future fields in struct load_info too.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jessica Yu <jeyu@redhat.com>
---
 kernel/module.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..ca4509b13400 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1278,12 +1278,13 @@ static u32 resolve_rel_crc(const s32 *crc)
 	return *(u32 *)((void *)crc + *crc);
 }
 
-static int check_version(Elf_Shdr *sechdrs,
-			 unsigned int versindex,
+static int check_version(const struct load_info *info,
 			 const char *symname,
 			 struct module *mod,
 			 const s32 *crc)
 {
+	Elf_Shdr *sechdrs = info->sechdrs;
+	unsigned int versindex = info->index.vers;
 	unsigned int i, num_versions;
 	struct modversion_info *versions;
 
@@ -1326,8 +1327,7 @@ bad_version:
 	return 0;
 }
 
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
-					  unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
 					  struct module *mod)
 {
 	const s32 *crc;
@@ -1343,8 +1343,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 		BUG();
 	}
 	preempt_enable();
-	return check_version(sechdrs, versindex,
-			     VMLINUX_SYMBOL_STR(module_layout), mod, crc);
+	return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
+			     mod, crc);
 }
 
 /* First part is kernel version, which we ignore if module has crcs. */
@@ -1358,8 +1358,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
 	return strcmp(amagic, bmagic) == 0;
 }
 #else
-static inline int check_version(Elf_Shdr *sechdrs,
-				unsigned int versindex,
+static inline int check_version(const struct load_info *info,
 				const char *symname,
 				struct module *mod,
 				const s32 *crc)
@@ -1367,8 +1366,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
 	return 1;
 }
 
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
-					  unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
 					  struct module *mod)
 {
 	return 1;
@@ -1404,7 +1402,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
 	if (!sym)
 		goto unlock;
 
-	if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
+	if (!check_version(info, name, mod, crc)) {
 		sym = ERR_PTR(-EINVAL);
 		goto getname;
 	}
@@ -2971,7 +2969,7 @@ static struct module *setup_load_info(struct load_info *info, int flags)
 	info->index.pcpu = find_pcpusec(info);
 
 	/* Check module struct version now, before we try to use module. */
-	if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+	if (!check_modstruct_version(info, mod))
 		return ERR_PTR(-ENOEXEC);
 
 	return mod;
-- 
cgit v1.3-14-g43fede


From 3e2e857f9c3a19d55ee0ba7b428b8be5008960bf Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 21 Apr 2017 15:35:27 -0700
Subject: module: Add module name to modinfo

Accessing the mod structure (e.g. for mod->name) prior to having completed
check_modstruct_version() can result in writing garbage to the error logs
if the layout of the mod structure loaded from disk doesn't match the
running kernel's mod structure layout. This kind of mismatch will become
much more likely if a kernel is built with different randomization seed
for the struct layout randomization plugin.

Instead, add and use a new modinfo string for logging the module name.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jessica Yu <jeyu@redhat.com>
---
 kernel/module.c       | 29 ++++++++++++++++++++++-------
 scripts/mod/modpost.c |  1 +
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ca4509b13400..3803449ca219 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -302,6 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL(unregister_module_notifier);
 
 struct load_info {
+	char *name;
 	Elf_Ehdr *hdr;
 	unsigned long len;
 	Elf_Shdr *sechdrs;
@@ -1318,12 +1319,12 @@ static int check_version(const struct load_info *info,
 	}
 
 	/* Broken toolchain. Warn once, then let it go.. */
-	pr_warn_once("%s: no symbol version for %s\n", mod->name, symname);
+	pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
 	return 1;
 
 bad_version:
 	pr_warn("%s: disagrees about version of symbol %s\n",
-	       mod->name, symname);
+	       info->name, symname);
 	return 0;
 }
 
@@ -2913,9 +2914,15 @@ static int rewrite_section_headers(struct load_info *info, int flags)
 		info->index.vers = 0; /* Pretend no __versions section! */
 	else
 		info->index.vers = find_sec(info, "__versions");
+	info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
 	info->index.info = find_sec(info, ".modinfo");
+	if (!info->index.info)
+		info->name = "(missing .modinfo section)";
+	else
+		info->name = get_modinfo(info, "name");
 	info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
-	info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
 	return 0;
 }
 
@@ -2955,14 +2962,22 @@ static struct module *setup_load_info(struct load_info *info, int flags)
 
 	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
 	if (!info->index.mod) {
-		pr_warn("No module found in object\n");
+		pr_warn("%s: No module found in object\n",
+			info->name ?: "(missing .modinfo name field)");
 		return ERR_PTR(-ENOEXEC);
 	}
 	/* This is temporary: point mod into copy of data. */
 	mod = (void *)info->sechdrs[info->index.mod].sh_addr;
 
+	/*
+	 * If we didn't load the .modinfo 'name' field, fall back to
+	 * on-disk struct mod 'name' field.
+	 */
+	if (!info->name)
+		info->name = mod->name;
+
 	if (info->index.sym == 0) {
-		pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
+		pr_warn("%s: module has no symbols (stripped?)\n", info->name);
 		return ERR_PTR(-ENOEXEC);
 	}
 
@@ -2990,7 +3005,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 			return err;
 	} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
 		pr_err("%s: version magic '%s' should be '%s'\n",
-		       mod->name, modmagic, vermagic);
+		       info->name, modmagic, vermagic);
 		return -ENOEXEC;
 	}
 
@@ -3270,7 +3285,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
 	if (IS_ERR(mod))
 		return mod;
 
-	if (blacklisted(mod->name))
+	if (blacklisted(info->name))
 		return ERR_PTR(-EPERM);
 
 	err = check_modinfo(mod, info, flags);
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 30d752a4a6a6..48397feb08fb 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -2126,6 +2126,7 @@ static void add_header(struct buffer *b, struct module *mod)
 	buf_printf(b, "#include <linux/compiler.h>\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
+	buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "__visible struct module __this_module\n");
 	buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n");
-- 
cgit v1.3-14-g43fede


From 45aea321678856687927c53972321ebfab77759a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 24 May 2017 08:52:02 +0200
Subject: sched/clock: Fix early boot preempt assumption in
 __set_sched_clock_stable()

The more strict early boot preemption warnings found that
__set_sched_clock_stable() was incorrectly assuming we'd still be
running on a single CPU:

  BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
  caller is debug_smp_processor_id+0x1c/0x1e
  CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2-00108-g1c3c5ea #1
  Call Trace:
   dump_stack+0x110/0x192
   check_preemption_disabled+0x10c/0x128
   ? set_debug_rodata+0x25/0x25
   debug_smp_processor_id+0x1c/0x1e
   sched_clock_init_late+0x27/0x87
  [...]

Fix it by disabling IRQs.

Reported-by: kernel test robot <xiaolong.ye@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: lkp@01.org
Cc: tipbuild@zytor.com
Link: http://lkml.kernel.org/r/20170524065202.v25vyu7pvba5mhpd@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 1a0d389d2f2b..ca0f8fc945c6 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -133,12 +133,19 @@ static void __scd_stamp(struct sched_clock_data *scd)
 
 static void __set_sched_clock_stable(void)
 {
-	struct sched_clock_data *scd = this_scd();
+	struct sched_clock_data *scd;
 
+	/*
+	 * Since we're still unstable and the tick is already running, we have
+	 * to disable IRQs in order to get a consistent scd->tick* reading.
+	 */
+	local_irq_disable();
+	scd = this_scd();
 	/*
 	 * Attempt to make the (initial) unstable->stable transition continuous.
 	 */
 	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
+	local_irq_enable();
 
 	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
 			scd->tick_gtod, __gtod_offset,
-- 
cgit v1.3-14-g43fede


From f36776fafbaa0094390dd4e7e3e29805e0b82730 Mon Sep 17 00:00:00 2001
From: Peter Rajnoha <prajnoha@redhat.com>
Date: Tue, 9 May 2017 15:22:30 +0200
Subject: kobject: support passing in variables for synthetic uevents

This patch makes it possible to pass additional arguments in addition
to uevent action name when writing /sys/.../uevent attribute. These
additional arguments are then inserted into generated synthetic uevent
as additional environment variables.

Before, we were not able to pass any additional uevent environment
variables for synthetic uevents. This made it hard to identify such uevents
properly in userspace to make proper distinction between genuine uevents
originating from kernel and synthetic uevents triggered from userspace.
Also, it was not possible to pass any additional information which would
make it possible to optimize and change the way the synthetic uevents are
processed back in userspace based on the originating environment of the
triggering action in userspace. With the extra additional variables, we are
able to pass through this extra information needed and also it makes it
possible to synchronize with such synthetic uevents as they can be clearly
identified back in userspace.

The format for writing the uevent attribute is as follows:

    ACTION [UUID [KEY=VALUE ...]]

There's no change in how "ACTION" is recognized - it stays the same
("add", "change", "remove"). The "ACTION" is the only argument required
to generate synthetic uevent, the rest of arguments, that this patch
adds support for, are optional.

The "UUID" is considered a transaction identifier, so it's possible to
use the same UUID value for one or more synthetic uevents in which case
we logically group these uevents together for any userspace listeners.
The "UUID" is expected to be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
format where "x" is a hex digit. The value appears in uevent as
"SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment variable.

The "KEY=VALUE" pairs can contain alphanumeric characters only. It's
possible to define zero or more pairs - each pair is then delimited
by a space character " ". Each pair appears in synthetic uevents as
"SYNTH_ARG_KEY=VALUE" environment variable. That means the KEY name gains
"SYNTH_ARG_" prefix to avoid possible collisions with existing variables.
To pass the "KEY=VALUE" pairs, it's also required to pass in the "UUID"
part for the synthetic uevent first.

If "UUID" is not passed in, the generated synthetic uevent gains
"SYNTH_UUID=0" environment variable automatically so it's possible to
identify this situation in userspace when reading generated uevent and so
we can still distinguish between genuine and synthetic uevents.

Signed-off-by: Peter Rajnoha <prajnoha@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-uevent |  47 ++++++++++
 drivers/base/bus.c                     |  10 +-
 drivers/base/core.c                    |   7 +-
 include/linux/kobject.h                |   4 +-
 kernel/module.c                        |   5 +-
 lib/kobject_uevent.c                   | 167 ++++++++++++++++++++++++++++++---
 6 files changed, 207 insertions(+), 33 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-uevent

(limited to 'kernel')

diff --git a/Documentation/ABI/testing/sysfs-uevent b/Documentation/ABI/testing/sysfs-uevent
new file mode 100644
index 000000000000..d7ac99072a16
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-uevent
@@ -0,0 +1,47 @@
+What:           /sys/.../uevent
+Date:           May 2017
+KernelVersion:  4.12
+Contact:        Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:
+                Enable passing additional variables for synthetic uevents that
+                are generated by writing /sys/.../uevent file.
+
+                Recognized extended format is ACTION [UUID [KEY=VALUE ...]].
+
+                The ACTION is compulsory - it is the name of the uevent action
+                ("add", "change", "remove"). There is no change compared to
+                previous functionality here. The rest of the extended format
+                is optional.
+
+                You need to pass UUID first before any KEY=VALUE pairs.
+                The UUID must be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+                format where 'x' is a hex digit. The UUID is considered to be
+                a transaction identifier so it's possible to use the same UUID
+                value for one or more synthetic uevents in which case we
+                logically group these uevents together for any userspace
+                listeners. The UUID value appears in uevent as
+                "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment
+                variable.
+
+                If UUID is not passed in, the generated synthetic uevent gains
+                "SYNTH_UUID=0" environment variable automatically.
+
+                The KEY=VALUE pairs can contain alphanumeric characters only.
+                It's possible to define zero or more pairs - each pair is then
+                delimited by a space character ' '. Each pair appears in
+                synthetic uevent as "SYNTH_ARG_KEY=VALUE". That means the KEY
+                name gains "SYNTH_ARG_" prefix to avoid possible collisions
+                with existing variables.
+
+                Example of valid sequence written to the uevent file:
+
+                    add fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed A=1 B=abc
+
+                This generates synthetic uevent including these variables:
+
+                    ACTION=add
+                    SYNTH_ARG_A=1
+                    SYNTH_ARG_B=abc
+                    SYNTH_UUID=fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed
+Users:
+                udev, userspace tools generating synthetic uevents
diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 6470eb8088f4..f945f2f0ee06 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -648,10 +648,7 @@ static void remove_probe_files(struct bus_type *bus)
 static ssize_t uevent_store(struct device_driver *drv, const char *buf,
 			    size_t count)
 {
-	enum kobject_action action;
-
-	if (kobject_action_type(buf, count, &action) == 0)
-		kobject_uevent(&drv->p->kobj, action);
+	kobject_synth_uevent(&drv->p->kobj, buf, count);
 	return count;
 }
 static DRIVER_ATTR_WO(uevent);
@@ -868,10 +865,7 @@ static void klist_devices_put(struct klist_node *n)
 static ssize_t bus_uevent_store(struct bus_type *bus,
 				const char *buf, size_t count)
 {
-	enum kobject_action action;
-
-	if (kobject_action_type(buf, count, &action) == 0)
-		kobject_uevent(&bus->p->subsys.kobj, action);
+	kobject_synth_uevent(&bus->p->subsys.kobj, buf, count);
 	return count;
 }
 static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index bbecaf9293be..6564339d7f59 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -981,12 +981,9 @@ out:
 static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
 			    const char *buf, size_t count)
 {
-	enum kobject_action action;
+	if (kobject_synth_uevent(&dev->kobj, buf, count))
+		dev_err(dev, "uevent: failed to send synthetic uevent\n");
 
-	if (kobject_action_type(buf, count, &action) == 0)
-		kobject_uevent(&dev->kobj, action);
-	else
-		dev_err(dev, "uevent: unknown action-string\n");
 	return count;
 }
 static DEVICE_ATTR_RW(uevent);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index ca85cb80e99a..eeab34b0f589 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -217,11 +217,9 @@ extern struct kobject *firmware_kobj;
 int kobject_uevent(struct kobject *kobj, enum kobject_action action);
 int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 			char *envp[]);
+int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);
 
 __printf(2, 3)
 int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);
 
-int kobject_action_type(const char *buf, size_t count,
-			enum kobject_action *type);
-
 #endif /* _KOBJECT_H_ */
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..d7eb41d772c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1202,10 +1202,7 @@ static ssize_t store_uevent(struct module_attribute *mattr,
 			    struct module_kobject *mk,
 			    const char *buffer, size_t count)
 {
-	enum kobject_action action;
-
-	if (kobject_action_type(buffer, count, &action) == 0)
-		kobject_uevent(&mk->kobj, action);
+	kobject_synth_uevent(&mk->kobj, buffer, count);
 	return count;
 }
 
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 9a2b811966eb..719c155fce20 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -23,6 +23,8 @@
 #include <linux/socket.h>
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
+#include <linux/uuid.h>
+#include <linux/ctype.h>
 #include <net/sock.h>
 #include <net/net_namespace.h>
 
@@ -52,19 +54,13 @@ static const char *kobject_actions[] = {
 	[KOBJ_OFFLINE] =	"offline",
 };
 
-/**
- * kobject_action_type - translate action string to numeric type
- *
- * @buf: buffer containing the action string, newline is ignored
- * @count: length of buffer
- * @type: pointer to the location to store the action type
- *
- * Returns 0 if the action string was recognized.
- */
-int kobject_action_type(const char *buf, size_t count,
-			enum kobject_action *type)
+static int kobject_action_type(const char *buf, size_t count,
+			       enum kobject_action *type,
+			       const char **args)
 {
 	enum kobject_action action;
+	size_t count_first;
+	const char *args_start;
 	int ret = -EINVAL;
 
 	if (count && (buf[count-1] == '\n' || buf[count-1] == '\0'))
@@ -73,11 +69,20 @@ int kobject_action_type(const char *buf, size_t count,
 	if (!count)
 		goto out;
 
+	args_start = strnchr(buf, count, ' ');
+	if (args_start) {
+		count_first = args_start - buf;
+		args_start = args_start + 1;
+	} else
+		count_first = count;
+
 	for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) {
-		if (strncmp(kobject_actions[action], buf, count) != 0)
+		if (strncmp(kobject_actions[action], buf, count_first) != 0)
 			continue;
-		if (kobject_actions[action][count] != '\0')
+		if (kobject_actions[action][count_first] != '\0')
 			continue;
+		if (args)
+			*args = args_start;
 		*type = action;
 		ret = 0;
 		break;
@@ -86,6 +91,142 @@ out:
 	return ret;
 }
 
+static const char *action_arg_word_end(const char *buf, const char *buf_end,
+				       char delim)
+{
+	const char *next = buf;
+
+	while (next <= buf_end && *next != delim)
+		if (!isalnum(*next++))
+			return NULL;
+
+	if (next == buf)
+		return NULL;
+
+	return next;
+}
+
+static int kobject_action_args(const char *buf, size_t count,
+			       struct kobj_uevent_env **ret_env)
+{
+	struct kobj_uevent_env *env = NULL;
+	const char *next, *buf_end, *key;
+	int key_len;
+	int r = -EINVAL;
+
+	if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0'))
+		count--;
+
+	if (!count)
+		return -EINVAL;
+
+	env = kzalloc(sizeof(*env), GFP_KERNEL);
+	if (!env)
+		return -ENOMEM;
+
+	/* first arg is UUID */
+	if (count < UUID_STRING_LEN || !uuid_is_valid(buf) ||
+	    add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf))
+		goto out;
+
+	/*
+	 * the rest are custom environment variables in KEY=VALUE
+	 * format with ' ' delimiter between each KEY=VALUE pair
+	 */
+	next = buf + UUID_STRING_LEN;
+	buf_end = buf + count - 1;
+
+	while (next <= buf_end) {
+		if (*next != ' ')
+			goto out;
+
+		/* skip the ' ', key must follow */
+		key = ++next;
+		if (key > buf_end)
+			goto out;
+
+		buf = next;
+		next = action_arg_word_end(buf, buf_end, '=');
+		if (!next || next > buf_end || *next != '=')
+			goto out;
+		key_len = next - buf;
+
+		/* skip the '=', value must follow */
+		if (++next > buf_end)
+			goto out;
+
+		buf = next;
+		next = action_arg_word_end(buf, buf_end, ' ');
+		if (!next)
+			goto out;
+
+		if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s",
+				   key_len, key, (int) (next - buf), buf))
+			goto out;
+	}
+
+	r = 0;
+out:
+	if (r)
+		kfree(env);
+	else
+		*ret_env = env;
+	return r;
+}
+
+/**
+ * kobject_synth_uevent - send synthetic uevent with arguments
+ *
+ * @kobj: struct kobject for which synthetic uevent is to be generated
+ * @buf: buffer containing action type and action args, newline is ignored
+ * @count: length of buffer
+ *
+ * Returns 0 if kobject_synth_uevent() is completed with success or the
+ * corresponding error when it fails.
+ */
+int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count)
+{
+	char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL };
+	enum kobject_action action;
+	const char *action_args;
+	struct kobj_uevent_env *env;
+	const char *msg = NULL, *devpath;
+	int r;
+
+	r = kobject_action_type(buf, count, &action, &action_args);
+	if (r) {
+		msg = "unknown uevent action string\n";
+		goto out;
+	}
+
+	if (!action_args) {
+		r = kobject_uevent_env(kobj, action, no_uuid_envp);
+		goto out;
+	}
+
+	r = kobject_action_args(action_args,
+				count - (action_args - buf), &env);
+	if (r == -EINVAL) {
+		msg = "incorrect uevent action arguments\n";
+		goto out;
+	}
+
+	if (r)
+		goto out;
+
+	r = kobject_uevent_env(kobj, action, env->envp);
+	kfree(env);
+out:
+	if (r) {
+		devpath = kobject_get_path(kobj, GFP_KERNEL);
+		printk(KERN_WARNING "synth uevent: %s: %s",
+		       devpath ?: "unknown device",
+		       msg ?: "failed to send uevent");
+		kfree(devpath);
+	}
+	return r;
+}
+
 #ifdef CONFIG_NET
 static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
 {
-- 
cgit v1.3-14-g43fede


From 9ab6055f959032258c0f83a070cd0d26ed7a8fc5 Mon Sep 17 00:00:00 2001
From: Babu Moger <babu.moger@oracle.com>
Date: Wed, 24 May 2017 17:55:10 -0600
Subject: kernel/locking: Fix compile error with qrwlock.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Saw these compile errors on SPARC when queued rwlock feature is enabled.

 CC      kernel/locking/qrwlock.o
kernel/locking/qrwlock.c: In function ‘queued_read_lock_slowpath’:
kernel/locking/qrwlock.c:89: error: implicit declaration of function ‘arch_spin_lock’
kernel/locking/qrwlock.c:102: error: implicit declaration of function ‘arch_spin_unlock’
make[4]: *** [kernel/locking/qrwlock.o] Error 1

Include spinlock.h in qrwlock.c to fix it.

Signed-off-by: Babu Moger <babu.moger@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Vijay Kumar <vijay.ac.kumar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/locking/qrwlock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
 /*
-- 
cgit v1.3-14-g43fede


From 8f553c498e1772cccb39a114da4a498d22992758 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:12 +0200
Subject: cpu/hotplug: Provide cpus_read|write_[un]lock()

The counting 'rwsem' hackery of get|put_online_cpus() is going to be
replaced by percpu rwsem.

Rename the functions to make it clear that it's locking and not some
refcount style interface. These new functions will be used for the
preparatory patches which make the code ready for the percpu rwsem
conversion.

Rename all instances in the cpu hotplug code while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081547.080397752@linutronix.de
---
 include/linux/cpu.h | 34 +++++++++++++++++++---------------
 kernel/cpu.c        | 36 ++++++++++++++++++------------------
 2 files changed, 37 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f92081234afd..055876003914 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -99,26 +99,30 @@ static inline void cpu_maps_update_done(void)
 extern struct bus_type cpu_subsys;
 
 #ifdef CONFIG_HOTPLUG_CPU
-/* Stop CPUs going up and down. */
-
-extern void cpu_hotplug_begin(void);
-extern void cpu_hotplug_done(void);
-extern void get_online_cpus(void);
-extern void put_online_cpus(void);
+extern void cpus_write_lock(void);
+extern void cpus_write_unlock(void);
+extern void cpus_read_lock(void);
+extern void cpus_read_unlock(void);
 extern void cpu_hotplug_disable(void);
 extern void cpu_hotplug_enable(void);
 void clear_tasks_mm_cpumask(int cpu);
 int cpu_down(unsigned int cpu);
 
-#else		/* CONFIG_HOTPLUG_CPU */
-
-static inline void cpu_hotplug_begin(void) {}
-static inline void cpu_hotplug_done(void) {}
-#define get_online_cpus()	do { } while (0)
-#define put_online_cpus()	do { } while (0)
-#define cpu_hotplug_disable()	do { } while (0)
-#define cpu_hotplug_enable()	do { } while (0)
-#endif		/* CONFIG_HOTPLUG_CPU */
+#else /* CONFIG_HOTPLUG_CPU */
+
+static inline void cpus_write_lock(void) { }
+static inline void cpus_write_unlock(void) { }
+static inline void cpus_read_lock(void) { }
+static inline void cpus_read_unlock(void) { }
+static inline void cpu_hotplug_disable(void) { }
+static inline void cpu_hotplug_enable(void) { }
+#endif	/* !CONFIG_HOTPLUG_CPU */
+
+/* Wrappers which go away once all code is converted */
+static inline void cpu_hotplug_begin(void) { cpus_write_lock(); }
+static inline void cpu_hotplug_done(void) { cpus_write_unlock(); }
+static inline void get_online_cpus(void) { cpus_read_lock(); }
+static inline void put_online_cpus(void) { cpus_read_unlock(); }
 
 #ifdef CONFIG_PM_SLEEP_SMP
 extern int freeze_secondary_cpus(int primary);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9ae6fbe5b5cf..d3221ae5b474 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,7 +235,7 @@ static struct {
 #define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
 
 
-void get_online_cpus(void)
+void cpus_read_lock(void)
 {
 	might_sleep();
 	if (cpu_hotplug.active_writer == current)
@@ -245,9 +245,9 @@ void get_online_cpus(void)
 	atomic_inc(&cpu_hotplug.refcount);
 	mutex_unlock(&cpu_hotplug.lock);
 }
-EXPORT_SYMBOL_GPL(get_online_cpus);
+EXPORT_SYMBOL_GPL(cpus_read_lock);
 
-void put_online_cpus(void)
+void cpus_read_unlock(void)
 {
 	int refcount;
 
@@ -264,7 +264,7 @@ void put_online_cpus(void)
 	cpuhp_lock_release();
 
 }
-EXPORT_SYMBOL_GPL(put_online_cpus);
+EXPORT_SYMBOL_GPL(cpus_read_unlock);
 
 /*
  * This ensures that the hotplug operation can begin only when the
@@ -288,7 +288,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
  * get_online_cpus() not an api which is called all that often.
  *
  */
-void cpu_hotplug_begin(void)
+void cpus_write_lock(void)
 {
 	DEFINE_WAIT(wait);
 
@@ -306,7 +306,7 @@ void cpu_hotplug_begin(void)
 	finish_wait(&cpu_hotplug.wq, &wait);
 }
 
-void cpu_hotplug_done(void)
+void cpus_write_unlock(void)
 {
 	cpu_hotplug.active_writer = NULL;
 	mutex_unlock(&cpu_hotplug.lock);
@@ -773,7 +773,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	if (!cpu_present(cpu))
 		return -EINVAL;
 
-	cpu_hotplug_begin();
+	cpus_write_lock();
 
 	cpuhp_tasks_frozen = tasks_frozen;
 
@@ -811,7 +811,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	}
 
 out:
-	cpu_hotplug_done();
+	cpus_write_unlock();
 	return ret;
 }
 
@@ -893,7 +893,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 	struct task_struct *idle;
 	int ret = 0;
 
-	cpu_hotplug_begin();
+	cpus_write_lock();
 
 	if (!cpu_present(cpu)) {
 		ret = -EINVAL;
@@ -941,7 +941,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 	target = min((int)target, CPUHP_BRINGUP_CPU);
 	ret = cpuhp_up_callbacks(cpu, st, target);
 out:
-	cpu_hotplug_done();
+	cpus_write_unlock();
 	return ret;
 }
 
@@ -1424,7 +1424,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
 	if (sp->multi_instance == false)
 		return -EINVAL;
 
-	get_online_cpus();
+	cpus_read_lock();
 	mutex_lock(&cpuhp_state_mutex);
 
 	if (!invoke || !sp->startup.multi)
@@ -1453,7 +1453,7 @@ add_node:
 	hlist_add_head(node, &sp->list);
 unlock:
 	mutex_unlock(&cpuhp_state_mutex);
-	put_online_cpus();
+	cpus_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
@@ -1486,7 +1486,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
 	if (cpuhp_cb_check(state) || !name)
 		return -EINVAL;
 
-	get_online_cpus();
+	cpus_read_lock();
 	mutex_lock(&cpuhp_state_mutex);
 
 	ret = cpuhp_store_callbacks(state, name, startup, teardown,
@@ -1522,7 +1522,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
 	}
 out:
 	mutex_unlock(&cpuhp_state_mutex);
-	put_online_cpus();
+	cpus_read_unlock();
 	/*
 	 * If the requested state is CPUHP_AP_ONLINE_DYN, return the
 	 * dynamically allocated state in case of success.
@@ -1544,7 +1544,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
 	if (!sp->multi_instance)
 		return -EINVAL;
 
-	get_online_cpus();
+	cpus_read_lock();
 	mutex_lock(&cpuhp_state_mutex);
 
 	if (!invoke || !cpuhp_get_teardown_cb(state))
@@ -1565,7 +1565,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
 remove:
 	hlist_del(node);
 	mutex_unlock(&cpuhp_state_mutex);
-	put_online_cpus();
+	cpus_read_unlock();
 
 	return 0;
 }
@@ -1587,7 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
 
 	BUG_ON(cpuhp_cb_check(state));
 
-	get_online_cpus();
+	cpus_read_lock();
 
 	mutex_lock(&cpuhp_state_mutex);
 	if (sp->multi_instance) {
@@ -1615,7 +1615,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
 remove:
 	cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
 	mutex_unlock(&cpuhp_state_mutex);
-	put_online_cpus();
+	cpus_read_unlock();
 }
 EXPORT_SYMBOL(__cpuhp_remove_state);
 
-- 
cgit v1.3-14-g43fede


From 71def423fe3da0d40ad3427a4cd5f9edc53bff67 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 24 May 2017 10:15:14 +0200
Subject: cpu/hotplug: Provide cpuhp_setup/remove_state[_nocalls]_cpuslocked()

Some call sites of cpuhp_setup/remove_state[_nocalls]() are within a
cpus_read locked region.

cpuhp_setup/remove_state[_nocalls]() call cpus_read_lock() as well, which
is possible in the current implementation but prevents converting the
hotplug locking to a percpu rwsem.

Provide locked versions of the interfaces to avoid nested calls to
cpus_read_lock().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081547.239600868@linutronix.de
---
 include/linux/cpuhotplug.h | 29 ++++++++++++++++++++++++++++
 kernel/cpu.c               | 47 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 65 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0f2a80377520..4fac564dde70 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state,	const char *name, bool invoke,
 			int (*startup)(unsigned int cpu),
 			int (*teardown)(unsigned int cpu), bool multi_instance);
 
+int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name,
+				   bool invoke,
+				   int (*startup)(unsigned int cpu),
+				   int (*teardown)(unsigned int cpu),
+				   bool multi_instance);
 /**
  * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks
  * @state:	The state for which the calls are installed
@@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state,
 	return __cpuhp_setup_state(state, name, true, startup, teardown, false);
 }
 
+static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+					       const char *name,
+					       int (*startup)(unsigned int cpu),
+					       int (*teardown)(unsigned int cpu))
+{
+	return __cpuhp_setup_state_cpuslocked(state, name, true, startup,
+					      teardown, false);
+}
+
 /**
  * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the
  *			       callbacks
@@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state,
 				   false);
 }
 
+static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state,
+						     const char *name,
+						     int (*startup)(unsigned int cpu),
+						     int (*teardown)(unsigned int cpu))
+{
+	return __cpuhp_setup_state_cpuslocked(state, name, false, startup,
+					    teardown, false);
+}
+
 /**
  * cpuhp_setup_state_multi - Add callbacks for multi state
  * @state:	The state for which the calls are installed
@@ -250,6 +273,7 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 }
 
 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
+void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
 
 /**
  * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown
@@ -273,6 +297,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state)
 	__cpuhp_remove_state(state, false);
 }
 
+static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state)
+{
+	__cpuhp_remove_state_cpuslocked(state, false);
+}
+
 /**
  * cpuhp_remove_multi_state - Remove hotplug multi state callback
  * @state:	The state for which the calls are removed
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d3221ae5b474..dc27c5a28153 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1459,7 +1459,7 @@ unlock:
 EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
 
 /**
- * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state
+ * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
  * @state:		The state to setup
  * @invoke:		If true, the startup function is invoked for cpus where
  *			cpu state >= @state
@@ -1468,25 +1468,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
  * @multi_instance:	State is set up for multiple instances which get
  *			added afterwards.
  *
+ * The caller needs to hold cpus read locked while calling this function.
  * Returns:
  *   On success:
  *      Positive state number if @state is CPUHP_AP_ONLINE_DYN
  *      0 for all other states
  *   On failure: proper (negative) error code
  */
-int __cpuhp_setup_state(enum cpuhp_state state,
-			const char *name, bool invoke,
-			int (*startup)(unsigned int cpu),
-			int (*teardown)(unsigned int cpu),
-			bool multi_instance)
+int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
+				   const char *name, bool invoke,
+				   int (*startup)(unsigned int cpu),
+				   int (*teardown)(unsigned int cpu),
+				   bool multi_instance)
 {
 	int cpu, ret = 0;
 	bool dynstate;
 
+	lockdep_assert_cpus_held();
+
 	if (cpuhp_cb_check(state) || !name)
 		return -EINVAL;
 
-	cpus_read_lock();
 	mutex_lock(&cpuhp_state_mutex);
 
 	ret = cpuhp_store_callbacks(state, name, startup, teardown,
@@ -1522,7 +1524,6 @@ int __cpuhp_setup_state(enum cpuhp_state state,
 	}
 out:
 	mutex_unlock(&cpuhp_state_mutex);
-	cpus_read_unlock();
 	/*
 	 * If the requested state is CPUHP_AP_ONLINE_DYN, return the
 	 * dynamically allocated state in case of success.
@@ -1531,6 +1532,22 @@ out:
 		return state;
 	return ret;
 }
+EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);
+
+int __cpuhp_setup_state(enum cpuhp_state state,
+			const char *name, bool invoke,
+			int (*startup)(unsigned int cpu),
+			int (*teardown)(unsigned int cpu),
+			bool multi_instance)
+{
+	int ret;
+
+	cpus_read_lock();
+	ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
+					     teardown, multi_instance);
+	cpus_read_unlock();
+	return ret;
+}
 EXPORT_SYMBOL(__cpuhp_setup_state);
 
 int __cpuhp_state_remove_instance(enum cpuhp_state state,
@@ -1572,22 +1589,23 @@ remove:
 EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
 
 /**
- * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
+ * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
  * @state:	The state to remove
  * @invoke:	If true, the teardown function is invoked for cpus where
  *		cpu state >= @state
  *
+ * The caller needs to hold cpus read locked while calling this function.
  * The teardown callback is currently not allowed to fail. Think
  * about module removal!
  */
-void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
 {
 	struct cpuhp_step *sp = cpuhp_get_step(state);
 	int cpu;
 
 	BUG_ON(cpuhp_cb_check(state));
 
-	cpus_read_lock();
+	lockdep_assert_cpus_held();
 
 	mutex_lock(&cpuhp_state_mutex);
 	if (sp->multi_instance) {
@@ -1615,6 +1633,13 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
 remove:
 	cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
 	mutex_unlock(&cpuhp_state_mutex);
+}
+EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);
+
+void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
+{
+	cpus_read_lock();
+	__cpuhp_remove_state_cpuslocked(state, invoke);
 	cpus_read_unlock();
 }
 EXPORT_SYMBOL(__cpuhp_remove_state);
-- 
cgit v1.3-14-g43fede


From 9805c6733349ea3ccd22cf75b8ebaabb5290e310 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:15 +0200
Subject: cpu/hotplug: Add __cpuhp_state_add_instance_cpuslocked()

Add cpuslocked() variants for the multi instance registration so this can
be called from a cpus_read_lock() protected region.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081547.321782217@linutronix.de
---
 include/linux/cpuhotplug.h |  9 +++++++++
 kernel/cpu.c               | 18 +++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4fac564dde70..df3d2719a796 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -240,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state,
 
 int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
 			       bool invoke);
+int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+					  struct hlist_node *node, bool invoke);
 
 /**
  * cpuhp_state_add_instance - Add an instance for a state and invoke startup
@@ -272,6 +274,13 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state,
 	return __cpuhp_state_add_instance(state, node, false);
 }
 
+static inline int
+cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state,
+					    struct hlist_node *node)
+{
+	return __cpuhp_state_add_instance_cpuslocked(state, node, false);
+}
+
 void __cpuhp_remove_state(enum cpuhp_state state, bool invoke);
 void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke);
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index dc27c5a28153..e4389ac55b65 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1413,18 +1413,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
 	}
 }
 
-int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
-			       bool invoke)
+int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
+					  struct hlist_node *node,
+					  bool invoke)
 {
 	struct cpuhp_step *sp;
 	int cpu;
 	int ret;
 
+	lockdep_assert_cpus_held();
+
 	sp = cpuhp_get_step(state);
 	if (sp->multi_instance == false)
 		return -EINVAL;
 
-	cpus_read_lock();
 	mutex_lock(&cpuhp_state_mutex);
 
 	if (!invoke || !sp->startup.multi)
@@ -1453,6 +1455,16 @@ add_node:
 	hlist_add_head(node, &sp->list);
 unlock:
 	mutex_unlock(&cpuhp_state_mutex);
+	return ret;
+}
+
+int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
+			       bool invoke)
+{
+	int ret;
+
+	cpus_read_lock();
+	ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
 	cpus_read_unlock();
 	return ret;
 }
-- 
cgit v1.3-14-g43fede


From fe5595c074005bd94f0c7d1644175941149f6768 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 24 May 2017 10:15:16 +0200
Subject: stop_machine: Provide stop_machine_cpuslocked()

Some call sites of stop_machine() are within a get_online_cpus() protected
region.

stop_machine() calls get_online_cpus() as well, which is possible in the
current implementation but prevents converting the hotplug locking to a
percpu rwsem.

Provide stop_machine_cpuslocked() to avoid nested calls to get_online_cpus().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081547.400700852@linutronix.de
---
 include/linux/stop_machine.h | 26 +++++++++++++++++++++++---
 kernel/stop_machine.c        | 11 +++++++----
 2 files changed, 30 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3cc9632dcc2a..3d60275e3ba9 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
  * @fn() runs.
  *
  * This can be thought of as a very heavy write lock, equivalent to
- * grabbing every spinlock in the kernel. */
+ * grabbing every spinlock in the kernel.
+ *
+ * Protects against CPU hotplug.
+ */
 int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
 
+/**
+ * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ *
+ * Same as above. Must be called from within a cpus_read_lock() protected
+ * region. Avoids nested calls to cpus_read_lock().
+ */
+int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
+
 int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 				   const struct cpumask *cpus);
 #else	/* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
 
-static inline int stop_machine(cpu_stop_fn_t fn, void *data,
-				 const struct cpumask *cpus)
+static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+					  const struct cpumask *cpus)
 {
 	unsigned long flags;
 	int ret;
@@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data,
 	return ret;
 }
 
+static inline int stop_machine(cpu_stop_fn_t fn, void *data,
+			       const struct cpumask *cpus)
+{
+	return stop_machine_cpuslocked(fn, data, cpus);
+}
+
 static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
 						 const struct cpumask *cpus)
 {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 1eb82661ecdb..b7591261652d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -552,7 +552,8 @@ static int __init cpu_stop_init(void)
 }
 early_initcall(cpu_stop_init);
 
-static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
+int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+			    const struct cpumask *cpus)
 {
 	struct multi_stop_data msdata = {
 		.fn = fn,
@@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp
 		.active_cpus = cpus,
 	};
 
+	lockdep_assert_cpus_held();
+
 	if (!stop_machine_initialized) {
 		/*
 		 * Handle the case where stop_machine() is called
@@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
 	int ret;
 
 	/* No CPUs can come up or down during this. */
-	get_online_cpus();
-	ret = __stop_machine(fn, data, cpus);
-	put_online_cpus();
+	cpus_read_lock();
+	ret = stop_machine_cpuslocked(fn, data, cpus);
+	cpus_read_unlock();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
-- 
cgit v1.3-14-g43fede


From 9596695ee1e7eedd743c43811fe68299eb005b5c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:17 +0200
Subject: padata: Make padata_alloc() static

No users outside of padata.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-crypto@vger.kernel.org
Link: http://lkml.kernel.org/r/20170524081547.491457256@linutronix.de
---
 include/linux/padata.h |  3 ---
 kernel/padata.c        | 32 ++++++++++++++++----------------
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 0f9e567d5e15..2f9c1f93b1ce 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -166,9 +166,6 @@ struct padata_instance {
 
 extern struct padata_instance *padata_alloc_possible(
 					struct workqueue_struct *wq);
-extern struct padata_instance *padata_alloc(struct workqueue_struct *wq,
-					    const struct cpumask *pcpumask,
-					    const struct cpumask *cbcpumask);
 extern void padata_free(struct padata_instance *pinst);
 extern int padata_do_parallel(struct padata_instance *pinst,
 			      struct padata_priv *padata, int cb_cpu);
diff --git a/kernel/padata.c b/kernel/padata.c
index ac8f1e524836..0c708f648853 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -933,19 +933,6 @@ static struct kobj_type padata_attr_type = {
 	.release = padata_sysfs_release,
 };
 
-/**
- * padata_alloc_possible - Allocate and initialize padata instance.
- *                         Use the cpu_possible_mask for serial and
- *                         parallel workers.
- *
- * @wq: workqueue to use for the allocated padata instance
- */
-struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
-{
-	return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
-}
-EXPORT_SYMBOL(padata_alloc_possible);
-
 /**
  * padata_alloc - allocate and initialize a padata instance and specify
  *                cpumasks for serial and parallel workers.
@@ -954,9 +941,9 @@ EXPORT_SYMBOL(padata_alloc_possible);
  * @pcpumask: cpumask that will be used for padata parallelization
  * @cbcpumask: cpumask that will be used for padata serialization
  */
-struct padata_instance *padata_alloc(struct workqueue_struct *wq,
-				     const struct cpumask *pcpumask,
-				     const struct cpumask *cbcpumask)
+static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+					    const struct cpumask *pcpumask,
+					    const struct cpumask *cbcpumask)
 {
 	struct padata_instance *pinst;
 	struct parallel_data *pd = NULL;
@@ -1010,6 +997,19 @@ err:
 	return NULL;
 }
 
+/**
+ * padata_alloc_possible - Allocate and initialize padata instance.
+ *                         Use the cpu_possible_mask for serial and
+ *                         parallel workers.
+ *
+ * @wq: workqueue to use for the allocated padata instance
+ */
+struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+{
+	return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+}
+EXPORT_SYMBOL(padata_alloc_possible);
+
 /**
  * padata_free - free a padata instance
  *
-- 
cgit v1.3-14-g43fede


From c5a81c8ff816d89941fe86961b286765d6ca2f5f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 24 May 2017 10:15:18 +0200
Subject: padata: Avoid nested calls to cpus_read_lock() in
 pcrypt_init_padata()

pcrypt_init_padata()
   cpus_read_lock()
   padata_alloc_possible()
     padata_alloc()
       cpus_read_lock()

The nested call to cpus_read_lock() works with the current implementation,
but prevents the conversion to a percpu rwsem.

The other caller of padata_alloc_possible() is pcrypt_init_padata() which
calls from a cpus_read_lock() protected region as well.

Remove the cpus_read_lock() call in padata_alloc() and document the
calling convention.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-crypto@vger.kernel.org
Link: http://lkml.kernel.org/r/20170524081547.571278910@linutronix.de
---
 kernel/padata.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 0c708f648853..868f947166d7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -940,6 +940,8 @@ static struct kobj_type padata_attr_type = {
  * @wq: workqueue to use for the allocated padata instance
  * @pcpumask: cpumask that will be used for padata parallelization
  * @cbcpumask: cpumask that will be used for padata serialization
+ *
+ * Must be called from a cpus_read_lock() protected region
  */
 static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
 					    const struct cpumask *pcpumask,
@@ -952,7 +954,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
 	if (!pinst)
 		goto err;
 
-	get_online_cpus();
 	if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
 		goto err_free_inst;
 	if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
@@ -976,14 +977,12 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
 
 	pinst->flags = 0;
 
-	put_online_cpus();
-
 	BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
 	kobject_init(&pinst->kobj, &padata_attr_type);
 	mutex_init(&pinst->lock);
 
 #ifdef CONFIG_HOTPLUG_CPU
-	cpuhp_state_add_instance_nocalls(hp_online, &pinst->node);
+	cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
 #endif
 	return pinst;
 
@@ -992,7 +991,6 @@ err_free_masks:
 	free_cpumask_var(pinst->cpumask.cbcpu);
 err_free_inst:
 	kfree(pinst);
-	put_online_cpus();
 err:
 	return NULL;
 }
@@ -1003,9 +1001,12 @@ err:
  *                         parallel workers.
  *
  * @wq: workqueue to use for the allocated padata instance
+ *
+ * Must be called from a cpus_read_lock() protected region
  */
 struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
 {
+	lockdep_assert_cpus_held();
 	return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
 }
 EXPORT_SYMBOL(padata_alloc_possible);
-- 
cgit v1.3-14-g43fede


From 210e21331fc3a396af640cec652be769d146e49f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 24 May 2017 10:15:28 +0200
Subject: cpu/hotplug: Use stop_machine_cpuslocked() in takedown_cpu()

takedown_cpu() is a cpu hotplug function invoking stop_machine(). The cpu
hotplug machinery holds the hotplug lock for write.

stop_machine() invokes get_online_cpus() as well. This is correct, but
prevents the conversion of the hotplug locking to a percpu rwsem.

Use stop_machine_cpuslocked() to avoid the nested call.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081548.423292433@linutronix.de
---
 kernel/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index e4389ac55b65..142d889d9f69 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -701,7 +701,7 @@ static int takedown_cpu(unsigned int cpu)
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
 	 */
-	err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
+	err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
 	if (err) {
 		/* CPU refused to die */
 		irq_unlock_sparse();
-- 
cgit v1.3-14-g43fede


From a63fbed776c7124ce9f606234267c3c095b2680e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:34 +0200
Subject: perf/tracing/cpuhotplug: Fix locking order

perf, tracing, kprobes and jump_labels have a gazillion of ways to create
dependency lock chains. Some of those involve nested invocations of
get_online_cpus().

The conversion of the hotplug locking to a percpu rwsem requires avoiding
such nested calls. sys_perf_event_open() protects most of the syscall logic
against cpu hotplug. This causes nested calls and lock inversions versus
ftrace and kprobes in various interesting ways.

It's impossible to move the hotplug locking to the outer end of all call
chains in the involved facilities, so the hotplug protection in
sys_perf_event_open() needs to be solved differently.

Introduce a perf private online cpumask which is protected by 'pmus_lock'. This
mutex is taken when the mask is updated in the cpu hotplug callbacks and
can be taken in sys_perf_event_open() to protect the swhash setup/teardown
code and when the final judgement about a valid event has to be made.

[ tglx: Produced changelog and fixed the swhash interaction ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Link: http://lkml.kernel.org/r/20170524081548.930941109@linutronix.de
---
 include/linux/perf_event.h |   2 +
 kernel/events/core.c       | 106 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 78 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..7d6aa29094b2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -801,6 +801,8 @@ struct perf_cpu_context {
 
 	struct list_head		sched_cb_entry;
 	int				sched_cb_usage;
+
+	int				online;
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..b97cda4d1777 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly;
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
+static cpumask_var_t perf_online_mask;
 
 /*
  * perf event paranoia level:
@@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
-		/*
-		 * We could be clever and allow to attach a event to an
-		 * offline CPU and activate it when the CPU comes up, but
-		 * that's for later.
-		 */
-		if (!cpu_online(cpu))
-			return ERR_PTR(-ENODEV);
-
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 		get_ctx(ctx);
@@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu)
 	int err = 0;
 
 	mutex_lock(&swhash->hlist_mutex);
-	if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
+	if (!swevent_hlist_deref(swhash) &&
+	    cpumask_test_cpu(cpu, perf_online_mask)) {
 		struct swevent_hlist *hlist;
 
 		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
@@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void)
 {
 	int err, cpu, failed_cpu;
 
-	get_online_cpus();
+	mutex_lock(&pmus_lock);
 	for_each_possible_cpu(cpu) {
 		err = swevent_hlist_get_cpu(cpu);
 		if (err) {
@@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void)
 			goto fail;
 		}
 	}
-	put_online_cpus();
-
+	mutex_unlock(&pmus_lock);
 	return 0;
 fail:
 	for_each_possible_cpu(cpu) {
@@ -7741,8 +7734,7 @@ fail:
 			break;
 		swevent_hlist_put_cpu(cpu);
 	}
-
-	put_online_cpus();
+	mutex_unlock(&pmus_lock);
 	return err;
 }
 
@@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
 	pmu->hrtimer_interval_ms = timer;
 
 	/* update all cpuctx for this PMU */
-	get_online_cpus();
+	cpus_read_lock();
 	for_each_online_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
 		cpu_function_call(cpu,
 			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
 	}
-	put_online_cpus();
+	cpus_read_unlock();
 	mutex_unlock(&mux_interval_mutex);
 
 	return count;
@@ -9059,6 +9051,7 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
+		cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
 
 		__perf_mux_hrtimer_init(cpuctx, cpu);
 	}
@@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
-	get_online_cpus();
-
 	if (task) {
 		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 		if (err)
-			goto err_cpus;
+			goto err_cred;
 
 		/*
 		 * Reuse ptrace permission checks for now.
@@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_locked;
 	}
 
+	if (!task) {
+		/*
+		 * Check if the @cpu we're creating an event for is online.
+		 *
+		 * We use the perf_cpu_context::ctx::mutex to serialize against
+		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+		 */
+		struct perf_cpu_context *cpuctx =
+			container_of(ctx, struct perf_cpu_context, ctx);
+
+		if (!cpuctx->online) {
+			err = -ENODEV;
+			goto err_locked;
+		}
+	}
+
+
 	/*
 	 * Must be under the same ctx::mutex as perf_install_in_context(),
 	 * because we need to serialize with concurrent event creation.
@@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_task_struct(task);
 	}
 
-	put_online_cpus();
-
 	mutex_lock(&current->perf_event_mutex);
 	list_add_tail(&event->owner_entry, &current->perf_event_list);
 	mutex_unlock(&current->perf_event_mutex);
@@ -10197,8 +10203,6 @@ err_alloc:
 err_cred:
 	if (task)
 		mutex_unlock(&task->signal->cred_guard_mutex);
-err_cpus:
-	put_online_cpus();
 err_task:
 	if (task)
 		put_task_struct(task);
@@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 		goto err_unlock;
 	}
 
+	if (!task) {
+		/*
+		 * Check if the @cpu we're creating an event for is online.
+		 *
+		 * We use the perf_cpu_context::ctx::mutex to serialize against
+		 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+		 */
+		struct perf_cpu_context *cpuctx =
+			container_of(ctx, struct perf_cpu_context, ctx);
+		if (!cpuctx->online) {
+			err = -ENODEV;
+			goto err_unlock;
+		}
+	}
+
 	if (!exclusive_event_installable(event, ctx)) {
 		err = -EBUSY;
 		goto err_unlock;
@@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void)
 	struct swevent_htable *swhash;
 	int cpu;
 
+	zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+
 	for_each_possible_cpu(cpu) {
 		swhash = &per_cpu(swevent_htable, cpu);
 		mutex_init(&swhash->hlist_mutex);
@@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void)
 	}
 }
 
-int perf_event_init_cpu(unsigned int cpu)
+void perf_swevent_init_cpu(unsigned int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu)
 		rcu_assign_pointer(swhash->swevent_hlist, hlist);
 	}
 	mutex_unlock(&swhash->hlist_mutex);
-	return 0;
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info)
 
 static void perf_event_exit_cpu_context(int cpu)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
 	struct pmu *pmu;
-	int idx;
 
-	idx = srcu_read_lock(&pmus_srcu);
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+	mutex_lock(&pmus_lock);
+	list_for_each_entry(pmu, &pmus, entry) {
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
 
 		mutex_lock(&ctx->mutex);
 		smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+		cpuctx->online = 0;
 		mutex_unlock(&ctx->mutex);
 	}
-	srcu_read_unlock(&pmus_srcu, idx);
+	cpumask_clear_cpu(cpu, perf_online_mask);
+	mutex_unlock(&pmus_lock);
 }
 #else
 
@@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { }
 
 #endif
 
+int perf_event_init_cpu(unsigned int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
+	struct pmu *pmu;
+
+	perf_swevent_init_cpu(cpu);
+
+	mutex_lock(&pmus_lock);
+	cpumask_set_cpu(cpu, perf_online_mask);
+	list_for_each_entry(pmu, &pmus, entry) {
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+
+		mutex_lock(&ctx->mutex);
+		cpuctx->online = 1;
+		mutex_unlock(&ctx->mutex);
+	}
+	mutex_unlock(&pmus_lock);
+
+	return 0;
+}
+
 int perf_event_exit_cpu(unsigned int cpu)
 {
 	perf_event_exit_cpu_context(cpu);
-- 
cgit v1.3-14-g43fede


From f2545b2d4ce13e068897ef60ae64dffe215f4152 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:35 +0200
Subject: jump_label: Reorder hotplug lock and jump_label_lock

The conversion of the hotplug locking to a percpu rwsem unearthed lock
ordering issues all over the place.

The jump_label code has two issues:

 1) Nested get_online_cpus() invocations

 2) Ordering problems vs. the cpus rwsem and the jump_label_mutex

To cure these, the following lock order has been established:

   cpus_rwsem -> jump_label_lock -> text_mutex

Even if not all architectures need protection against CPU hotplug, taking
cpus_rwsem before jump_label_lock is now mandatory in code paths which
actually modify code and therefore need text_mutex protection.

Move the get_online_cpus() invocations into the core jump label code and
establish the proper lock order where required.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Link: http://lkml.kernel.org/r/20170524081549.025830817@linutronix.de
---
 arch/mips/kernel/jump_label.c  |  2 --
 arch/sparc/kernel/jump_label.c |  2 --
 arch/tile/kernel/jump_label.c  |  2 --
 arch/x86/kernel/jump_label.c   |  2 --
 kernel/jump_label.c            | 20 ++++++++++++++------
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index 3e586daa3a32..32e3168316cd 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e,
 		insn.word = 0; /* nop */
 	}
 
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 	if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) {
 		insn_p->halfword[0] = insn.word >> 16;
@@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e,
 			   (unsigned long)insn_p + sizeof(*insn_p));
 
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 #endif /* HAVE_JUMP_LABEL */
diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c
index 07933b9e9ce0..93adde1ac166 100644
--- a/arch/sparc/kernel/jump_label.c
+++ b/arch/sparc/kernel/jump_label.c
@@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry,
 		val = 0x01000000;
 	}
 
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 	*insn = val;
 	flushi(insn);
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 #endif
diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c
index 07802d586988..93931a46625b 100644
--- a/arch/tile/kernel/jump_label.c
+++ b/arch/tile/kernel/jump_label.c
@@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e,
 void arch_jump_label_transform(struct jump_entry *e,
 				enum jump_label_type type)
 {
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 
 	__jump_label_transform(e, type);
 	flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits));
 
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 __init_or_module void arch_jump_label_transform_static(struct jump_entry *e,
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index c37bd0f39c70..ab4f491da2a9 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry,
 void arch_jump_label_transform(struct jump_entry *entry,
 			       enum jump_label_type type)
 {
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 	__jump_label_transform(entry, type, NULL, 0);
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 static enum {
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 6c9cb208ac48..d11c506a6ac3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -15,6 +15,7 @@
 #include <linux/static_key.h>
 #include <linux/jump_label_ratelimit.h>
 #include <linux/bug.h>
+#include <linux/cpu.h>
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key)
 			return;
 	}
 
+	cpus_read_lock();
 	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
 		atomic_set(&key->enabled, -1);
@@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key)
 		atomic_inc(&key->enabled);
 	}
 	jump_label_unlock();
+	cpus_read_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
+	cpus_read_lock();
 	/*
 	 * The negative count check is valid even when a negative
 	 * key->enabled is in use by static_key_slow_inc(); a
@@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key,
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
+		cpus_read_unlock();
 		return;
 	}
 
@@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key,
 		jump_label_update(key);
 	}
 	jump_label_unlock();
+	cpus_read_unlock();
 }
 
 static void jump_label_update_timeout(struct work_struct *work)
@@ -334,6 +340,7 @@ void __init jump_label_init(void)
 	if (static_key_initialized)
 		return;
 
+	cpus_read_lock();
 	jump_label_lock();
 	jump_label_sort_entries(iter_start, iter_stop);
 
@@ -353,6 +360,7 @@ void __init jump_label_init(void)
 	}
 	static_key_initialized = true;
 	jump_label_unlock();
+	cpus_read_unlock();
 }
 
 #ifdef CONFIG_MODULES
@@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
 	struct module *mod = data;
 	int ret = 0;
 
+	cpus_read_lock();
+	jump_label_lock();
+
 	switch (val) {
 	case MODULE_STATE_COMING:
-		jump_label_lock();
 		ret = jump_label_add_module(mod);
 		if (ret) {
 			WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n");
 			jump_label_del_module(mod);
 		}
-		jump_label_unlock();
 		break;
 	case MODULE_STATE_GOING:
-		jump_label_lock();
 		jump_label_del_module(mod);
-		jump_label_unlock();
 		break;
 	case MODULE_STATE_LIVE:
-		jump_label_lock();
 		jump_label_invalidate_module_init(mod);
-		jump_label_unlock();
 		break;
 	}
 
+	jump_label_unlock();
+	cpus_read_unlock();
+
 	return notifier_from_errno(ret);
 }
 
-- 
cgit v1.3-14-g43fede


From 2d1e38f56622b9bb5af85be63c1052c056f5c677 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:36 +0200
Subject: kprobes: Cure hotplug lock ordering issues

Converting the cpu hotplug locking to a percpu rwsem unearthed hidden lock
ordering problems.

There is a wide range of locks involved in this: kprobe_mutex,
jump_label_mutex, ftrace_lock, text_mutex, event_mutex, module_mutex,
func_hash->regex_lock and a gazillion of lock order permutations with
nested get_online_cpus() calls.

Some of those permutations are potential deadlocks even with the current
nesting hotplug locking scheme, but they can't be discovered by lockdep.

The conversion of the hotplug locking to a percpu rwsem requires preventing
nested locking, so the hotplug rwsem must be taken early in the call chain
to establish a proper lock order.

After quite some analysis and going down the wrong road several times the
following lock order has been chosen:

kprobe_mutex -> cpus_rwsem -> jump_label_mutex -> text_mutex

For kprobes which hook on an ftrace function trace point, it's required to
drop cpus_rwsem before calling into the ftrace code to avoid a deadlock on
the func_hash->regex_lock.

[ Steven: Ftrace interaction fixes ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Link: http://lkml.kernel.org/r/20170524081549.104864779@linutronix.de
---
 kernel/kprobes.c | 59 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2d2d3a568e4e..9f6056749a28 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
  */
 static void do_optimize_kprobes(void)
 {
-	/* Optimization never be done when disarmed */
-	if (kprobes_all_disarmed || !kprobes_allow_optimization ||
-	    list_empty(&optimizing_list))
-		return;
-
 	/*
 	 * The optimization/unoptimization refers online_cpus via
 	 * stop_machine() and cpu-hotplug modifies online_cpus.
@@ -495,14 +490,19 @@ static void do_optimize_kprobes(void)
 	 * This combination can cause a deadlock (cpu-hotplug try to lock
 	 * text_mutex but stop_machine can not be done because online_cpus
 	 * has been changed)
-	 * To avoid this deadlock, we need to call get_online_cpus()
+	 * To avoid this deadlock, caller must have locked cpu hotplug
 	 * for preventing cpu-hotplug outside of text_mutex locking.
 	 */
-	get_online_cpus();
+	lockdep_assert_cpus_held();
+
+	/* Optimization never be done when disarmed */
+	if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+	    list_empty(&optimizing_list))
+		return;
+
 	mutex_lock(&text_mutex);
 	arch_optimize_kprobes(&optimizing_list);
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 /*
@@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void)
 {
 	struct optimized_kprobe *op, *tmp;
 
+	/* See comment in do_optimize_kprobes() */
+	lockdep_assert_cpus_held();
+
 	/* Unoptimization must be done anytime */
 	if (list_empty(&unoptimizing_list))
 		return;
 
-	/* Ditto to do_optimize_kprobes */
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
 	/* Loop free_list for disarming */
@@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void)
 			list_del_init(&op->list);
 	}
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 }
 
 /* Reclaim all kprobes on the free_list */
@@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void)
 static void kprobe_optimizer(struct work_struct *work)
 {
 	mutex_lock(&kprobe_mutex);
+	cpus_read_lock();
 	/* Lock modules while optimizing kprobes */
 	mutex_lock(&module_mutex);
 
@@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work)
 	do_free_cleaned_kprobes();
 
 	mutex_unlock(&module_mutex);
+	cpus_read_unlock();
 	mutex_unlock(&kprobe_mutex);
 
 	/* Step 5: Kick optimizer again if needed */
@@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p)
 /* Short cut to direct unoptimizing */
 static void force_unoptimize_kprobe(struct optimized_kprobe *op)
 {
-	get_online_cpus();
+	lockdep_assert_cpus_held();
 	arch_unoptimize_kprobe(op);
-	put_online_cpus();
 	if (kprobe_disabled(&op->kp))
 		arch_disarm_kprobe(&op->kp);
 }
@@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
 		return;
 
 	/* For preparing optimization, jump_label_text_reserved() is called */
+	cpus_read_lock();
 	jump_label_lock();
 	mutex_lock(&text_mutex);
 
@@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p)
 out:
 	mutex_unlock(&text_mutex);
 	jump_label_unlock();
+	cpus_read_unlock();
 }
 
 #ifdef CONFIG_SYSCTL
@@ -826,6 +829,7 @@ static void optimize_all_kprobes(void)
 	if (kprobes_allow_optimization)
 		goto out;
 
+	cpus_read_lock();
 	kprobes_allow_optimization = true;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
@@ -833,6 +837,7 @@ static void optimize_all_kprobes(void)
 			if (!kprobe_disabled(p))
 				optimize_kprobe(p);
 	}
+	cpus_read_unlock();
 	printk(KERN_INFO "Kprobes globally optimized\n");
 out:
 	mutex_unlock(&kprobe_mutex);
@@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void)
 		return;
 	}
 
+	cpus_read_lock();
 	kprobes_allow_optimization = false;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
@@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void)
 				unoptimize_kprobe(p, false);
 		}
 	}
+	cpus_read_unlock();
 	mutex_unlock(&kprobe_mutex);
 
 	/* Wait for unoptimizing completion */
@@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp)
 		arm_kprobe_ftrace(kp);
 		return;
 	}
-	/*
-	 * Here, since __arm_kprobe() doesn't use stop_machine(),
-	 * this doesn't cause deadlock on text_mutex. So, we don't
-	 * need get_online_cpus().
-	 */
+	cpus_read_lock();
 	mutex_lock(&text_mutex);
 	__arm_kprobe(kp);
 	mutex_unlock(&text_mutex);
+	cpus_read_unlock();
 }
 
 /* Disarm a kprobe with text_mutex */
@@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt)
 		disarm_kprobe_ftrace(kp);
 		return;
 	}
-	/* Ditto */
+
+	cpus_read_lock();
 	mutex_lock(&text_mutex);
 	__disarm_kprobe(kp, reopt);
 	mutex_unlock(&text_mutex);
+	cpus_read_unlock();
 }
 
 /*
@@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
 	int ret = 0;
 	struct kprobe *ap = orig_p;
 
+	cpus_read_lock();
+
 	/* For preparing optimization, jump_label_text_reserved() is called */
 	jump_label_lock();
-	/*
-	 * Get online CPUs to avoid text_mutex deadlock.with stop machine,
-	 * which is invoked by unoptimize_kprobe() in add_new_kprobe()
-	 */
-	get_online_cpus();
 	mutex_lock(&text_mutex);
 
 	if (!kprobe_aggrprobe(orig_p)) {
@@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
 
 out:
 	mutex_unlock(&text_mutex);
-	put_online_cpus();
 	jump_label_unlock();
+	cpus_read_unlock();
 
 	if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
 		ap->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p)
 		goto out;
 	}
 
-	mutex_lock(&text_mutex);	/* Avoiding text modification */
+	cpus_read_lock();
+	/* Prevent text modification */
+	mutex_lock(&text_mutex);
 	ret = prepare_kprobe(p);
 	mutex_unlock(&text_mutex);
+	cpus_read_unlock();
 	if (ret)
 		goto out;
 
@@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p)
 
 	/* Try to optimize kprobe */
 	try_to_optimize_kprobe(p);
-
 out:
 	mutex_unlock(&kprobe_mutex);
 
-- 
cgit v1.3-14-g43fede


From fc8dffd379ca5620664336eb895a426b42847558 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:40 +0200
Subject: cpu/hotplug: Convert hotplug locking to percpu rwsem

There are no more (known) nested calls to get_online_cpus() and all
observed lock ordering problems have been addressed.

Replace the magic nested 'rwsem' hackery with a percpu-rwsem.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081549.447014063@linutronix.de
---
 include/linux/cpu.h |   2 +-
 kernel/cpu.c        | 107 +++++++---------------------------------------------
 2 files changed, 14 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index af4d660798e5..ca73bc1563f4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -103,7 +103,7 @@ extern void cpus_write_lock(void);
 extern void cpus_write_unlock(void);
 extern void cpus_read_lock(void);
 extern void cpus_read_unlock(void);
-static inline void lockdep_assert_cpus_held(void) { }
+extern void lockdep_assert_cpus_held(void);
 extern void cpu_hotplug_disable(void);
 extern void cpu_hotplug_enable(void);
 void clear_tasks_mm_cpumask(int cpu);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 142d889d9f69..66836216ebae 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -27,6 +27,7 @@
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
+#include <linux/percpu-rwsem.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -196,121 +197,41 @@ void cpu_maps_update_done(void)
 	mutex_unlock(&cpu_add_remove_lock);
 }
 
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
  * Should always be manipulated under cpu_add_remove_lock
  */
 static int cpu_hotplug_disabled;
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static struct {
-	struct task_struct *active_writer;
-	/* wait queue to wake up the active_writer */
-	wait_queue_head_t wq;
-	/* verifies that no writer will get active while readers are active */
-	struct mutex lock;
-	/*
-	 * Also blocks the new readers during
-	 * an ongoing cpu hotplug operation.
-	 */
-	atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
-	.active_writer = NULL,
-	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
-	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map),
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
-				  lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
-
+DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
 
 void cpus_read_lock(void)
 {
-	might_sleep();
-	if (cpu_hotplug.active_writer == current)
-		return;
-	cpuhp_lock_acquire_read();
-	mutex_lock(&cpu_hotplug.lock);
-	atomic_inc(&cpu_hotplug.refcount);
-	mutex_unlock(&cpu_hotplug.lock);
+	percpu_down_read(&cpu_hotplug_lock);
 }
 EXPORT_SYMBOL_GPL(cpus_read_lock);
 
 void cpus_read_unlock(void)
 {
-	int refcount;
-
-	if (cpu_hotplug.active_writer == current)
-		return;
-
-	refcount = atomic_dec_return(&cpu_hotplug.refcount);
-	if (WARN_ON(refcount < 0)) /* try to fix things up */
-		atomic_inc(&cpu_hotplug.refcount);
-
-	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
-		wake_up(&cpu_hotplug.wq);
-
-	cpuhp_lock_release();
-
+	percpu_up_read(&cpu_hotplug_lock);
 }
 EXPORT_SYMBOL_GPL(cpus_read_unlock);
 
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- *   writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- *   non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
 void cpus_write_lock(void)
 {
-	DEFINE_WAIT(wait);
-
-	cpu_hotplug.active_writer = current;
-	cpuhp_lock_acquire();
-
-	for (;;) {
-		mutex_lock(&cpu_hotplug.lock);
-		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
-		if (likely(!atomic_read(&cpu_hotplug.refcount)))
-				break;
-		mutex_unlock(&cpu_hotplug.lock);
-		schedule();
-	}
-	finish_wait(&cpu_hotplug.wq, &wait);
+	percpu_down_write(&cpu_hotplug_lock);
 }
 
 void cpus_write_unlock(void)
 {
-	cpu_hotplug.active_writer = NULL;
-	mutex_unlock(&cpu_hotplug.lock);
-	cpuhp_lock_release();
+	percpu_up_write(&cpu_hotplug_lock);
+}
+
+void lockdep_assert_cpus_held(void)
+{
+	percpu_rwsem_assert_held(&cpu_hotplug_lock);
 }
 
 /*
@@ -344,8 +265,6 @@ void cpu_hotplug_enable(void)
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
-/* Notifier wrappers for transitioning to state machine */
-
 static int bringup_wait_for_ap(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-- 
cgit v1.3-14-g43fede


From 49dfe2a6779717d9c18395684ee31bdc98b22e53 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 24 May 2017 10:15:43 +0200
Subject: cpuhotplug: Link lock stacks for hotplug callbacks

The CPU hotplug callbacks are not covered by lockdep versus the cpu hotplug
rwsem.

CPU0						CPU1
cpuhp_setup_state(STATE, startup, teardown);
 cpus_read_lock();
  invoke_callback_on_ap();
    kick_hotplug_thread(ap);
    wait_for_completion();			hotplug_thread_fn()
    						  lock(m);
						  do_stuff();
						  unlock(m);

Lockdep does not know about this dependency and will not trigger on the
following code sequence:

	  lock(m);
	  cpus_read_lock();

Add a lockdep map and connect the initiator's lock chain with the hotplug
thread lock chain, so potential deadlocks can be detected.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/20170524081549.709375845@linutronix.de
---
 kernel/cpu.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 66836216ebae..7435ffc6163b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -66,6 +66,12 @@ struct cpuhp_cpu_state {
 
 static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
 
+#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
+static struct lock_class_key cpuhp_state_key;
+static struct lockdep_map cpuhp_state_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+#endif
+
 /**
  * cpuhp_step - Hotplug state machine step
  * @name:	Name of the step
@@ -403,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
 
 	st->should_run = false;
 
+	lock_map_acquire(&cpuhp_state_lock_map);
 	/* Single callback invocation for [un]install ? */
 	if (st->single) {
 		if (st->cb_state < CPUHP_AP_ONLINE) {
@@ -429,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
 		else if (st->state > st->target)
 			ret = cpuhp_ap_offline(cpu, st);
 	}
+	lock_map_release(&cpuhp_state_lock_map);
 	st->result = ret;
 	complete(&st->done);
 }
@@ -443,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
 	if (!cpu_online(cpu))
 		return 0;
 
+	lock_map_acquire(&cpuhp_state_lock_map);
+	lock_map_release(&cpuhp_state_lock_map);
+
 	/*
 	 * If we are up and running, use the hotplug thread. For early calls
 	 * we invoke the thread function directly.
@@ -486,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
 	enum cpuhp_state state = st->state;
 
 	trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
+	lock_map_acquire(&cpuhp_state_lock_map);
+	lock_map_release(&cpuhp_state_lock_map);
 	__cpuhp_kick_ap_work(st);
 	wait_for_completion(&st->done);
 	trace_cpuhp_exit(cpu, st->state, state, st->result);
-- 
cgit v1.3-14-g43fede


From 5a29ef22098874db79af7bf92a247a0f503bfa6e Mon Sep 17 00:00:00 2001
From: Vincent Legoll <vincent.legoll@gmail.com>
Date: Tue, 9 May 2017 10:34:09 +0200
Subject: genirq: Make early_irq_init() print out more informative

The printk in early_irq_init() is cryptic and badly formatted:

  NR_IRQS:33024 nr_irqs:968 16

The last number is the number of preallocated interrupts, so add a prefix
to it:

  NR_IRQS: 33024, nr_irqs: 968, preallocated irqs: 16

Clean up the formatting for better readability as well.

Signed-off-by: Vincent Legoll <vincent.legoll@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1494318849-6733-1-git-send-email-vincent.legoll@gmail.com
---
 kernel/irq/irqdesc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 00bb0aeea1d0..09abce2ea8f0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -480,7 +480,8 @@ int __init early_irq_init(void)
 
 	/* Let arch update nr_irqs and return the nr of preallocated irqs */
 	initcnt = arch_probe_nr_irqs();
-	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+	printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
+	       NR_IRQS, nr_irqs, initcnt);
 
 	if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
 		nr_irqs = IRQ_BITMAP_BITS;
@@ -516,7 +517,7 @@ int __init early_irq_init(void)
 
 	init_irq_default_affinity();
 
-	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+	printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);
 
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
-- 
cgit v1.3-14-g43fede


From d3ba5a9a345b1243276f8a982e1bce557c2504fd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 May 2017 12:03:11 +0300
Subject: posix-timers: Make posix_clocks immutable

There are no more modular users providing a posix clock. The register
function is now pointless so the posix clock array can be initialized
statically at compile time and the array including the various k_clock
structs can be marked 'const'.

Inspired by changes in the Grsecurity patch set, but done proper.

[ tglx: Massaged changelog and fixed the POSIX_TIMER=n case ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Mike Travis <mike.travis@hpe.com>
Cc: Dimitri Sivanich <sivanich@hpe.com>
Link: http://lkml.kernel.org/r/20170526090311.3377-3-hch@lst.de
---
 include/linux/posix-timers.h   |   9 +-
 kernel/time/alarmtimer.c       |  89 ++++++++++---------
 kernel/time/posix-clock.c      |   2 +-
 kernel/time/posix-cpu-timers.c |  34 +++-----
 kernel/time/posix-timers.c     | 191 +++++++++++++++++++----------------------
 5 files changed, 151 insertions(+), 174 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8c1e43ab14a9..b313ef2e7385 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -105,10 +105,11 @@ struct k_clock {
 			   struct itimerspec64 *cur_setting);
 };
 
-extern struct k_clock clock_posix_cpu;
-extern struct k_clock clock_posix_dynamic;
-
-void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock);
+extern const struct k_clock clock_posix_cpu;
+extern const struct k_clock clock_posix_dynamic;
+extern const struct k_clock clock_process;
+extern const struct k_clock clock_thread;
+extern const struct k_clock alarm_clock;
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5cb5b0008d97..4f4cc3509b30 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -307,38 +307,6 @@ static int alarmtimer_resume(struct device *dev)
 }
 #endif
 
-static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
-{
-	struct alarm_base *base;
-	unsigned long flags;
-	ktime_t delta;
-
-	switch(type) {
-	case ALARM_REALTIME:
-		base = &alarm_bases[ALARM_REALTIME];
-		type = ALARM_REALTIME_FREEZER;
-		break;
-	case ALARM_BOOTTIME:
-		base = &alarm_bases[ALARM_BOOTTIME];
-		type = ALARM_BOOTTIME_FREEZER;
-		break;
-	default:
-		WARN_ONCE(1, "Invalid alarm type: %d\n", type);
-		return;
-	}
-
-	delta = ktime_sub(absexp, base->gettime());
-
-	spin_lock_irqsave(&freezer_delta_lock, flags);
-	if (!freezer_delta || (delta < freezer_delta)) {
-		freezer_delta = delta;
-		freezer_expires = absexp;
-		freezer_alarmtype = type;
-	}
-	spin_unlock_irqrestore(&freezer_delta_lock, flags);
-}
-
-
 /**
  * alarm_init - Initialize an alarm structure
  * @alarm: ptr to alarm to be initialized
@@ -488,6 +456,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
 }
 EXPORT_SYMBOL_GPL(alarm_forward_now);
 
+#ifdef CONFIG_POSIX_TIMERS
+
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+	struct alarm_base *base;
+	unsigned long flags;
+	ktime_t delta;
+
+	switch(type) {
+	case ALARM_REALTIME:
+		base = &alarm_bases[ALARM_REALTIME];
+		type = ALARM_REALTIME_FREEZER;
+		break;
+	case ALARM_BOOTTIME:
+		base = &alarm_bases[ALARM_BOOTTIME];
+		type = ALARM_BOOTTIME_FREEZER;
+		break;
+	default:
+		WARN_ONCE(1, "Invalid alarm type: %d\n", type);
+		return;
+	}
+
+	delta = ktime_sub(absexp, base->gettime());
+
+	spin_lock_irqsave(&freezer_delta_lock, flags);
+	if (!freezer_delta || (delta < freezer_delta)) {
+		freezer_delta = delta;
+		freezer_expires = absexp;
+		freezer_alarmtype = type;
+	}
+	spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
 
 /**
  * clock2alarm - helper that converts from clockid to alarmtypes
@@ -846,6 +846,17 @@ out:
 	return ret;
 }
 
+const struct k_clock alarm_clock = {
+	.clock_getres	= alarm_clock_getres,
+	.clock_get	= alarm_clock_get,
+	.timer_create	= alarm_timer_create,
+	.timer_set	= alarm_timer_set,
+	.timer_del	= alarm_timer_del,
+	.timer_get	= alarm_timer_get,
+	.nsleep		= alarm_timer_nsleep,
+};
+#endif /* CONFIG_POSIX_TIMERS */
+
 
 /* Suspend hook structures */
 static const struct dev_pm_ops alarmtimer_pm_ops = {
@@ -871,23 +882,9 @@ static int __init alarmtimer_init(void)
 	struct platform_device *pdev;
 	int error = 0;
 	int i;
-	struct k_clock alarm_clock = {
-		.clock_getres	= alarm_clock_getres,
-		.clock_get	= alarm_clock_get,
-		.timer_create	= alarm_timer_create,
-		.timer_set	= alarm_timer_set,
-		.timer_del	= alarm_timer_del,
-		.timer_get	= alarm_timer_get,
-		.nsleep		= alarm_timer_nsleep,
-	};
 
 	alarmtimer_rtc_timer_init();
 
-	if (IS_ENABLED(CONFIG_POSIX_TIMERS)) {
-		posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
-		posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
-	}
-
 	/* Initialize alarm bases */
 	alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
 	alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 31d588d37a17..7e453005e078 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -434,7 +434,7 @@ static int pc_timer_settime(struct k_itimer *kit, int flags,
 	return err;
 }
 
-struct k_clock clock_posix_dynamic = {
+const struct k_clock clock_posix_dynamic = {
 	.clock_getres	= pc_clock_getres,
 	.clock_set	= pc_clock_settime,
 	.clock_get	= pc_clock_gettime,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1370f067fb51..1a522b39f19d 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1413,7 +1413,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 
-struct k_clock clock_posix_cpu = {
+const struct k_clock clock_posix_cpu = {
 	.clock_getres	= posix_cpu_clock_getres,
 	.clock_set	= posix_cpu_clock_set,
 	.clock_get	= posix_cpu_clock_get,
@@ -1425,24 +1425,16 @@ struct k_clock clock_posix_cpu = {
 	.timer_get	= posix_cpu_timer_get,
 };
 
-static __init int init_posix_cpu_timers(void)
-{
-	struct k_clock process = {
-		.clock_getres	= process_cpu_clock_getres,
-		.clock_get	= process_cpu_clock_get,
-		.timer_create	= process_cpu_timer_create,
-		.nsleep		= process_cpu_nsleep,
-		.nsleep_restart	= process_cpu_nsleep_restart,
-	};
-	struct k_clock thread = {
-		.clock_getres	= thread_cpu_clock_getres,
-		.clock_get	= thread_cpu_clock_get,
-		.timer_create	= thread_cpu_timer_create,
-	};
-
-	posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
-	posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+const struct k_clock clock_process = {
+	.clock_getres	= process_cpu_clock_getres,
+	.clock_get	= process_cpu_clock_get,
+	.timer_create	= process_cpu_timer_create,
+	.nsleep		= process_cpu_nsleep,
+	.nsleep_restart	= process_cpu_nsleep_restart,
+};
 
-	return 0;
-}
-__initcall(init_posix_cpu_timers);
+const struct k_clock clock_thread = {
+	.clock_getres	= thread_cpu_clock_getres,
+	.clock_get	= thread_cpu_clock_get,
+	.timer_create	= thread_cpu_timer_create,
+};
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4d7b2ce09c27..0c0cccfa3586 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -125,8 +125,6 @@ static DEFINE_SPINLOCK(hash_lock);
  *	    which we beg off on and pass to do_sys_settimeofday().
  */
 
-static struct k_clock posix_clocks[MAX_CLOCKS];
-
 /*
  * These ones are defined below.
  */
@@ -280,74 +278,87 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 	return 0;
 }
 
+
+static const struct k_clock clock_realtime = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_clock_realtime_get,
+	.clock_set	= posix_clock_realtime_set,
+	.clock_adj	= posix_clock_realtime_adj,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_monotonic = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_ktime_get_ts,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_monotonic_raw = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_monotonic_raw,
+};
+
+static const struct k_clock clock_realtime_coarse = {
+	.clock_getres	= posix_get_coarse_res,
+	.clock_get	= posix_get_realtime_coarse,
+};
+
+static const struct k_clock clock_monotonic_coarse = {
+	.clock_getres	= posix_get_coarse_res,
+	.clock_get	= posix_get_monotonic_coarse,
+};
+
+static const struct k_clock clock_tai = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_tai,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_boottime = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_boottime,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock * const posix_clocks[] = {
+	[CLOCK_REALTIME]		= &clock_realtime,
+	[CLOCK_MONOTONIC]		= &clock_monotonic,
+	[CLOCK_PROCESS_CPUTIME_ID]	= &clock_process,
+	[CLOCK_THREAD_CPUTIME_ID]	= &clock_thread,
+	[CLOCK_MONOTONIC_RAW]		= &clock_monotonic_raw,
+	[CLOCK_REALTIME_COARSE]		= &clock_realtime_coarse,
+	[CLOCK_MONOTONIC_COARSE]	= &clock_monotonic_coarse,
+	[CLOCK_BOOTTIME]		= &clock_boottime,
+	[CLOCK_REALTIME_ALARM]		= &alarm_clock,
+	[CLOCK_BOOTTIME_ALARM]		= &alarm_clock,
+	[CLOCK_TAI]			= &clock_tai,
+};
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
-	struct k_clock clock_realtime = {
-		.clock_getres	= posix_get_hrtimer_res,
-		.clock_get	= posix_clock_realtime_get,
-		.clock_set	= posix_clock_realtime_set,
-		.clock_adj	= posix_clock_realtime_adj,
-		.nsleep		= common_nsleep,
-		.nsleep_restart	= hrtimer_nanosleep_restart,
-		.timer_create	= common_timer_create,
-		.timer_set	= common_timer_set,
-		.timer_get	= common_timer_get,
-		.timer_del	= common_timer_del,
-	};
-	struct k_clock clock_monotonic = {
-		.clock_getres	= posix_get_hrtimer_res,
-		.clock_get	= posix_ktime_get_ts,
-		.nsleep		= common_nsleep,
-		.nsleep_restart	= hrtimer_nanosleep_restart,
-		.timer_create	= common_timer_create,
-		.timer_set	= common_timer_set,
-		.timer_get	= common_timer_get,
-		.timer_del	= common_timer_del,
-	};
-	struct k_clock clock_monotonic_raw = {
-		.clock_getres	= posix_get_hrtimer_res,
-		.clock_get	= posix_get_monotonic_raw,
-	};
-	struct k_clock clock_realtime_coarse = {
-		.clock_getres	= posix_get_coarse_res,
-		.clock_get	= posix_get_realtime_coarse,
-	};
-	struct k_clock clock_monotonic_coarse = {
-		.clock_getres	= posix_get_coarse_res,
-		.clock_get	= posix_get_monotonic_coarse,
-	};
-	struct k_clock clock_tai = {
-		.clock_getres	= posix_get_hrtimer_res,
-		.clock_get	= posix_get_tai,
-		.nsleep		= common_nsleep,
-		.nsleep_restart	= hrtimer_nanosleep_restart,
-		.timer_create	= common_timer_create,
-		.timer_set	= common_timer_set,
-		.timer_get	= common_timer_get,
-		.timer_del	= common_timer_del,
-	};
-	struct k_clock clock_boottime = {
-		.clock_getres	= posix_get_hrtimer_res,
-		.clock_get	= posix_get_boottime,
-		.nsleep		= common_nsleep,
-		.nsleep_restart	= hrtimer_nanosleep_restart,
-		.timer_create	= common_timer_create,
-		.timer_set	= common_timer_set,
-		.timer_get	= common_timer_get,
-		.timer_del	= common_timer_del,
-	};
-
-	posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
-	posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
-	posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
-	posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
-	posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
-	posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
-	posix_timers_register_clock(CLOCK_TAI, &clock_tai);
-
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
 					NULL);
@@ -521,30 +532,6 @@ static struct pid *good_sigevent(sigevent_t * event)
 	return task_pid(rtn);
 }
 
-void posix_timers_register_clock(const clockid_t clock_id,
-				 struct k_clock *new_clock)
-{
-	if ((unsigned) clock_id >= MAX_CLOCKS) {
-		printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
-		       clock_id);
-		return;
-	}
-
-	if (!new_clock->clock_get) {
-		printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
-		       clock_id);
-		return;
-	}
-	if (!new_clock->clock_getres) {
-		printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
-		       clock_id);
-		return;
-	}
-
-	posix_clocks[clock_id] = *new_clock;
-}
-EXPORT_SYMBOL_GPL(posix_timers_register_clock);
-
 static struct k_itimer * alloc_posix_timer(void)
 {
 	struct k_itimer *tmr;
@@ -581,15 +568,15 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
 }
 
-static struct k_clock *clockid_to_kclock(const clockid_t id)
+static const struct k_clock *clockid_to_kclock(const clockid_t id)
 {
 	if (id < 0)
 		return (id & CLOCKFD_MASK) == CLOCKFD ?
 			&clock_posix_dynamic : &clock_posix_cpu;
 
-	if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+	if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
 		return NULL;
-	return &posix_clocks[id];
+	return posix_clocks[id];
 }
 
 static int common_timer_create(struct k_itimer *new_timer)
@@ -604,7 +591,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 		struct sigevent __user *, timer_event_spec,
 		timer_t __user *, created_timer_id)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
 	sigevent_t event;
@@ -781,7 +768,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	struct itimerspec64 cur_setting64;
 	struct itimerspec cur_setting;
 	struct k_itimer *timr;
-	struct k_clock *kc;
+	const struct k_clock *kc;
 	unsigned long flags;
 	int ret = 0;
 
@@ -890,7 +877,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 	struct itimerspec new_spec, old_spec;
 	struct k_itimer *timr;
 	unsigned long flag;
-	struct k_clock *kc;
+	const struct k_clock *kc;
 	int error = 0;
 
 	if (!new_setting)
@@ -939,7 +926,7 @@ static int common_timer_del(struct k_itimer *timer)
 
 static inline int timer_delete_hook(struct k_itimer *timer)
 {
-	struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+	const struct k_clock *kc = clockid_to_kclock(timer->it_clock);
 
 	if (WARN_ON_ONCE(!kc || !kc->timer_del))
 		return -EINVAL;
@@ -1018,7 +1005,7 @@ void exit_itimers(struct signal_struct *sig)
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 new_tp64;
 	struct timespec new_tp;
 
@@ -1035,7 +1022,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 		struct timespec __user *,tp)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 kernel_tp64;
 	struct timespec kernel_tp;
 	int error;
@@ -1055,7 +1042,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
 		struct timex __user *, utx)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timex ktx;
 	int err;
 
@@ -1078,7 +1065,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 rtn_tp64;
 	struct timespec rtn_tp;
 	int error;
@@ -1110,7 +1097,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		const struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 t64;
 	struct timespec t;
 
@@ -1136,7 +1123,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 long clock_nanosleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->nanosleep.clockid;
-	struct k_clock *kc = clockid_to_kclock(which_clock);
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
 
 	if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
 		return -EINVAL;
-- 
cgit v1.3-14-g43fede


From b6b3b80fceb175c825ad6c72659e0a72e201fc5f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 27 May 2017 12:23:47 +0200
Subject: alarmtimer: Fix posix-timer constification fallout

Some freezer related variables are only used when either CONFIG_POSIX_TIMERS
or CONFIG_RTC_CLASS is enabled. Hide them when both are off.

Fixes: d3ba5a9a345b ("posix-timers: Make posix_clocks immutable")
Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Hellwig <hch@lst.de>
---
 kernel/time/alarmtimer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4f4cc3509b30..4cfebfff848d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -45,11 +45,13 @@ static struct alarm_base {
 	clockid_t		base_clockid;
 } alarm_bases[ALARM_NUMTYPE];
 
+#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS)
 /* freezer information to handle clock_nanosleep triggered wakeups */
 static enum alarmtimer_type freezer_alarmtype;
 static ktime_t freezer_expires;
 static ktime_t freezer_delta;
 static DEFINE_SPINLOCK(freezer_delta_lock);
+#endif
 
 static struct wakeup_source *ws;
 
-- 
cgit v1.3-14-g43fede


From 613763a1f056211522bac77ff39f25706e678fdd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 May 2017 22:04:29 -0400
Subject: take compat_sys_old_getrlimit() to native syscall

... and sanitize the ifdefs in there

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/include/asm/compat.h |  1 -
 arch/s390/include/asm/compat.h    |  1 -
 arch/x86/include/asm/compat.h     |  1 -
 include/linux/syscalls.h          |  2 +-
 kernel/compat.c                   | 29 -----------------------------
 kernel/sys.c                      | 24 ++++++++++++++++++++++++
 6 files changed, 25 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index 4f2df589ec1d..f256e1d14a14 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -109,7 +109,6 @@ struct compat_statfs {
 	int		f_spare[4];
 };
 
-#define COMPAT_RLIM_OLD_INFINITY	0x7fffffff
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
 typedef u32		compat_old_sigset_t;
diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h
index 0ddd37e6c29d..b9300f8aee10 100644
--- a/arch/s390/include/asm/compat.h
+++ b/arch/s390/include/asm/compat.h
@@ -178,7 +178,6 @@ struct compat_statfs64 {
 	u32		f_spare[4];
 };
 
-#define COMPAT_RLIM_OLD_INFINITY	0x7fffffff
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
 typedef u32		compat_old_sigset_t;	/* at least 32 bits */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 24118c0b4640..5343c19814b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -116,7 +116,6 @@ struct compat_statfs {
 	int		f_spare[4];
 };
 
-#define COMPAT_RLIM_OLD_INFINITY	0x7fffffff
 #define COMPAT_RLIM_INFINITY		0xffffffff
 
 typedef u32		compat_old_sigset_t;	/* at least 32 bits */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..3cb15ea48aee 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -650,7 +650,7 @@ asmlinkage long sys_olduname(struct oldold_utsname __user *);
 
 asmlinkage long sys_getrlimit(unsigned int resource,
 				struct rlimit __user *rlim);
-#if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64))
+#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
 asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
 #endif
 asmlinkage long sys_setrlimit(unsigned int resource,
diff --git a/kernel/compat.c b/kernel/compat.c
index 933bcb31ae10..860f674fa556 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -468,35 +468,6 @@ COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
 	return do_prlimit(current, resource, &r, NULL);
 }
 
-#ifdef COMPAT_RLIM_OLD_INFINITY
-
-COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
-		       struct compat_rlimit __user *, rlim)
-{
-	struct rlimit r;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
-	set_fs(old_fs);
-
-	if (!ret) {
-		if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
-			r.rlim_cur = COMPAT_RLIM_INFINITY;
-		if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
-			r.rlim_max = COMPAT_RLIM_INFINITY;
-
-		if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
-		    __put_user(r.rlim_cur, &rlim->rlim_cur) ||
-		    __put_user(r.rlim_max, &rlim->rlim_max))
-			return -EFAULT;
-	}
-	return ret;
-}
-
-#endif
-
 COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
 		       struct compat_rlimit __user *, rlim)
 {
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..3778a8a417b6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1328,6 +1328,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
 	return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
+{
+	struct rlimit r;
+
+	if (resource >= RLIM_NLIMITS)
+		return -EINVAL;
+
+	task_lock(current->group_leader);
+	r = current->signal->rlim[resource];
+	task_unlock(current->group_leader);
+	if (r.rlim_cur > 0x7FFFFFFF)
+		r.rlim_cur = 0x7FFFFFFF;
+	if (r.rlim_max > 0x7FFFFFFF)
+		r.rlim_max = 0x7FFFFFFF;
+
+	if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
+	    put_user(r.rlim_max, &rlim->rlim_max))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
 #endif
 
 static inline bool rlim64_is_infinity(__u64 rlim64)
-- 
cgit v1.3-14-g43fede


From bcfe8ad8ef55a1cf3c935c2667e8e5ae598b3b7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 27 May 2017 00:29:34 -0400
Subject: do_sigaltstack(): lift copying to/from userland into callers

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/signal.c | 107 ++++++++++++++++++++++++--------------------------------
 1 file changed, 46 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..d1eed0d7ca64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3113,78 +3113,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 }
 
 static int
-do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
 {
-	stack_t oss;
-	int error;
+	struct task_struct *t = current;
 
-	oss.ss_sp = (void __user *) current->sas_ss_sp;
-	oss.ss_size = current->sas_ss_size;
-	oss.ss_flags = sas_ss_flags(sp) |
-		(current->sas_ss_flags & SS_FLAG_BITS);
+	if (oss) {
+		memset(oss, 0, sizeof(stack_t));
+		oss->ss_sp = (void __user *) t->sas_ss_sp;
+		oss->ss_size = t->sas_ss_size;
+		oss->ss_flags = sas_ss_flags(sp) |
+			(current->sas_ss_flags & SS_FLAG_BITS);
+	}
 
-	if (uss) {
-		void __user *ss_sp;
-		size_t ss_size;
-		unsigned ss_flags;
+	if (ss) {
+		void __user *ss_sp = ss->ss_sp;
+		size_t ss_size = ss->ss_size;
+		unsigned ss_flags = ss->ss_flags;
 		int ss_mode;
 
-		error = -EFAULT;
-		if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
-			goto out;
-		error = __get_user(ss_sp, &uss->ss_sp) |
-			__get_user(ss_flags, &uss->ss_flags) |
-			__get_user(ss_size, &uss->ss_size);
-		if (error)
-			goto out;
-
-		error = -EPERM;
-		if (on_sig_stack(sp))
-			goto out;
+		if (unlikely(on_sig_stack(sp)))
+			return -EPERM;
 
 		ss_mode = ss_flags & ~SS_FLAG_BITS;
-		error = -EINVAL;
-		if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
-				ss_mode != 0)
-			goto out;
+		if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+				ss_mode != 0))
+			return -EINVAL;
 
 		if (ss_mode == SS_DISABLE) {
 			ss_size = 0;
 			ss_sp = NULL;
 		} else {
-			error = -ENOMEM;
-			if (ss_size < MINSIGSTKSZ)
-				goto out;
+			if (unlikely(ss_size < MINSIGSTKSZ))
+				return -ENOMEM;
 		}
 
-		current->sas_ss_sp = (unsigned long) ss_sp;
-		current->sas_ss_size = ss_size;
-		current->sas_ss_flags = ss_flags;
-	}
-
-	error = 0;
-	if (uoss) {
-		error = -EFAULT;
-		if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
-			goto out;
-		error = __put_user(oss.ss_sp, &uoss->ss_sp) |
-			__put_user(oss.ss_size, &uoss->ss_size) |
-			__put_user(oss.ss_flags, &uoss->ss_flags);
+		t->sas_ss_sp = (unsigned long) ss_sp;
+		t->sas_ss_size = ss_size;
+		t->sas_ss_flags = ss_flags;
 	}
-
-out:
-	return error;
+	return 0;
 }
+
 SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
 {
-	return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+	stack_t new, old;
+	int err;
+	if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
+		return -EFAULT;
+	err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
+			      current_user_stack_pointer());
+	if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
+		err = -EFAULT;
+	return err;
 }
 
 int restore_altstack(const stack_t __user *uss)
 {
-	int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+	stack_t new;
+	if (copy_from_user(&new, uss, sizeof(stack_t)))
+		return -EFAULT;
+	(void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
 	/* squash all but EFAULT for now */
-	return err == -EFAULT ? err : 0;
+	return 0;
 }
 
 int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3207,29 +3197,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
 {
 	stack_t uss, uoss;
 	int ret;
-	mm_segment_t seg;
 
 	if (uss_ptr) {
 		compat_stack_t uss32;
-
-		memset(&uss, 0, sizeof(stack_t));
 		if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
 			return -EFAULT;
 		uss.ss_sp = compat_ptr(uss32.ss_sp);
 		uss.ss_flags = uss32.ss_flags;
 		uss.ss_size = uss32.ss_size;
 	}
-	seg = get_fs();
-	set_fs(KERNEL_DS);
-	ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
-			     (stack_t __force __user *) &uoss,
+	ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
 			     compat_user_stack_pointer());
-	set_fs(seg);
 	if (ret >= 0 && uoss_ptr)  {
-		if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
-		    __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
-		    __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
-		    __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+		compat_stack_t old;
+		memset(&old, 0, sizeof(old));
+		old.ss_sp = ptr_to_compat(uoss.ss_sp);
+		old.ss_flags = uoss.ss_flags;
+		old.ss_size = uoss.ss_size;
+		if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
 			ret = -EFAULT;
 	}
 	return ret;
-- 
cgit v1.3-14-g43fede


From 7c25904508afef134d7a9d2ad1690ee6554a1faa Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 15 May 2017 14:56:50 +0200
Subject: nohz: Reset next_tick cache even when the timer has no regs

Handle tick interrupts whose regs are NULL, out of general paranoia. It happens
when hrtimer_interrupt() is called from non-interrupt contexts, such as hotplug
CPU down events.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 764d2905e6a5..e3043873fcdc 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1202,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	 */
 	if (regs)
 		tick_sched_handle(ts, regs);
+	else
+		ts->next_tick = 0;
 
 	/* No need to reprogram if we are in idle or full dynticks mode */
 	if (unlikely(ts->tick_stopped))
-- 
cgit v1.3-14-g43fede


From 7786f6b6dfc12d17eea2df04116de6ebac50c884 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 7 Apr 2017 10:17:27 -0400
Subject: audit: add ambient capabilities to CAPSET and BPRM_FCAPS records

Capabilities were augmented to include ambient capabilities in v4.3
commit 58319057b784 ("capabilities: ambient capabilities").

Add ambient capabilities to the audit BPRM_FCAPS and CAPSET records.

The record contains fields "old_pp", "old_pi", "old_pe", "new_pp",
"new_pi", "new_pe" so in keeping with the previous record
normalizations, change the "new_*" variants to simply drop the "new_"
prefix.

A sample of the replaced BPRM_FCAPS record:
RAW: type=BPRM_FCAPS msg=audit(1491468034.252:237): fver=2
fp=0000000000200000 fi=0000000000000000 fe=1 old_pp=0000000000000000
old_pi=0000000000000000 old_pe=0000000000000000 old_pa=0000000000000000
pp=0000000000200000 pi=0000000000000000 pe=0000000000200000
pa=0000000000000000

INTERPRET: type=BPRM_FCAPS msg=audit(04/06/2017 04:40:34.252:237):
fver=2 fp=sys_admin fi=none fe=chown old_pp=none old_pi=none
old_pe=none old_pa=none pp=sys_admin pi=none pe=sys_admin pa=none

A sample of the replaced CAPSET record:
RAW: type=CAPSET msg=audit(1491469502.371:242): pid=833
cap_pi=0000003fffffffff cap_pp=0000003fffffffff cap_pe=0000003fffffffff
cap_pa=0000000000000000

INTERPRET: type=CAPSET msg=audit(04/06/2017 05:05:02.371:242) : pid=833
cap_pi=chown,dac_override,dac_read_search,fowner,fsetid,kill,
setgid,setuid,setpcap,linux_immutable,net_bind_service,net_broadcast,
net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot,
sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,sys_time,
sys_tty_config,mknod,lease,audit_write,audit_control,setfcap,
mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read
cap_pp=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid,
setuid,setpcap,linux_immutable,net_bind_service,net_broadcast,
net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot,
sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,
sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap,
mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read
cap_pe=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid,
setuid,setpcap,linux_immutable,net_bind_service,net_broadcast,
net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot,
sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,
sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap,
mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read
cap_pa=none

See: https://github.com/linux-audit/audit-kernel/issues/40

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h   |  1 +
 kernel/auditsc.c | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..bb3a4e14b7e5 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
 		unsigned int	fE;		/* effective bit of file cap */
 		kernel_cap_t	effective;	/* effective set of process */
 	};
+	kernel_cap_t		ambient;
 };
 
 /* When fs/namei.c:getname() is called, we store the pointer in name and bump
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2dcbe637b7c..5fa68d10032f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1260,6 +1260,7 @@ static void show_special(struct audit_context *context, int *call_panic)
 		audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
 		audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
 		audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+		audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
 		break;
 	case AUDIT_MMAP:
 		audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1381,9 +1382,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
 			audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
 			audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
-			audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
-			audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
-			audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+			audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
+			audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
+			audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
+			audit_log_cap(ab, "pe", &axs->new_pcap.effective);
+			audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
 			break; }
 
 		}
@@ -2341,10 +2344,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 	ax->old_pcap.permitted   = old->cap_permitted;
 	ax->old_pcap.inheritable = old->cap_inheritable;
 	ax->old_pcap.effective   = old->cap_effective;
+	ax->old_pcap.ambient     = old->cap_ambient;
 
 	ax->new_pcap.permitted   = new->cap_permitted;
 	ax->new_pcap.inheritable = new->cap_inheritable;
 	ax->new_pcap.effective   = new->cap_effective;
+	ax->new_pcap.ambient     = new->cap_ambient;
 	return 0;
 }
 
@@ -2363,6 +2368,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
 	context->capset.cap.effective   = new->cap_effective;
 	context->capset.cap.inheritable = new->cap_effective;
 	context->capset.cap.permitted   = new->cap_permitted;
+	context->capset.cap.ambient     = new->cap_ambient;
 	context->type = AUDIT_CAPSET;
 }
 
-- 
cgit v1.3-14-g43fede


From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:27 -0700
Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode

Free up the BPF_JMP | BPF_CALL | BPF_X opcode so it can be used by an actual
indirect call by register, and use a kernel-internal opcode to mark the
call instruction to the bpf_tail_call() helper.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c     | 2 +-
 arch/powerpc/net/bpf_jit_comp64.c | 2 +-
 arch/s390/net/bpf_jit_comp.c      | 2 +-
 arch/sparc/net/bpf_jit_comp_64.c  | 2 +-
 arch/x86/net/bpf_jit_comp.c       | 2 +-
 include/linux/filter.h            | 3 +++
 kernel/bpf/core.c                 | 2 +-
 kernel/bpf/verifier.c             | 2 +-
 8 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 71f930501ade..b1d38eeb24f6 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -586,7 +586,7 @@ emit_cond_jmp:
 		break;
 	}
 	/* tail call */
-	case BPF_JMP | BPF_CALL | BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		if (emit_bpf_tail_call(ctx))
 			return -EFAULT;
 		break;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index aee2bb817ac6..a01366584a4b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -938,7 +938,7 @@ common_load:
 		/*
 		 * Tail call
 		 */
-		case BPF_JMP | BPF_CALL | BPF_X:
+		case BPF_JMP | BPF_TAIL_CALL:
 			ctx->seen |= SEEN_TAILCALL;
 			bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]);
 			break;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 6e97a2e3fd8d..42ad3832586c 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -991,7 +991,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		}
 		break;
 	}
-	case BPF_JMP | BPF_CALL | BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		/*
 		 * Implicit input:
 		 *  B1: pointer to ctx
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 21de77419f48..4a52d34facf9 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1217,7 +1217,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 	}
 
 	/* tail call */
-	case BPF_JMP | BPF_CALL |BPF_X:
+	case BPF_JMP | BPF_TAIL_CALL:
 		emit_tail_call(ctx);
 		break;
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f58939393eef..fec12eaa0dec 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -877,7 +877,7 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;
 
-		case BPF_JMP | BPF_CALL | BPF_X:
+		case BPF_JMP | BPF_TAIL_CALL:
 			emit_bpf_tail_call(&prog);
 			break;
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 62d948f80730..a20ba40fcb73 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -57,6 +57,9 @@ struct bpf_prog_aux;
 #define BPF_REG_AX		MAX_BPF_REG
 #define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
 
+/* unused opcode to mark special call to bpf_tail_call() helper */
+#define BPF_TAIL_CALL	0xf0
+
 /* As per nm, we expose JITed images as text (code) section for
  * kallsyms. That way, tools like perf can find it to match
  * addresses.
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..339289402b96 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
-		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 339c8a1371de..28113d0e8e92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3469,7 +3469,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * that doesn't support bpf_tail_call yet
 			 */
 			insn->imm = 0;
-			insn->code |= BPF_X;
+			insn->code = BPF_JMP | BPF_TAIL_CALL;
 			continue;
 		}
 
-- 
cgit v1.3-14-g43fede


From f696b8f471ec987e987e38206b8eb23c39ee5a86 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:28 -0700
Subject: bpf: split bpf core interpreter

Split the __bpf_prog_run() interpreter into stack allocation and execution parts.
The code section shrinks, which helps interpreter performance in some cases.
   text	   data	    bss	    dec	    hex	filename
  26350	  10328	    624	  37302	   91b6	kernel/bpf/core.o.before
  25777	  10328	    624	  36729	   8f79	kernel/bpf/core.o.after

Very short programs got slower (due to extra function call):
Before:
test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 7 PASS
test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 8 PASS
test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 7 PASS
test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 11 PASS
test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 7 PASS
After:
test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 11 PASS
test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 11 PASS
test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 11 PASS
test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 14 PASS
test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 10 PASS

Longer programs got faster:
Before:
test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 20286 20513 PASS
test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 31853 31768 PASS
test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9815 PASS
test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 6 PASS
test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13959 PASS
test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 210 PASS
test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 21724 PASS
test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19118 PASS
After:
test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 19008 18827 PASS
test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 29238 28450 PASS
test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9485 PASS
test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 12 PASS
test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13257 PASS
test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 213 PASS
test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 19389 PASS
test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19583 PASS

For real world production programs the difference is noise.

This patch is first step towards reducing interpreter stack consumption.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 339289402b96..abd410d394bc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
  *
  * Decode and execute eBPF instructions.
  */
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+				    u64 *stack)
 {
-	u64 stack[MAX_BPF_STACK / sizeof(u64)];
-	u64 regs[MAX_BPF_REG], tmp;
+	u64 tmp;
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
 
-	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
-
 select_insn:
 	goto *jumptable[insn->code];
 
@@ -1219,7 +1216,17 @@ load_byte:
 		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
 		return 0;
 }
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+	u64 stack[MAX_BPF_STACK / sizeof(u64)];
+	u64 regs[MAX_BPF_REG];
+
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+	ARG1 = (u64) (unsigned long) ctx;
+	return ___bpf_prog_run(regs, insn, stack);
+}
 
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
-- 
cgit v1.3-14-g43fede


From 8726679a0fa317f8e83d0843b266453f31bff092 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:29 -0700
Subject: bpf: teach verifier to track stack depth

teach verifier to track bpf program stack depth

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  1 +
 kernel/bpf/verifier.c | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6bb38d76faf4..fcc80ca11045 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -171,6 +171,7 @@ struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
+	u32 stack_depth;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_verifier_ops *ops;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 28113d0e8e92..d96f27ff9f6f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
 		}
+
+		if (env->prog->aux->stack_depth < -off)
+			env->prog->aux->stack_depth = -off;
+
 		if (t == BPF_WRITE) {
 			if (!env->allow_ptr_leaks &&
 			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (env->prog->aux->stack_depth < -off)
+		env->prog->aux->stack_depth = -off;
+
 	if (meta && meta->raw_mode) {
 		meta->access_size = access_size;
 		meta->regno = regno;
@@ -3167,7 +3174,8 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
-	verbose("processed %d insns\n", insn_processed);
+	verbose("processed %d insns, stack depth %d\n",
+		insn_processed, env->prog->aux->stack_depth);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 80a58d02559465b0ea403ff91c8bca9a733b1b0d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:30 -0700
Subject: bpf: reconcile bpf_tail_call and stack_depth

The next set of patches will take advantage of stack_depth tracking,
so make sure that the program that does bpf_tail_call() has
stack depth large enough for the callee.
We could have tracked the stack depth of the prog_array owner program
and only allowed insertion of programs with a stack depth less
than the owner's, but that would break existing applications.
Some of them have a trivial root bpf program that only does
multiple bpf_tail_calls, and at init time the prog array is empty.
In the future we may add a flag to do such tracking optionally,
but for now play it simple and safe.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d96f27ff9f6f..14ccb0759fa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3470,6 +3470,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * the program array.
 			 */
 			prog->cb_access = 1;
+			env->prog->aux->stack_depth = MAX_BPF_STACK;
 
 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpeter for every normal
-- 
cgit v1.3-14-g43fede


From b870aa901f4be1d32c13faf9e8f40bf2a8562e19 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:33 -0700
Subject: bpf: use different interpreter depending on required stack size

Adding 16 __bpf_prog_run() interpreters for various stack sizes increases
.text, but not by much compared to the run-time stack savings:

   text	   data	    bss	    dec	    hex	filename
  26350   10328     624   37302    91b6 kernel/bpf/core.o.before_split
  25777   10328     624   36729    8f79 kernel/bpf/core.o.after_split
  26970	  10328	    624	  37922	   9422	kernel/bpf/core.o.now

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index abd410d394bc..774069ca18a7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1218,16 +1218,38 @@ load_byte:
 }
 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
 
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
-{
-	u64 stack[MAX_BPF_STACK / sizeof(u64)];
-	u64 regs[MAX_BPF_REG];
-
-	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
-	return ___bpf_prog_run(regs, insn, stack);
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+	u64 stack[stack_size / sizeof(u64)]; \
+	u64 regs[MAX_BPF_REG]; \
+\
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+	ARG1 = (u64) (unsigned long) ctx; \
+	return ___bpf_prog_run(regs, insn, stack); \
 }
 
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+				      const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
@@ -1275,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-	fp->bpf_func = (void *) __bpf_prog_run;
+	fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
-- 
cgit v1.3-14-g43fede


From fb9a307d11d62749d75b404f15517d73f5d6e148 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 31 May 2017 18:15:59 -0700
Subject: bpf: Allow CGROUP_SKB eBPF program to access sk_buff

This allows cgroup eBPF programs to classify packets based on their
protocol or other detailed information. Currently a process needs the
CAP_NET_ADMIN privilege to attach a cgroup eBPF program, and a
process with CAP_NET_ADMIN can already see all packets on the system,
for example, by creating an iptables rule that causes the packets to
be passed to userspace via NFLOG.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..8acae64df255 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2426,6 +2426,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_CGROUP_SKB:
 		return true;
 	default:
 		return false;
-- 
cgit v1.3-14-g43fede


From 80b7d81912d807f161d55e9c2c9cc81061666f83 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 31 May 2017 18:16:00 -0700
Subject: bpf: Remove the capability check for cgroup skb eBPF program

Currently loading a cgroup skb eBPF program requires the CAP_SYS_ADMIN
capability, while attaching the program to a cgroup only requires the
user to have the CAP_NET_ADMIN privilege. We can skip the capability
check when loading the program, just like for socket filter programs,
to make the capability requirement consistent.

Change since v1:
Change the code style in order to be compliant with checkpatch.pl
preference

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..59da103adb85 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -815,7 +815,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	    attr->kern_version != LINUX_VERSION_CODE)
 		return -EINVAL;
 
-	if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+	    type != BPF_PROG_TYPE_CGROUP_SKB &&
+	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/* plain bpf_prog allocation */
-- 
cgit v1.3-14-g43fede


From e5aeee51f6b4fb22e851105ee6d8ad211c40a214 Mon Sep 17 00:00:00 2001
From: Alexander Levin <alexander.levin@verizon.com>
Date: Sat, 3 Jun 2017 03:39:13 +0000
Subject: perf/core: Don't release cred_guard_mutex if not taken

If we failed to acquire task's cred_guard_mutex we shouldn't proceed
to release it in the error path.

Fixes: a63fbed776c ("perf/tracing/cpuhotplug: Fix locking order")
Signed-off-by: Alexander Levin <alexander.levin@verizon.com>
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: mhiramat@kernel.org
Cc: paulmck@linux.vnet.ibm.com
Cc: bigeasy@linutronix.de
Link: http://lkml.kernel.org/r/20170603033903.12056-1-alexander.levin@verizon.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b97cda4d1777..1f1b8cdaca2d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9878,7 +9878,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (task) {
 		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 		if (err)
-			goto err_cred;
+			goto err_task;
 
 		/*
 		 * Reuse ptrace permission checks for now.
-- 
cgit v1.3-14-g43fede


From 201d7f47f34bd7cb19161d0426f13b141e381f30 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 May 2017 11:58:32 +0200
Subject: genirq: Handle NOAUTOEN interrupt setup proper

If an interrupt is marked NOAUTOEN then request_irq() installs the action,
but does not enable the interrupt via irq_startup().  The interrupt is
enabled via enable_irq() later from the driver. enable_irq() calls
irq_enable().

That means that for interrupts which have an irq_startup() callback this
callback is never invoked. Neither is irq_domain_activate_irq() invoked for
such interrupts.

If an interrupt depends on irq_startup() or irq_domain_activate_irq() then
the enable via irq_enable() is not enough.

Add a status flag IRQD_IRQ_STARTED and use this to select the proper
mechanism in enable_irq(). Use the flag also to avoid pointless calls into
the low level functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: dianders@chromium.org
Cc: jeffy <jeffy.chen@rock-chips.com>
Cc: Brian Norris <briannorris@chromium.org>
Cc: tfiga@chromium.org
Link: http://lkml.kernel.org/r/20170531100212.130986205@linutronix.de
---
 include/linux/irq.h |  6 +++++
 kernel/irq/chip.c   | 76 +++++++++++++++++++++++++++++++++++++----------------
 kernel/irq/manage.c | 12 ++++++---
 3 files changed, 69 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f887351aa80e..94d1ad6ffdd4 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -216,6 +216,7 @@ enum {
 	IRQD_WAKEUP_ARMED		= (1 << 19),
 	IRQD_FORWARDED_TO_VCPU		= (1 << 20),
 	IRQD_AFFINITY_MANAGED		= (1 << 21),
+	IRQD_IRQ_STARTED		= (1 << 22),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -329,6 +330,11 @@ static inline void irqd_clr_activated(struct irq_data *d)
 	__irqd_to_state(d) &= ~IRQD_ACTIVATED;
 }
 
+static inline bool irqd_is_started(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_IRQ_STARTED;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c94da688ee9b..e0051d58c909 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -185,37 +185,64 @@ static void irq_state_set_masked(struct irq_desc *desc)
 	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
 }
 
+static void irq_state_clr_started(struct irq_desc *desc)
+{
+	irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
+}
+
+static void irq_state_set_started(struct irq_desc *desc)
+{
+	irqd_set(&desc->irq_data, IRQD_IRQ_STARTED);
+}
+
 int irq_startup(struct irq_desc *desc, bool resend)
 {
 	int ret = 0;
 
-	irq_state_clr_disabled(desc);
 	desc->depth = 0;
 
-	irq_domain_activate_irq(&desc->irq_data);
-	if (desc->irq_data.chip->irq_startup) {
-		ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
-		irq_state_clr_masked(desc);
-	} else {
+	if (irqd_is_started(&desc->irq_data)) {
 		irq_enable(desc);
+	} else {
+		irq_domain_activate_irq(&desc->irq_data);
+		if (desc->irq_data.chip->irq_startup) {
+			ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+			irq_state_clr_disabled(desc);
+			irq_state_clr_masked(desc);
+		} else {
+			irq_enable(desc);
+		}
+		irq_state_set_started(desc);
 	}
+
 	if (resend)
 		check_irq_resend(desc);
+
 	return ret;
 }
 
+static void __irq_disable(struct irq_desc *desc, bool mask);
+
 void irq_shutdown(struct irq_desc *desc)
 {
-	irq_state_set_disabled(desc);
-	desc->depth = 1;
-	if (desc->irq_data.chip->irq_shutdown)
-		desc->irq_data.chip->irq_shutdown(&desc->irq_data);
-	else if (desc->irq_data.chip->irq_disable)
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
-	else
-		desc->irq_data.chip->irq_mask(&desc->irq_data);
+	if (irqd_is_started(&desc->irq_data)) {
+		desc->depth = 1;
+		if (desc->irq_data.chip->irq_shutdown) {
+			desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+			irq_state_set_disabled(desc);
+			irq_state_set_masked(desc);
+		} else {
+			__irq_disable(desc, true);
+		}
+		irq_state_clr_started(desc);
+	}
+	/*
+	 * This must be called even if the interrupt was never started up,
+	 * because the activation can happen before the interrupt is
+	 * available for request/startup. It has it's own state tracking so
+	 * it's safe to call it unconditionally.
+	 */
 	irq_domain_deactivate_irq(&desc->irq_data);
-	irq_state_set_masked(desc);
 }
 
 void irq_enable(struct irq_desc *desc)
@@ -228,6 +255,17 @@ void irq_enable(struct irq_desc *desc)
 	irq_state_clr_masked(desc);
 }
 
+static void __irq_disable(struct irq_desc *desc, bool mask)
+{
+	irq_state_set_disabled(desc);
+	if (desc->irq_data.chip->irq_disable) {
+		desc->irq_data.chip->irq_disable(&desc->irq_data);
+		irq_state_set_masked(desc);
+	} else if (mask) {
+		mask_irq(desc);
+	}
+}
+
 /**
  * irq_disable - Mark interrupt disabled
  * @desc:	irq descriptor which should be disabled
@@ -250,13 +288,7 @@ void irq_enable(struct irq_desc *desc)
  */
 void irq_disable(struct irq_desc *desc)
 {
-	irq_state_set_disabled(desc);
-	if (desc->irq_data.chip->irq_disable) {
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
-		irq_state_set_masked(desc);
-	} else if (irq_settings_disable_unlazy(desc)) {
-		mask_irq(desc);
-	}
+	__irq_disable(desc, irq_settings_disable_unlazy(desc));
 }
 
 void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 070be980c37a..57056109f176 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -533,9 +533,15 @@ void __enable_irq(struct irq_desc *desc)
 			goto err_out;
 		/* Prevent probing on this irq: */
 		irq_settings_set_noprobe(desc);
-		irq_enable(desc);
-		check_irq_resend(desc);
-		/* fall-through */
+		/*
+		 * Call irq_startup() not irq_enable() here because the
+		 * interrupt might be marked NOAUTOEN. So irq_startup()
+		 * needs to be invoked when it gets enabled the first
+		 * time. If it was already started up, then irq_startup()
+		 * will invoke irq_enable() under the hood.
+		 */
+		irq_startup(desc, true);
+		break;
 	}
 	default:
 		desc->depth--;
-- 
cgit v1.3-14-g43fede


From 04c848d398797a626608ff48804d809ae6687163 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 31 May 2017 11:58:33 +0200
Subject: genirq: Warn when IRQ_NOAUTOEN is used with shared interrupts

Shared interrupts do not go well with disabling auto enable:

1) The sharing interrupt might request it while it's still disabled and
   then wait for interrupts forever.

2) The interrupt might have been requested by the driver sharing the line
   before IRQ_NOAUTOEN has been set. So the driver which expects that
   disabled state after calling request_irq() will not get what it wants.
   Even worse, when it calls enable_irq() later, it will trigger the
   unbalanced enable_irq() warning.

Reported-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: dianders@chromium.org
Cc: jeffy <jeffy.chen@rock-chips.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: tfiga@chromium.org
Link: http://lkml.kernel.org/r/20170531100212.210682135@linutronix.de
---
 kernel/irq/chip.c   |  7 +++++++
 kernel/irq/manage.c | 12 ++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e0051d58c909..bc1331f84fb5 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -935,6 +935,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
 
 	if (!desc)
 		return;
+
+	/*
+	 * Warn when a driver sets the no autoenable flag on an already
+	 * active interrupt.
+	 */
+	WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN));
+
 	irq_settings_clr_and_set(desc, clr, set);
 
 	irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 57056109f176..49c37f1e71c0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1334,11 +1334,19 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
 
-		if (irq_settings_can_autoenable(desc))
+		if (irq_settings_can_autoenable(desc)) {
 			irq_startup(desc, true);
-		else
+		} else {
+			/*
+			 * Shared interrupts do not go well with disabling
+			 * auto enable. The sharing interrupt might request
+			 * it while it's still disabled and then wait for
+			 * interrupts forever.
+			 */
+			WARN_ON_ONCE(new->flags & IRQF_SHARED);
 			/* Undo nested disables: */
 			desc->depth = 1;
+		}
 
 		/* Exclude IRQ from balancing if requested */
 		if (new->flags & IRQF_NOBALANCING) {
-- 
cgit v1.3-14-g43fede


From 31ea70e0308b73a1b862bd17c06efc3cbcfd2016 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Jun 2017 21:01:00 +0200
Subject: posix-timers: Move the do_schedule_next_timer declaration

Having it in asm-generic/siginfo.h doesn't make any sense as it is in no way
architecture specific.  Move it to posix-timers.h instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-ia64@vger.kernel.org
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: sparclinux@vger.kernel.org
Cc: "David S. Miller" <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20170603190102.28866-4-hch@lst.de
---
 include/asm-generic/siginfo.h | 1 -
 include/linux/posix-timers.h  | 3 +++
 kernel/signal.c               | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index a2508a8f9a9c..5a9394763a66 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -15,7 +15,6 @@
 #define __SI_CODE(T,N)	((T) | ((N) & 0xffff))
 
 struct siginfo;
-void do_schedule_next_timer(struct siginfo *info);
 
 extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from);
 
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 34e893a75771..8929f7e8f452 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -7,6 +7,7 @@
 #include <linux/timex.h>
 #include <linux/alarmtimer.h>
 
+struct siginfo;
 
 struct cpu_timer_list {
 	struct list_head entry;
@@ -120,4 +121,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block);
 
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
+void do_schedule_next_timer(struct siginfo *info);
+
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..1f85c843be8e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -39,6 +39,7 @@
 #include <linux/compat.h>
 #include <linux/cn_proc.h>
 #include <linux/compiler.h>
+#include <linux/posix-timers.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
-- 
cgit v1.3-14-g43fede


From 18c700c4e3d0a37c43a2df8b8f740121d4dac645 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:36 +0200
Subject: alarmtimer: Remove pointless config conditional

Having an IS_ENABLED(CONFIG_POSIX_TIMERS) check inside of a
#ifdef CONFIG_POSIX_TIMERS section is pointless.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211655.975218056@linutronix.de
---
 kernel/time/alarmtimer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e645dcc7d4ee..2a8675f9aac5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -520,8 +520,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 
 	spin_lock_irqsave(&ptr->it_lock, flags);
 	if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
-		if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
-		    posix_timer_event(ptr, 0) != 0)
+		if (posix_timer_event(ptr, 0))
 			ptr->it_overrun++;
 	}
 
-- 
cgit v1.3-14-g43fede


From a81129e5a189973abd661704b261f8aad9325407 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:37 +0200
Subject: posix-timers: Remove unused export of posix_timer_event()

Since the removal of the mmtimer driver the export is no longer needed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.052744418@linutronix.de
---
 kernel/time/posix-timers.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c0cccfa3586..44d486590e6e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -442,7 +442,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	/* If we failed to send the signal the timer stops. */
 	return ret > 0;
 }
-EXPORT_SYMBOL_GPL(posix_timer_event);
 
 /*
  * This function gets called when a POSIX.1b interval timer expires.  It
-- 
cgit v1.3-14-g43fede


From 3a06c7ac24f9f24ec059cd77c2dbdf7fbfd0aaaf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:38 +0200
Subject: posix-clocks: Remove interval timer facility and mmap/fasync
 callbacks

The only user of this facility is ptp_clock, which does not implement any of
those functions.

Remove them to prevent accidental users. Especially the interval timer
interfaces are now more or less impossible to implement because the
necessary infrastructure has been confined to the core code. Aside from that,
it's really complex to implement these callbacks according to spec,
as the alarm timer implementation demonstrates. If anything, a nanosleep
callback might be a reasonable extension. For now keep just what ptp_clock
needs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.145036286@linutronix.de
---
 include/linux/posix-clock.h |  22 ---------
 kernel/time/posix-clock.c   | 113 --------------------------------------------
 2 files changed, 135 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 83b22ae9ae12..38d8225510f1 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -42,12 +42,6 @@ struct posix_clock;
  * @clock_gettime:  Read the current time
  * @clock_getres:   Get the clock resolution
  * @clock_settime:  Set the current time value
- * @timer_create:   Create a new timer
- * @timer_delete:   Remove a previously created timer
- * @timer_gettime:  Get remaining time and interval of a timer
- * @timer_settime: Set a timer's initial expiration and interval
- * @fasync:         Optional character device fasync method
- * @mmap:           Optional character device mmap method
  * @open:           Optional character device open method
  * @release:        Optional character device release method
  * @ioctl:          Optional character device ioctl method
@@ -66,28 +60,12 @@ struct posix_clock_operations {
 	int  (*clock_settime)(struct posix_clock *pc,
 			      const struct timespec64 *ts);
 
-	int  (*timer_create) (struct posix_clock *pc, struct k_itimer *kit);
-
-	int  (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit);
-
-	void (*timer_gettime)(struct posix_clock *pc,
-			      struct k_itimer *kit, struct itimerspec64 *tsp);
-
-	int  (*timer_settime)(struct posix_clock *pc,
-			      struct k_itimer *kit, int flags,
-			      struct itimerspec64 *tsp, struct itimerspec64 *old);
 	/*
 	 * Optional character device methods:
 	 */
-	int     (*fasync)  (struct posix_clock *pc,
-			    int fd, struct file *file, int on);
-
 	long    (*ioctl)   (struct posix_clock *pc,
 			    unsigned int cmd, unsigned long arg);
 
-	int     (*mmap)    (struct posix_clock *pc,
-			    struct vm_area_struct *vma);
-
 	int     (*open)    (struct posix_clock *pc, fmode_t f_mode);
 
 	uint    (*poll)    (struct posix_clock *pc,
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 7e453005e078..bd4fb785652f 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -82,38 +82,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
 	return result;
 }
 
-static int posix_clock_fasync(int fd, struct file *fp, int on)
-{
-	struct posix_clock *clk = get_posix_clock(fp);
-	int err = 0;
-
-	if (!clk)
-		return -ENODEV;
-
-	if (clk->ops.fasync)
-		err = clk->ops.fasync(clk, fd, fp, on);
-
-	put_posix_clock(clk);
-
-	return err;
-}
-
-static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
-{
-	struct posix_clock *clk = get_posix_clock(fp);
-	int err = -ENODEV;
-
-	if (!clk)
-		return -ENODEV;
-
-	if (clk->ops.mmap)
-		err = clk->ops.mmap(clk, vma);
-
-	put_posix_clock(clk);
-
-	return err;
-}
-
 static long posix_clock_ioctl(struct file *fp,
 			      unsigned int cmd, unsigned long arg)
 {
@@ -199,8 +167,6 @@ static const struct file_operations posix_clock_file_operations = {
 	.unlocked_ioctl	= posix_clock_ioctl,
 	.open		= posix_clock_open,
 	.release	= posix_clock_release,
-	.fasync		= posix_clock_fasync,
-	.mmap		= posix_clock_mmap,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= posix_clock_compat_ioctl,
 #endif
@@ -359,88 +325,9 @@ out:
 	return err;
 }
 
-static int pc_timer_create(struct k_itimer *kit)
-{
-	clockid_t id = kit->it_clock;
-	struct posix_clock_desc cd;
-	int err;
-
-	err = get_clock_desc(id, &cd);
-	if (err)
-		return err;
-
-	if (cd.clk->ops.timer_create)
-		err = cd.clk->ops.timer_create(cd.clk, kit);
-	else
-		err = -EOPNOTSUPP;
-
-	put_clock_desc(&cd);
-
-	return err;
-}
-
-static int pc_timer_delete(struct k_itimer *kit)
-{
-	clockid_t id = kit->it_clock;
-	struct posix_clock_desc cd;
-	int err;
-
-	err = get_clock_desc(id, &cd);
-	if (err)
-		return err;
-
-	if (cd.clk->ops.timer_delete)
-		err = cd.clk->ops.timer_delete(cd.clk, kit);
-	else
-		err = -EOPNOTSUPP;
-
-	put_clock_desc(&cd);
-
-	return err;
-}
-
-static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
-{
-	clockid_t id = kit->it_clock;
-	struct posix_clock_desc cd;
-
-	if (get_clock_desc(id, &cd))
-		return;
-
-	if (cd.clk->ops.timer_gettime)
-		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
-
-	put_clock_desc(&cd);
-}
-
-static int pc_timer_settime(struct k_itimer *kit, int flags,
-			    struct itimerspec64 *ts, struct itimerspec64 *old)
-{
-	clockid_t id = kit->it_clock;
-	struct posix_clock_desc cd;
-	int err;
-
-	err = get_clock_desc(id, &cd);
-	if (err)
-		return err;
-
-	if (cd.clk->ops.timer_settime)
-		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
-	else
-		err = -EOPNOTSUPP;
-
-	put_clock_desc(&cd);
-
-	return err;
-}
-
 const struct k_clock clock_posix_dynamic = {
 	.clock_getres	= pc_clock_getres,
 	.clock_set	= pc_clock_settime,
 	.clock_get	= pc_clock_gettime,
 	.clock_adj	= pc_clock_adjtime,
-	.timer_create	= pc_timer_create,
-	.timer_set	= pc_timer_settime,
-	.timer_del	= pc_timer_delete,
-	.timer_get	= pc_timer_gettime,
 };
-- 
cgit v1.3-14-g43fede


From 6631fa12c105e326bbe5fb215eb216e86c90d1ba Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:39 +0200
Subject: posix-timers: Avoid gazillions of forward declarations

Move it below the actual implementations as there are new callbacks coming
which would require even more forward declarations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.238209952@linutronix.de
---
 kernel/time/posix-timers.c | 190 +++++++++++++++++++++------------------------
 1 file changed, 89 insertions(+), 101 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 44d486590e6e..b60b655dfbcd 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -69,6 +69,9 @@ static struct kmem_cache *posix_timers_cache;
 static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
 static DEFINE_SPINLOCK(hash_lock);
 
+static const struct k_clock * const posix_clocks[];
+static const struct k_clock *clockid_to_kclock(const clockid_t id);
+
 /*
  * we assume that the new SIGEV_THREAD_ID shares no bits with the other
  * SIGEV values.  Here we put out an error if this assumption fails.
@@ -124,20 +127,6 @@ static DEFINE_SPINLOCK(hash_lock);
  *	    have is CLOCK_REALTIME and its high res counter part, both of
  *	    which we beg off on and pass to do_sys_settimeofday().
  */
-
-/*
- * These ones are defined below.
- */
-static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
-			 struct timespec __user *rmtp);
-static int common_timer_create(struct k_itimer *new_timer);
-static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
-static int common_timer_set(struct k_itimer *, int,
-			    struct itimerspec64 *, struct itimerspec64 *);
-static int common_timer_del(struct k_itimer *timer);
-
-static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
-
 static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
 
 #define lock_timer(tid, flags)						   \
@@ -278,82 +267,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 	return 0;
 }
 
-
-static const struct k_clock clock_realtime = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_clock_realtime_get,
-	.clock_set	= posix_clock_realtime_set,
-	.clock_adj	= posix_clock_realtime_adj,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-};
-
-static const struct k_clock clock_monotonic = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_ktime_get_ts,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-};
-
-static const struct k_clock clock_monotonic_raw = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_monotonic_raw,
-};
-
-static const struct k_clock clock_realtime_coarse = {
-	.clock_getres	= posix_get_coarse_res,
-	.clock_get	= posix_get_realtime_coarse,
-};
-
-static const struct k_clock clock_monotonic_coarse = {
-	.clock_getres	= posix_get_coarse_res,
-	.clock_get	= posix_get_monotonic_coarse,
-};
-
-static const struct k_clock clock_tai = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_tai,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-};
-
-static const struct k_clock clock_boottime = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_boottime,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-};
-
-static const struct k_clock * const posix_clocks[] = {
-	[CLOCK_REALTIME]		= &clock_realtime,
-	[CLOCK_MONOTONIC]		= &clock_monotonic,
-	[CLOCK_PROCESS_CPUTIME_ID]	= &clock_process,
-	[CLOCK_THREAD_CPUTIME_ID]	= &clock_thread,
-	[CLOCK_MONOTONIC_RAW]		= &clock_monotonic_raw,
-	[CLOCK_REALTIME_COARSE]		= &clock_realtime_coarse,
-	[CLOCK_MONOTONIC_COARSE]	= &clock_monotonic_coarse,
-	[CLOCK_BOOTTIME]		= &clock_boottime,
-	[CLOCK_REALTIME_ALARM]		= &alarm_clock,
-	[CLOCK_BOOTTIME_ALARM]		= &alarm_clock,
-	[CLOCK_TAI]			= &clock_tai,
-};
-
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -567,17 +480,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
 	call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
 }
 
-static const struct k_clock *clockid_to_kclock(const clockid_t id)
-{
-	if (id < 0)
-		return (id & CLOCKFD_MASK) == CLOCKFD ?
-			&clock_posix_dynamic : &clock_posix_cpu;
-
-	if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
-		return NULL;
-	return posix_clocks[id];
-}
-
 static int common_timer_create(struct k_itimer *new_timer)
 {
 	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
@@ -1129,3 +1031,89 @@ long clock_nanosleep_restart(struct restart_block *restart_block)
 
 	return kc->nsleep_restart(restart_block);
 }
+
+static const struct k_clock clock_realtime = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_clock_realtime_get,
+	.clock_set	= posix_clock_realtime_set,
+	.clock_adj	= posix_clock_realtime_adj,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_monotonic = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_ktime_get_ts,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_monotonic_raw = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_monotonic_raw,
+};
+
+static const struct k_clock clock_realtime_coarse = {
+	.clock_getres	= posix_get_coarse_res,
+	.clock_get	= posix_get_realtime_coarse,
+};
+
+static const struct k_clock clock_monotonic_coarse = {
+	.clock_getres	= posix_get_coarse_res,
+	.clock_get	= posix_get_monotonic_coarse,
+};
+
+static const struct k_clock clock_tai = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_tai,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock clock_boottime = {
+	.clock_getres	= posix_get_hrtimer_res,
+	.clock_get	= posix_get_boottime,
+	.nsleep		= common_nsleep,
+	.nsleep_restart	= hrtimer_nanosleep_restart,
+	.timer_create	= common_timer_create,
+	.timer_set	= common_timer_set,
+	.timer_get	= common_timer_get,
+	.timer_del	= common_timer_del,
+};
+
+static const struct k_clock * const posix_clocks[] = {
+	[CLOCK_REALTIME]		= &clock_realtime,
+	[CLOCK_MONOTONIC]		= &clock_monotonic,
+	[CLOCK_PROCESS_CPUTIME_ID]	= &clock_process,
+	[CLOCK_THREAD_CPUTIME_ID]	= &clock_thread,
+	[CLOCK_MONOTONIC_RAW]		= &clock_monotonic_raw,
+	[CLOCK_REALTIME_COARSE]		= &clock_realtime_coarse,
+	[CLOCK_MONOTONIC_COARSE]	= &clock_monotonic_coarse,
+	[CLOCK_BOOTTIME]		= &clock_boottime,
+	[CLOCK_REALTIME_ALARM]		= &alarm_clock,
+	[CLOCK_BOOTTIME_ALARM]		= &alarm_clock,
+	[CLOCK_TAI]			= &clock_tai,
+};
+
+static const struct k_clock *clockid_to_kclock(const clockid_t id)
+{
+	if (id < 0)
+		return (id & CLOCKFD_MASK) == CLOCKFD ?
+			&clock_posix_dynamic : &clock_posix_cpu;
+
+	if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id])
+		return NULL;
+	return posix_clocks[id];
+}
-- 
cgit v1.3-14-g43fede


From bab0aae9dcba9466dcc968b8bd21914f8f691631 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:41 +0200
Subject: posix-timers: Move posix-timer internals to core

None of these declarations is required outside of kernel/time. Move them to
an internal header.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170530211656.394803853@linutronix.de
---
 include/linux/posix-timers.h   | 30 ------------------------------
 kernel/time/alarmtimer.c       |  2 ++
 kernel/time/posix-clock.c      |  2 ++
 kernel/time/posix-cpu-timers.c |  2 ++
 kernel/time/posix-timers.c     |  1 +
 kernel/time/posix-timers.h     | 29 +++++++++++++++++++++++++++++
 6 files changed, 36 insertions(+), 30 deletions(-)
 create mode 100644 kernel/time/posix-timers.h

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index e06062c3967b..a372e7e3a396 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -100,36 +100,6 @@ struct k_itimer {
 	} it;
 };
 
-struct k_clock {
-	int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
-	int (*clock_set) (const clockid_t which_clock,
-			  const struct timespec64 *tp);
-	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
-	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
-	int (*timer_create) (struct k_itimer *timer);
-	int (*nsleep) (const clockid_t which_clock, int flags,
-		       struct timespec64 *, struct timespec __user *);
-	long (*nsleep_restart) (struct restart_block *restart_block);
-	int (*timer_set) (struct k_itimer *timr, int flags,
-			  struct itimerspec64 *new_setting,
-			  struct itimerspec64 *old_setting);
-	int (*timer_del) (struct k_itimer *timr);
-#define TIMER_RETRY 1
-	void (*timer_get) (struct k_itimer *timr,
-			   struct itimerspec64 *cur_setting);
-};
-
-extern const struct k_clock clock_posix_cpu;
-extern const struct k_clock clock_posix_dynamic;
-extern const struct k_clock clock_process;
-extern const struct k_clock clock_thread;
-extern const struct k_clock alarm_clock;
-
-/* function to call to trigger timer event */
-int posix_timer_event(struct k_itimer *timr, int si_private);
-
-void posix_cpu_timer_schedule(struct k_itimer *timer);
-
 void run_posix_cpu_timers(struct task_struct *task);
 void posix_cpu_timers_exit(struct task_struct *task);
 void posix_cpu_timers_exit_group(struct task_struct *task);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2a8675f9aac5..36855d675da5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -28,6 +28,8 @@
 #include <linux/workqueue.h>
 #include <linux/freezer.h>
 
+#include "posix-timers.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/alarmtimer.h>
 
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index bd4fb785652f..17cdc554c9fe 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -25,6 +25,8 @@
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
 
+#include "posix-timers.h"
+
 static void delete_clock(struct kref *kref);
 
 /*
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index c99434739fd5..a77a792f2570 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -13,6 +13,8 @@
 #include <linux/tick.h>
 #include <linux/workqueue.h>
 
+#include "posix-timers.h"
+
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
  * tsk->signal->cputime_expires expiration cache if necessary. Needs
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b60b655dfbcd..dee6a0d911d4 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -51,6 +51,7 @@
 #include <linux/hashtable.h>
 
 #include "timekeeping.h"
+#include "posix-timers.h"
 
 /*
  * Management arrays for POSIX timers. Timers are now kept in static hash table
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
new file mode 100644
index 000000000000..ad2dbd29b389
--- /dev/null
+++ b/kernel/time/posix-timers.h
@@ -0,0 +1,29 @@
+#define TIMER_RETRY 1
+
+struct k_clock {
+	int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
+	int (*clock_set) (const clockid_t which_clock,
+			  const struct timespec64 *tp);
+	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
+	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
+	int (*timer_create) (struct k_itimer *timer);
+	int (*nsleep) (const clockid_t which_clock, int flags,
+		       struct timespec64 *, struct timespec __user *);
+	long (*nsleep_restart) (struct restart_block *restart_block);
+	int (*timer_set) (struct k_itimer *timr, int flags,
+			  struct itimerspec64 *new_setting,
+			  struct itimerspec64 *old_setting);
+	int (*timer_del) (struct k_itimer *timr);
+	void (*timer_get) (struct k_itimer *timr,
+			   struct itimerspec64 *cur_setting);
+};
+
+extern const struct k_clock clock_posix_cpu;
+extern const struct k_clock clock_posix_dynamic;
+extern const struct k_clock clock_process;
+extern const struct k_clock clock_thread;
+extern const struct k_clock alarm_clock;
+
+int posix_timer_event(struct k_itimer *timr, int si_private);
+
+void posix_cpu_timer_schedule(struct k_itimer *timer);
-- 
cgit v1.3-14-g43fede


From af888d677a3f4473c198b4720319dd037f398b51 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:42 +0200
Subject: posix-timers: Unify overrun/requeue_pending handling

hrtimer based posix-timers and posix-cpu-timers handle the update of the
rearming and overflow related status fields differently.

Move that update to the common rearming code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.484936964@linutronix.de
---
 kernel/time/posix-cpu-timers.c | 18 +++++++-----------
 kernel/time/posix-timers.c     | 15 ++++++++-------
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a77a792f2570..1683e503179e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -527,6 +527,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		 * ticking in case the signal is deliverable next time.
 		 */
 		posix_cpu_timer_schedule(timer);
+		++timer->it_requeue_pending;
 	}
 }
 
@@ -997,12 +998,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 		cpu_clock_sample(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
 		if (unlikely(p->exit_state))
-			goto out;
+			return;
 
 		/* Protect timer list r/w in arm_timer() */
 		sighand = lock_task_sighand(p, &flags);
 		if (!sighand)
-			goto out;
+			return;
 	} else {
 		/*
 		 * Protect arm_timer() and timer sampling in case of call to
@@ -1015,11 +1016,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			 * We can't even collect a sample any more.
 			 */
 			timer->it.cpu.expires = 0;
-			goto out;
+			return;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
-			unlock_task_sighand(p, &flags);
-			/* Optimizations: if the process is dying, no need to rearm */
-			goto out;
+			/* If the process is dying, no need to rearm */
+			goto unlock;
 		}
 		cpu_timer_sample_group(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
@@ -1031,12 +1031,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	 */
 	WARN_ON_ONCE(!irqs_disabled());
 	arm_timer(timer);
+unlock:
 	unlock_task_sighand(p, &flags);
-
-out:
-	timer->it_overrun_last = timer->it_overrun;
-	timer->it_overrun = -1;
-	++timer->it_requeue_pending;
 }
 
 /**
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index dee6a0d911d4..79a00e0f1ef9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -291,10 +291,6 @@ static void schedule_next_timer(struct k_itimer *timr)
 	timr->it_overrun += (unsigned int) hrtimer_forward(timer,
 						timer->base->get_time(),
 						timr->it.real.interval);
-
-	timr->it_overrun_last = timr->it_overrun;
-	timr->it_overrun = -1;
-	++timr->it_requeue_pending;
 	hrtimer_restart(timer);
 }
 
@@ -315,18 +311,23 @@ void do_schedule_next_timer(struct siginfo *info)
 	unsigned long flags;
 
 	timr = lock_timer(info->si_tid, &flags);
+	if (!timr)
+		return;
 
-	if (timr && timr->it_requeue_pending == info->si_sys_private) {
+	if (timr->it_requeue_pending == info->si_sys_private) {
 		if (timr->it_clock < 0)
 			posix_cpu_timer_schedule(timr);
 		else
 			schedule_next_timer(timr);
 
+		timr->it_overrun_last = timr->it_overrun;
+		timr->it_overrun = -1;
+		++timr->it_requeue_pending;
+
 		info->si_overrun += timr->it_overrun_last;
 	}
 
-	if (timr)
-		unlock_timer(timr, flags);
+	unlock_timer(timr, flags);
 }
 
 int posix_timer_event(struct k_itimer *timr, int si_private)
-- 
cgit v1.3-14-g43fede


From 80105cd0e62ba8a2caf8eebd52f42952c7c04046 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:43 +0200
Subject: posix-timers: Move interval out of the union

Preparatory patch to unify the alarm timer and hrtimer based posix interval
timer handling.

The interval is used as a criterion for rearming decisions, so moving it out
of the clock specific data structures allows later unification.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.563922908@linutronix.de
---
 include/linux/posix-timers.h |  4 ++--
 kernel/time/alarmtimer.c     | 13 ++++++-------
 kernel/time/posix-timers.c   | 20 ++++++++++----------
 3 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index a372e7e3a396..908048f488ae 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -63,6 +63,7 @@ struct cpu_timer_list {
  * @it_requeue_pending:	Indicator that timer waits for being requeued on
  *			signal delivery
  * @it_sigev_notify:	The notify word of sigevent struct for signal delivery
+ * @it_interval:	The interval for periodic timers
  * @it_signal:		Pointer to the creators signal struct
  * @it_pid:		The pid of the process/task targeted by the signal
  * @it_process:		The task to wakeup on clock_nanosleep (CPU timers)
@@ -80,6 +81,7 @@ struct k_itimer {
 	int			it_overrun_last;
 	int			it_requeue_pending;
 	int			it_sigev_notify;
+	ktime_t			it_interval;
 	struct signal_struct	*it_signal;
 	union {
 		struct pid		*it_pid;
@@ -89,12 +91,10 @@ struct k_itimer {
 	union {
 		struct {
 			struct hrtimer	timer;
-			ktime_t		interval;
 		} real;
 		struct cpu_timer_list	cpu;
 		struct {
 			struct alarm	alarmtimer;
-			ktime_t		interval;
 		} alarm;
 		struct rcu_head		rcu;
 	} it;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 36855d675da5..5b8cf4b61854 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -527,9 +527,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 	}
 
 	/* Re-add periodic timers */
-	if (ptr->it.alarm.interval) {
-		ptr->it_overrun += alarm_forward(alarm, now,
-						ptr->it.alarm.interval);
+	if (ptr->it_interval) {
+		ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval);
 		result = ALARMTIMER_RESTART;
 	}
 	spin_unlock_irqrestore(&ptr->it_lock, flags);
@@ -613,7 +612,7 @@ static void alarm_timer_get(struct k_itimer *timr,
 		cur_setting->it_value.tv_nsec = 0;
 	}
 
-	cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
+	cur_setting->it_interval = ktime_to_timespec64(timr->it_interval);
 }
 
 /**
@@ -662,14 +661,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 		return TIMER_RETRY;
 
 	/* start the timer */
-	timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+	timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
 
 	/*
 	 * Rate limit to the tick as a hot fix to prevent DOS. Will be
 	 * mopped up later.
 	 */
-	if (timr->it.alarm.interval < TICK_NSEC)
-		timr->it.alarm.interval = TICK_NSEC;
+	if (timr->it_interval < TICK_NSEC)
+		timr->it_interval = TICK_NSEC;
 
 	exp = timespec64_to_ktime(new_setting->it_value);
 	/* Convert (if necessary) to absolute time */
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 79a00e0f1ef9..7dd992cc7105 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -285,12 +285,12 @@ static void schedule_next_timer(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 
-	if (timr->it.real.interval == 0)
+	if (!timr->it_interval)
 		return;
 
 	timr->it_overrun += (unsigned int) hrtimer_forward(timer,
 						timer->base->get_time(),
-						timr->it.real.interval);
+						timr->it_interval);
 	hrtimer_restart(timer);
 }
 
@@ -375,7 +375,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
-	if (timr->it.real.interval != 0)
+	if (timr->it_interval != 0)
 		si_private = ++timr->it_requeue_pending;
 
 	if (posix_timer_event(timr, si_private)) {
@@ -384,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 		 * we will not get a call back to restart it AND
 		 * it should be restarted.
 		 */
-		if (timr->it.real.interval != 0) {
+		if (timr->it_interval != 0) {
 			ktime_t now = hrtimer_cb_get_time(timer);
 
 			/*
@@ -413,13 +413,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 			{
 				ktime_t kj = NSEC_PER_SEC / HZ;
 
-				if (timr->it.real.interval < kj)
+				if (timr->it_interval < kj)
 					now = ktime_add(now, kj);
 			}
 #endif
 			timr->it_overrun += (unsigned int)
 				hrtimer_forward(timer, now,
-						timr->it.real.interval);
+						timr->it_interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_requeue_pending;
 		}
@@ -631,7 +631,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 
 	memset(cur_setting, 0, sizeof(*cur_setting));
 
-	iv = timr->it.real.interval;
+	iv = timr->it_interval;
 
 	/* interval timer ? */
 	if (iv)
@@ -732,7 +732,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 		common_timer_get(timr, old_setting);
 
 	/* disable the timer */
-	timr->it.real.interval = 0;
+	timr->it_interval = 0;
 	/*
 	 * careful here.  If smp we could be in the "fire" routine which will
 	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
@@ -755,7 +755,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 	hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
 
 	/* Convert interval */
-	timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
+	timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
 
 	/* SIGEV_NONE timers are not queued ! See common_timer_get */
 	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -820,7 +820,7 @@ retry:
 
 static int common_timer_del(struct k_itimer *timer)
 {
-	timer->it.real.interval = 0;
+	timer->it_interval = 0;
 
 	if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
 		return TIMER_RETRY;
-- 
cgit v1.3-14-g43fede


From d97bb75ddd2f38068df01da8abf26df78756253c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:44 +0200
Subject: posix-timers: Store k_clock pointer in k_itimer

Having the k_clock pointer in the k_itimer struct avoids the lookup in
several code paths and makes the next steps of unification of the hrtimer
and alarmtimer based posix timers simpler.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.641222072@linutronix.de
---
 include/linux/posix-timers.h   | 2 ++
 kernel/time/posix-cpu-timers.c | 2 ++
 kernel/time/posix-timers.c     | 7 ++++---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 908048f488ae..8f9cca390cdb 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -56,6 +56,7 @@ struct cpu_timer_list {
  * @list:		List head for binding the timer to signals->posix_timers
  * @t_hash:		Entry in the posix timer hash table
  * @it_lock:		Lock protecting the timer
+ * @kclock:		Pointer to the k_clock struct handling this timer
  * @it_clock:		The posix timer clock id
  * @it_id:		The posix timer id for identifying the timer
  * @it_overrun:		The overrun counter for pending signals
@@ -75,6 +76,7 @@ struct k_itimer {
 	struct list_head	list;
 	struct hlist_node	t_hash;
 	spinlock_t		it_lock;
+	const struct k_clock	*kclock;
 	clockid_t		it_clock;
 	timer_t			it_id;
 	int			it_overrun;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1683e503179e..0123ece6851b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -324,6 +324,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
 	if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
 		return -EINVAL;
 
+	new_timer->kclock = &clock_posix_cpu;
+
 	INIT_LIST_HEAD(&new_timer->it.cpu.entry);
 
 	rcu_read_lock();
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 7dd992cc7105..eb007e19811d 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -519,6 +519,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 	it_id_set = IT_ID_SET;
 	new_timer->it_id = (timer_t) new_timer_id;
 	new_timer->it_clock = which_clock;
+	new_timer->kclock = kc;
 	new_timer->it_overrun = -1;
 
 	if (timer_event_spec) {
@@ -679,7 +680,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	if (!timr)
 		return -EINVAL;
 
-	kc = clockid_to_kclock(timr->it_clock);
+	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_get))
 		ret = -EINVAL;
 	else
@@ -798,7 +799,7 @@ retry:
 	if (!timr)
 		return -EINVAL;
 
-	kc = clockid_to_kclock(timr->it_clock);
+	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
 	else
@@ -829,7 +830,7 @@ static int common_timer_del(struct k_itimer *timer)
 
 static inline int timer_delete_hook(struct k_itimer *timer)
 {
-	const struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+	const struct k_clock *kc = timer->kclock;
 
 	if (WARN_ON_ONCE(!kc || !kc->timer_del))
 		return -EINVAL;
-- 
cgit v1.3-14-g43fede


From 30802945893bc944b5971b408b37511a03b54e5c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:45 +0200
Subject: posix-timers: Add timer_rearm() callback

Add a timer_rearm() callback which is used to make the rescheduling of
posix interval timers independent of the underlying clock implementation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.732632167@linutronix.de
---
 kernel/time/posix-timers.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index ad2dbd29b389..02ffd1b9d230 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -1,21 +1,24 @@
 #define TIMER_RETRY 1
 
 struct k_clock {
-	int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
-	int (*clock_set) (const clockid_t which_clock,
-			  const struct timespec64 *tp);
-	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
-	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
-	int (*timer_create) (struct k_itimer *timer);
-	int (*nsleep) (const clockid_t which_clock, int flags,
-		       struct timespec64 *, struct timespec __user *);
-	long (*nsleep_restart) (struct restart_block *restart_block);
-	int (*timer_set) (struct k_itimer *timr, int flags,
-			  struct itimerspec64 *new_setting,
-			  struct itimerspec64 *old_setting);
-	int (*timer_del) (struct k_itimer *timr);
-	void (*timer_get) (struct k_itimer *timr,
-			   struct itimerspec64 *cur_setting);
+	int	(*clock_getres)(const clockid_t which_clock,
+				struct timespec64 *tp);
+	int	(*clock_set)(const clockid_t which_clock,
+			     const struct timespec64 *tp);
+	int	(*clock_get)(const clockid_t which_clock,
+			     struct timespec64 *tp);
+	int	(*clock_adj)(const clockid_t which_clock, struct timex *tx);
+	int	(*timer_create)(struct k_itimer *timer);
+	int	(*nsleep)(const clockid_t which_clock, int flags,
+			  struct timespec64 *, struct timespec __user *);
+	long	(*nsleep_restart)(struct restart_block *restart_block);
+	int	(*timer_set)(struct k_itimer *timr, int flags,
+			     struct itimerspec64 *new_setting,
+			     struct itimerspec64 *old_setting);
+	int	(*timer_del)(struct k_itimer *timr);
+	void	(*timer_get)(struct k_itimer *timr,
+			     struct itimerspec64 *cur_setting);
+	void	(*timer_rearm)(struct k_itimer *timr);
 };
 
 extern const struct k_clock clock_posix_cpu;
-- 
cgit v1.3-14-g43fede


From 96fe3b072f134e4993f829d599eaa1e0eb5a10e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:46 +0200
Subject: posix-timers: Rename do_schedule_next_timer

The name of that function is a misnomer. Rename it to posixtimer_rearm(),
which carries a proper prefix.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.811362578@linutronix.de
---
 include/linux/posix-timers.h   |  2 +-
 kernel/signal.c                |  2 +-
 kernel/time/posix-cpu-timers.c |  2 +-
 kernel/time/posix-timers.c     | 10 +++++-----
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 8f9cca390cdb..771e5f788c90 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -112,6 +112,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block);
 
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
-void do_schedule_next_timer(struct siginfo *info);
+void posixtimer_rearm(struct siginfo *info);
 
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 1f85c843be8e..d031cd24f8a9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -630,7 +630,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		 * about to disable them again anyway.
 		 */
 		spin_unlock(&tsk->sighand->siglock);
-		do_schedule_next_timer(info);
+		posixtimer_rearm(info);
 		spin_lock(&tsk->sighand->siglock);
 	}
 #endif
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0123ece6851b..1ba576d3151a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -981,7 +981,7 @@ static void check_process_timers(struct task_struct *tsk,
 }
 
 /*
- * This is called from the signal code (via do_schedule_next_timer)
+ * This is called from the signal code (via posixtimer_rearm)
  * when the last timer signal was delivered and we have to reload the timer.
  */
 void posix_cpu_timer_schedule(struct k_itimer *timer)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index eb007e19811d..036b7e70c65c 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -305,7 +305,7 @@ static void schedule_next_timer(struct k_itimer *timr)
  * To protect against the timer going away while the interrupt is queued,
  * we require that the it_requeue_pending flag be set.
  */
-void do_schedule_next_timer(struct siginfo *info)
+void posixtimer_rearm(struct siginfo *info)
 {
 	struct k_itimer *timr;
 	unsigned long flags;
@@ -336,12 +336,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	int shared, ret = -1;
 	/*
 	 * FIXME: if ->sigq is queued we can race with
-	 * dequeue_signal()->do_schedule_next_timer().
+	 * dequeue_signal()->posixtimer_rearm().
 	 *
 	 * If dequeue_signal() sees the "right" value of
-	 * si_sys_private it calls do_schedule_next_timer().
+	 * si_sys_private it calls posixtimer_rearm().
 	 * We re-queue ->sigq and drop ->it_lock().
-	 * do_schedule_next_timer() locks the timer
+	 * posixtimer_rearm() locks the timer
 	 * and re-schedules it while ->sigq is pending.
 	 * Not really bad, but not that we want.
 	 */
@@ -701,7 +701,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
  * accumulating overruns on the next timer.  The overrun is frozen when
  * the signal is delivered, either at the notify time (if the info block
  * is not queued) or at the actual delivery time (as we are informed by
- * the call back to do_schedule_next_timer().  So all we need to do is
+ * the call back to posixtimer_rearm().  So all we need to do is
  * to pick up the frozen overrun.
  */
 SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
-- 
cgit v1.3-14-g43fede


From f37fb0aa4f453c7c785bbcecc4991ac48c5c0e51 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:47 +0200
Subject: posix-timers: Use timer_rearm() callback in posixtimer_rearm()

Use the new timer_rearm() callback to replace the conditional hardcoded
calls into the hrtimer and cpu timer code.

This allows the same logic to be brought to alarmtimers later.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.889661919@linutronix.de
---
 kernel/time/posix-cpu-timers.c |  7 +++++--
 kernel/time/posix-timers.c     | 12 ++++++------
 kernel/time/posix-timers.h     |  2 --
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1ba576d3151a..96c833a61ade 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,8 @@
 
 #include "posix-timers.h"
 
+static void posix_cpu_timer_rearm(struct k_itimer *timer);
+
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
  * tsk->signal->cputime_expires expiration cache if necessary. Needs
@@ -528,7 +530,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
 		 * reload the timer.  But we need to keep it
 		 * ticking in case the signal is deliverable next time.
 		 */
-		posix_cpu_timer_schedule(timer);
+		posix_cpu_timer_rearm(timer);
 		++timer->it_requeue_pending;
 	}
 }
@@ -984,7 +986,7 @@ static void check_process_timers(struct task_struct *tsk,
  * This is called from the signal code (via posixtimer_rearm)
  * when the last timer signal was delivered and we have to reload the timer.
  */
-void posix_cpu_timer_schedule(struct k_itimer *timer)
+static void posix_cpu_timer_rearm(struct k_itimer *timer)
 {
 	struct sighand_struct *sighand;
 	unsigned long flags;
@@ -1431,6 +1433,7 @@ const struct k_clock clock_posix_cpu = {
 	.timer_set	= posix_cpu_timer_set,
 	.timer_del	= posix_cpu_timer_del,
 	.timer_get	= posix_cpu_timer_get,
+	.timer_rearm	= posix_cpu_timer_rearm,
 };
 
 const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 036b7e70c65c..b12582a4b122 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -278,10 +278,9 @@ static __init int init_posix_timers(void)
 					NULL);
 	return 0;
 }
-
 __initcall(init_posix_timers);
 
-static void schedule_next_timer(struct k_itimer *timr)
+static void common_hrtimer_rearm(struct k_itimer *timr)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 
@@ -315,10 +314,7 @@ void posixtimer_rearm(struct siginfo *info)
 		return;
 
 	if (timr->it_requeue_pending == info->si_sys_private) {
-		if (timr->it_clock < 0)
-			posix_cpu_timer_schedule(timr);
-		else
-			schedule_next_timer(timr);
+		timr->kclock->timer_rearm(timr);
 
 		timr->it_overrun_last = timr->it_overrun;
 		timr->it_overrun = -1;
@@ -1046,6 +1042,7 @@ static const struct k_clock clock_realtime = {
 	.timer_set	= common_timer_set,
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
+	.timer_rearm	= common_hrtimer_rearm,
 };
 
 static const struct k_clock clock_monotonic = {
@@ -1057,6 +1054,7 @@ static const struct k_clock clock_monotonic = {
 	.timer_set	= common_timer_set,
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
+	.timer_rearm	= common_hrtimer_rearm,
 };
 
 static const struct k_clock clock_monotonic_raw = {
@@ -1083,6 +1081,7 @@ static const struct k_clock clock_tai = {
 	.timer_set	= common_timer_set,
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
+	.timer_rearm	= common_hrtimer_rearm,
 };
 
 static const struct k_clock clock_boottime = {
@@ -1094,6 +1093,7 @@ static const struct k_clock clock_boottime = {
 	.timer_set	= common_timer_set,
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
+	.timer_rearm	= common_hrtimer_rearm,
 };
 
 static const struct k_clock * const posix_clocks[] = {
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 02ffd1b9d230..1f6f6f9a6a37 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -28,5 +28,3 @@ extern const struct k_clock clock_thread;
 extern const struct k_clock alarm_clock;
 
 int posix_timer_event(struct k_itimer *timr, int si_private);
-
-void posix_cpu_timer_schedule(struct k_itimer *timer);
-- 
cgit v1.3-14-g43fede


From 21e55c1f83880a56360287c00f2b5cd5e5a4a912 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:48 +0200
Subject: posix-timers: Add active flag to k_itimer

Keep track of the activation state of posix timers. This is a preparatory
change for making common_timer_get() usable by both hrtimer and alarm timer
implementations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211656.967783982@linutronix.de
---
 include/linux/posix-timers.h | 2 ++
 kernel/time/posix-timers.c   | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 771e5f788c90..667095dbcd37 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -59,6 +59,7 @@ struct cpu_timer_list {
  * @kclock:		Pointer to the k_clock struct handling this timer
  * @it_clock:		The posix timer clock id
  * @it_id:		The posix timer id for identifying the timer
+ * @it_active:		Marker that timer is active
  * @it_overrun:		The overrun counter for pending signals
  * @it_overrun_last:	The overrun at the time of the last delivered signal
  * @it_requeue_pending:	Indicator that timer waits for being requeued on
@@ -79,6 +80,7 @@ struct k_itimer {
 	const struct k_clock	*kclock;
 	clockid_t		it_clock;
 	timer_t			it_id;
+	int			it_active;
 	int			it_overrun;
 	int			it_overrun_last;
 	int			it_requeue_pending;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b12582a4b122..795215bba73d 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -316,6 +316,7 @@ void posixtimer_rearm(struct siginfo *info)
 	if (timr->it_requeue_pending == info->si_sys_private) {
 		timr->kclock->timer_rearm(timr);
 
+		timr->it_active = 1;
 		timr->it_overrun_last = timr->it_overrun;
 		timr->it_overrun = -1;
 		++timr->it_requeue_pending;
@@ -371,6 +372,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 	timr = container_of(timer, struct k_itimer, it.real.timer);
 	spin_lock_irqsave(&timr->it_lock, flags);
 
+	timr->it_active = 0;
 	if (timr->it_interval != 0)
 		si_private = ++timr->it_requeue_pending;
 
@@ -418,6 +420,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
 						timr->it_interval);
 			ret = HRTIMER_RESTART;
 			++timr->it_requeue_pending;
+			timr->it_active = 1;
 		}
 	}
 
@@ -737,7 +740,8 @@ common_timer_set(struct k_itimer *timr, int flags,
 	if (hrtimer_try_to_cancel(timer) < 0)
 		return TIMER_RETRY;
 
-	timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 
+	timr->it_active = 0;
+	timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
 		~REQUEUE_PENDING;
 	timr->it_overrun_last = 0;
 
@@ -763,6 +767,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 		return 0;
 	}
 
+	timr->it_active = 1;
 	hrtimer_start_expires(timer, mode);
 	return 0;
 }
@@ -821,6 +826,7 @@ static int common_timer_del(struct k_itimer *timer)
 
 	if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
 		return TIMER_RETRY;
+	timer->it_active = 0;
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 63841b2a6969501de183efafc14d20175e402804 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:49 +0200
Subject: posix-timers: Add forward/remaining callbacks

Add two callbacks to kclock which allow using common_timer_get() for both
hrtimer and alarm timer based clocks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.044915536@linutronix.de
---
 kernel/time/posix-timers.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 1f6f6f9a6a37..3bc5b74c342f 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -19,6 +19,8 @@ struct k_clock {
 	void	(*timer_get)(struct k_itimer *timr,
 			     struct itimerspec64 *cur_setting);
 	void	(*timer_rearm)(struct k_itimer *timr);
+	int	(*timer_forward)(struct k_itimer *timr, ktime_t now);
+	ktime_t	(*timer_remaining)(struct k_itimer *timr, ktime_t now);
 };
 
 extern const struct k_clock clock_posix_cpu;
-- 
cgit v1.3-14-g43fede


From 91d57bae08689199c8acc77a8b3b41150cafab1c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:50 +0200
Subject: posix-timers: Make use of forward/remaining callbacks

Replace the hrtimer calls by calls to the new forward/remaining kclock
callbacks and move the hrtimer specific implementation into the
corresponding callback functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.121437232@linutronix.de
---
 kernel/time/posix-timers.c | 64 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 49 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 795215bba73d..48f6c37ae5df 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -607,6 +607,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
 	return NULL;
 }
 
+static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now)
+{
+	struct hrtimer *timer = &timr->it.real.timer;
+
+	return __hrtimer_expires_remaining_adjusted(timer, now);
+}
+
+static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
+{
+	struct hrtimer *timer = &timr->it.real.timer;
+
+	return (int)hrtimer_forward(timer, now, timr->it_interval);
+}
+
 /*
  * Get the time remaining on a POSIX.1b interval timer.  This function
  * is ALWAYS called with spin_lock_irq on the timer, thus it must not
@@ -626,42 +640,54 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
 static void
 common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
+	const struct k_clock *kc = timr->kclock;
 	ktime_t now, remaining, iv;
-	struct hrtimer *timer = &timr->it.real.timer;
+	struct timespec64 ts64;
+	bool sig_none;
 
 	memset(cur_setting, 0, sizeof(*cur_setting));
 
+	sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE;
 	iv = timr->it_interval;
 
 	/* interval timer ? */
-	if (iv)
+	if (iv) {
 		cur_setting->it_interval = ktime_to_timespec64(iv);
-	else if (!hrtimer_active(timer) &&
-		 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
-		return;
+	} else if (!timr->it_active) {
+		/*
+		 * SIGEV_NONE oneshot timers are never queued. Check them
+		 * below.
+		 */
+		if (!sig_none)
+			return;
+	}
 
-	now = timer->base->get_time();
+	/*
+	 * The timespec64 based conversion is suboptimal, but it's not
+	 * worth to implement yet another callback.
+	 */
+	kc->clock_get(timr->it_clock, &ts64);
+	now = timespec64_to_ktime(ts64);
 
 	/*
-	 * When a requeue is pending or this is a SIGEV_NONE
-	 * timer move the expiry time forward by intervals, so
-	 * expiry is > now.
+	 * When a requeue is pending or this is a SIGEV_NONE timer move the
+	 * expiry time forward by intervals, so expiry is > now.
 	 */
-	if (iv && (timr->it_requeue_pending & REQUEUE_PENDING ||
-		   (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
-		timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
+	if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none))
+		timr->it_overrun += kc->timer_forward(timr, now);
 
-	remaining = __hrtimer_expires_remaining_adjusted(timer, now);
+	remaining = kc->timer_remaining(timr, now);
 	/* Return 0 only, when the timer is expired and not pending */
 	if (remaining <= 0) {
 		/*
 		 * A single shot SIGEV_NONE timer must return 0, when
 		 * it is expired !
 		 */
-		if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+		if (!sig_none)
 			cur_setting->it_value.tv_nsec = 1;
-	} else
+	} else {
 		cur_setting->it_value = ktime_to_timespec64(remaining);
+	}
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
@@ -1049,6 +1075,8 @@ static const struct k_clock clock_realtime = {
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
 	.timer_rearm	= common_hrtimer_rearm,
+	.timer_forward	= common_hrtimer_forward,
+	.timer_remaining= common_hrtimer_remaining,
 };
 
 static const struct k_clock clock_monotonic = {
@@ -1061,6 +1089,8 @@ static const struct k_clock clock_monotonic = {
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
 	.timer_rearm	= common_hrtimer_rearm,
+	.timer_forward	= common_hrtimer_forward,
+	.timer_remaining= common_hrtimer_remaining,
 };
 
 static const struct k_clock clock_monotonic_raw = {
@@ -1088,6 +1118,8 @@ static const struct k_clock clock_tai = {
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
 	.timer_rearm	= common_hrtimer_rearm,
+	.timer_forward	= common_hrtimer_forward,
+	.timer_remaining= common_hrtimer_remaining,
 };
 
 static const struct k_clock clock_boottime = {
@@ -1100,6 +1132,8 @@ static const struct k_clock clock_boottime = {
 	.timer_get	= common_timer_get,
 	.timer_del	= common_timer_del,
 	.timer_rearm	= common_hrtimer_rearm,
+	.timer_forward	= common_hrtimer_forward,
+	.timer_remaining= common_hrtimer_remaining,
 };
 
 static const struct k_clock * const posix_clocks[] = {
-- 
cgit v1.3-14-g43fede


From eabdec04385376d560078992710104cc7be2ce1b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:51 +0200
Subject: posix-timers: Zero settings value in common code

Zero out the settings struct in the common code so the callbacks do not
have to do it themselves.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.200870713@linutronix.de
---
 kernel/time/posix-cpu-timers.c | 5 +----
 kernel/time/posix-timers.c     | 3 +--
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96c833a61ade..cb4a4eb44279 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -719,10 +719,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
 	 */
 	itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
 
-	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */
-		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
+	if (!timer->it.cpu.expires)
 		return;
-	}
 
 	/*
 	 * Sample the clock to take the difference with the expiry time.
@@ -746,7 +744,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp
 			 * Call the timer disarmed, nothing else to do.
 			 */
 			timer->it.cpu.expires = 0;
-			itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
 			return;
 		} else {
 			cpu_timer_sample_group(timer->it_clock, p, &now);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 48f6c37ae5df..0332f7a60fd6 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -645,8 +645,6 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	struct timespec64 ts64;
 	bool sig_none;
 
-	memset(cur_setting, 0, sizeof(*cur_setting));
-
 	sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE;
 	iv = timr->it_interval;
 
@@ -705,6 +703,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	if (!timr)
 		return -EINVAL;
 
+	memset(&cur_setting64, 0, sizeof(cur_setting64));
 	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_get))
 		ret = -EINVAL;
-- 
cgit v1.3-14-g43fede


From 525b8ed91671e29e187dfe02d408b11190ccf494 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:52 +0200
Subject: posix-timers: Add cancel/arm callbacks

Add timer_try_to_cancel() and timer_arm() callbacks to kclock which allow
to make common_timer_set() usable by both hrtimer and alarmtimer based
clocks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.278022962@linutronix.de
---
 kernel/time/posix-timers.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 3bc5b74c342f..b0ad77e18886 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -21,6 +21,9 @@ struct k_clock {
 	void	(*timer_rearm)(struct k_itimer *timr);
 	int	(*timer_forward)(struct k_itimer *timr, ktime_t now);
 	ktime_t	(*timer_remaining)(struct k_itimer *timr, ktime_t now);
+	int	(*timer_try_to_cancel)(struct k_itimer *timr);
+	void	(*timer_arm)(struct k_itimer *timr, ktime_t expires,
+			     bool absolute, bool sigev_none);
 };
 
 extern const struct k_clock clock_posix_cpu;
-- 
cgit v1.3-14-g43fede


From eae1c4ae275fe3e024454c012a548ee0d700f54c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:53 +0200
Subject: posix-timers: Make use of cancel/arm callbacks

Replace the hrtimer calls by calls to the new try_to_cancel()/arm() kclock
callbacks and move the hrtimer specific implementation into the
corresponding callback functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.355396667@linutronix.de
---
 kernel/time/posix-timers.c | 181 +++++++++++++++++++++++++--------------------
 1 file changed, 100 insertions(+), 81 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0332f7a60fd6..8acc9ee2c2d6 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -744,25 +744,49 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 	return overrun;
 }
 
+static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
+			       bool absolute, bool sigev_none)
+{
+	struct hrtimer *timer = &timr->it.real.timer;
+	enum hrtimer_mode mode;
+
+	mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
+	timr->it.real.timer.function = posix_timer_fn;
+
+	if (!absolute)
+		expires = ktime_add_safe(expires, timer->base->get_time());
+	hrtimer_set_expires(timer, expires);
+
+	if (!sigev_none)
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+}
+
+static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
+{
+	return hrtimer_try_to_cancel(&timr->it.real.timer);
+}
+
 /* Set a POSIX.1b interval timer. */
-/* timr->it_lock is taken. */
 static int
 common_timer_set(struct k_itimer *timr, int flags,
-		 struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
+		 struct itimerspec64 *new_setting,
+		 struct itimerspec64 *old_setting)
 {
-	struct hrtimer *timer = &timr->it.real.timer;
-	enum hrtimer_mode mode;
+	const struct k_clock *kc = timr->kclock;
+	bool sigev_none;
+	ktime_t expires;
 
 	if (old_setting)
 		common_timer_get(timr, old_setting);
 
-	/* disable the timer */
+	/* Prevent rearming by clearing the interval */
 	timr->it_interval = 0;
 	/*
-	 * careful here.  If smp we could be in the "fire" routine which will
-	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
+	 * Careful here. On SMP systems the timer expiry function could be
+	 * active and spinning on timr->it_lock.
 	 */
-	if (hrtimer_try_to_cancel(timer) < 0)
+	if (kc->timer_try_to_cancel(timr) < 0)
 		return TIMER_RETRY;
 
 	timr->it_active = 0;
@@ -770,30 +794,16 @@ common_timer_set(struct k_itimer *timr, int flags,
 		~REQUEUE_PENDING;
 	timr->it_overrun_last = 0;
 
-	/* switch off the timer when it_value is zero */
+	/* Switch off the timer when it_value is zero */
 	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
 		return 0;
 
-	mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
-	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
-	timr->it.real.timer.function = posix_timer_fn;
-
-	hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
-
-	/* Convert interval */
 	timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
+	expires = timespec64_to_ktime(new_setting->it_value);
+	sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
 
-	/* SIGEV_NONE timers are not queued ! See common_timer_get */
-	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
-		/* Setup correct expiry time for relative timers */
-		if (mode == HRTIMER_MODE_REL) {
-			hrtimer_add_expires(timer, timer->base->get_time());
-		}
-		return 0;
-	}
-
-	timr->it_active = 1;
-	hrtimer_start_expires(timer, mode);
+	kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
+	timr->it_active = !sigev_none;
 	return 0;
 }
 
@@ -847,9 +857,10 @@ retry:
 
 static int common_timer_del(struct k_itimer *timer)
 {
-	timer->it_interval = 0;
+	const struct k_clock *kc = timer->kclock;
 
-	if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0)
+	timer->it_interval = 0;
+	if (kc->timer_try_to_cancel(timer) < 0)
 		return TIMER_RETRY;
 	timer->it_active = 0;
 	return 0;
@@ -1063,76 +1074,84 @@ long clock_nanosleep_restart(struct restart_block *restart_block)
 }
 
 static const struct k_clock clock_realtime = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_clock_realtime_get,
-	.clock_set	= posix_clock_realtime_set,
-	.clock_adj	= posix_clock_realtime_adj,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-	.timer_rearm	= common_hrtimer_rearm,
-	.timer_forward	= common_hrtimer_forward,
-	.timer_remaining= common_hrtimer_remaining,
+	.clock_getres		= posix_get_hrtimer_res,
+	.clock_get		= posix_clock_realtime_get,
+	.clock_set		= posix_clock_realtime_set,
+	.clock_adj		= posix_clock_realtime_adj,
+	.nsleep			= common_nsleep,
+	.nsleep_restart		= hrtimer_nanosleep_restart,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock clock_monotonic = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_ktime_get_ts,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-	.timer_rearm	= common_hrtimer_rearm,
-	.timer_forward	= common_hrtimer_forward,
-	.timer_remaining= common_hrtimer_remaining,
+	.clock_getres		= posix_get_hrtimer_res,
+	.clock_get		= posix_ktime_get_ts,
+	.nsleep			= common_nsleep,
+	.nsleep_restart		= hrtimer_nanosleep_restart,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock clock_monotonic_raw = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_monotonic_raw,
+	.clock_getres		= posix_get_hrtimer_res,
+	.clock_get		= posix_get_monotonic_raw,
 };
 
 static const struct k_clock clock_realtime_coarse = {
-	.clock_getres	= posix_get_coarse_res,
-	.clock_get	= posix_get_realtime_coarse,
+	.clock_getres		= posix_get_coarse_res,
+	.clock_get		= posix_get_realtime_coarse,
 };
 
 static const struct k_clock clock_monotonic_coarse = {
-	.clock_getres	= posix_get_coarse_res,
-	.clock_get	= posix_get_monotonic_coarse,
+	.clock_getres		= posix_get_coarse_res,
+	.clock_get		= posix_get_monotonic_coarse,
 };
 
 static const struct k_clock clock_tai = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_tai,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-	.timer_rearm	= common_hrtimer_rearm,
-	.timer_forward	= common_hrtimer_forward,
-	.timer_remaining= common_hrtimer_remaining,
+	.clock_getres		= posix_get_hrtimer_res,
+	.clock_get		= posix_get_tai,
+	.nsleep			= common_nsleep,
+	.nsleep_restart		= hrtimer_nanosleep_restart,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock clock_boottime = {
-	.clock_getres	= posix_get_hrtimer_res,
-	.clock_get	= posix_get_boottime,
-	.nsleep		= common_nsleep,
-	.nsleep_restart	= hrtimer_nanosleep_restart,
-	.timer_create	= common_timer_create,
-	.timer_set	= common_timer_set,
-	.timer_get	= common_timer_get,
-	.timer_del	= common_timer_del,
-	.timer_rearm	= common_hrtimer_rearm,
-	.timer_forward	= common_hrtimer_forward,
-	.timer_remaining= common_hrtimer_remaining,
+	.clock_getres		= posix_get_hrtimer_res,
+	.clock_get		= posix_get_boottime,
+	.nsleep			= common_nsleep,
+	.nsleep_restart		= hrtimer_nanosleep_restart,
+	.timer_create		= common_timer_create,
+	.timer_set		= common_timer_set,
+	.timer_get		= common_timer_get,
+	.timer_del		= common_timer_del,
+	.timer_rearm		= common_hrtimer_rearm,
+	.timer_forward		= common_hrtimer_forward,
+	.timer_remaining	= common_hrtimer_remaining,
+	.timer_try_to_cancel	= common_hrtimer_try_to_cancel,
+	.timer_arm		= common_hrtimer_arm,
 };
 
 static const struct k_clock * const posix_clocks[] = {
-- 
cgit v1.3-14-g43fede


From b3db80f77a95a45dbb2136f7b2a364dc797ea914 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:54 +0200
Subject: alarmtimer: Implement timer_rearm() callback

Preparatory change to utilize the common posix timer mechanisms.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.434598989@linutronix.de
---
 kernel/time/alarmtimer.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 5b8cf4b61854..be85e3cbfe1b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -536,6 +536,18 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 	return result;
 }
 
+/**
+ * alarm_timer_rearm - Posix timer callback for rearming timer
+ * @timr:	Pointer to the posixtimer data struct
+ */
+static void alarm_timer_rearm(struct k_itimer *timr)
+{
+	struct alarm *alarm = &timr->it.alarm.alarmtimer;
+
+	timr->it_overrun += alarm_forward_now(alarm, timr->it_interval);
+	alarm_start(alarm, alarm->node.expires);
+}
+
 /**
  * alarm_clock_getres - posix getres interface
  * @which_clock: clockid
@@ -594,7 +606,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 
 /**
  * alarm_timer_get - posix timer_get interface
- * @new_timer: k_itimer pointer
+ * @timr: k_itimer pointer
  * @cur_setting: itimerspec data to fill
  *
  * Copies out the current itimerspec data
@@ -863,6 +875,7 @@ const struct k_clock alarm_clock = {
 	.timer_set	= alarm_timer_set,
 	.timer_del	= alarm_timer_del,
 	.timer_get	= alarm_timer_get,
+	.timer_rearm	= alarm_timer_rearm,
 	.nsleep		= alarm_timer_nsleep,
 };
 #endif /* CONFIG_POSIX_TIMERS */
-- 
cgit v1.3-14-g43fede


From e7561f1633ac735df48c55ad09a2530e9ab9fab1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:55 +0200
Subject: alarmtimer: Implement forward callback

Preparatory change to utilize the common posix timer mechanisms.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.513694229@linutronix.de
---
 kernel/time/alarmtimer.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index be85e3cbfe1b..6082cf1af876 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -548,6 +548,18 @@ static void alarm_timer_rearm(struct k_itimer *timr)
 	alarm_start(alarm, alarm->node.expires);
 }
 
+/**
+ * alarm_timer_forward - Posix timer callback for forwarding timer
+ * @timr:	Pointer to the posixtimer data struct
+ * @now:	Current time to forward the timer against
+ */
+static int alarm_timer_forward(struct k_itimer *timr, ktime_t now)
+{
+	struct alarm *alarm = &timr->it.alarm.alarmtimer;
+
+	return (int) alarm_forward(alarm, timr->it_interval, now);
+}
+
 /**
  * alarm_clock_getres - posix getres interface
  * @which_clock: clockid
@@ -876,6 +888,7 @@ const struct k_clock alarm_clock = {
 	.timer_del	= alarm_timer_del,
 	.timer_get	= alarm_timer_get,
 	.timer_rearm	= alarm_timer_rearm,
+	.timer_forward	= alarm_timer_forward,
 	.nsleep		= alarm_timer_nsleep,
 };
 #endif /* CONFIG_POSIX_TIMERS */
-- 
cgit v1.3-14-g43fede


From d653d8457c76da11f047af1f66256ac9b8421b69 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:56 +0200
Subject: alarmtimer: Implement remaining callback

Preparatory change to utilize the common posix timer mechanisms.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.592676753@linutronix.de
---
 kernel/time/alarmtimer.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 6082cf1af876..02ddc40f19fe 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -560,6 +560,18 @@ static int alarm_timer_forward(struct k_itimer *timr, ktime_t now)
 	return (int) alarm_forward(alarm, timr->it_interval, now);
 }
 
+/**
+ * alarm_timer_remaining - Posix timer callback to retrieve remaining time
+ * @timr:	Pointer to the posixtimer data struct
+ * @now:	Current time to calculate against
+ */
+static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
+{
+	struct alarm *alarm = &timr->it.alarm.alarmtimer;
+
+	return ktime_sub(now, alarm->node.expires);
+}
+
 /**
  * alarm_clock_getres - posix getres interface
  * @which_clock: clockid
@@ -881,15 +893,16 @@ out:
 }
 
 const struct k_clock alarm_clock = {
-	.clock_getres	= alarm_clock_getres,
-	.clock_get	= alarm_clock_get,
-	.timer_create	= alarm_timer_create,
-	.timer_set	= alarm_timer_set,
-	.timer_del	= alarm_timer_del,
-	.timer_get	= alarm_timer_get,
-	.timer_rearm	= alarm_timer_rearm,
-	.timer_forward	= alarm_timer_forward,
-	.nsleep		= alarm_timer_nsleep,
+	.clock_getres		= alarm_clock_getres,
+	.clock_get		= alarm_clock_get,
+	.timer_create		= alarm_timer_create,
+	.timer_set		= alarm_timer_set,
+	.timer_del		= alarm_timer_del,
+	.timer_get		= alarm_timer_get,
+	.timer_rearm		= alarm_timer_rearm,
+	.timer_forward		= alarm_timer_forward,
+	.timer_remaining	= alarm_timer_remaining,
+	.nsleep			= alarm_timer_nsleep,
 };
 #endif /* CONFIG_POSIX_TIMERS */
 
-- 
cgit v1.3-14-g43fede


From e344c9e76bc6af997926171bfd90d25bbae0a2c5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:57 +0200
Subject: alarmtimer: Implement try_to_cancel callback

Preparatory change to utilize the common posix timer mechanisms.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.670026824@linutronix.de
---
 kernel/time/alarmtimer.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 02ddc40f19fe..374bd855a488 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -572,6 +572,15 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
 	return ktime_sub(now, alarm->node.expires);
 }
 
+/**
+ * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer
+ * @timr:	Pointer to the posixtimer data struct
+ */
+static int alarm_timer_try_to_cancel(struct k_itimer *timr)
+{
+	return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
+}
+
 /**
  * alarm_clock_getres - posix getres interface
  * @which_clock: clockid
@@ -902,6 +911,7 @@ const struct k_clock alarm_clock = {
 	.timer_rearm		= alarm_timer_rearm,
 	.timer_forward		= alarm_timer_forward,
 	.timer_remaining	= alarm_timer_remaining,
+	.timer_try_to_cancel	= alarm_timer_try_to_cancel,
 	.nsleep			= alarm_timer_nsleep,
 };
 #endif /* CONFIG_POSIX_TIMERS */
-- 
cgit v1.3-14-g43fede


From b3bf6f369d50ece9dec6338741648005d95c19e4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:58 +0200
Subject: alarmtimer: Implement arm callback

Preparatory change to utilize the common posix timer mechanisms.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.747567162@linutronix.de
---
 kernel/time/alarmtimer.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 374bd855a488..c618a44bb054 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -581,6 +581,27 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr)
 	return alarm_try_to_cancel(&timr->it.alarm.alarmtimer);
 }
 
+/**
+ * alarm_timer_arm - Posix timer callback to arm a timer
+ * @timr:	Pointer to the posixtimer data struct
+ * @expires:	The new expiry time
+ * @absolute:	Expiry value is absolute time
+ * @sigev_none:	Posix timer does not deliver signals
+ */
+static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires,
+			    bool absolute, bool sigev_none)
+{
+	struct alarm *alarm = &timr->it.alarm.alarmtimer;
+	struct alarm_base *base = &alarm_bases[alarm->type];
+
+	if (!absolute)
+		expires = ktime_add_safe(expires, base->gettime());
+	if (sigev_none)
+		alarm->node.expires = expires;
+	else
+		alarm_start(&timr->it.alarm.alarmtimer, expires);
+}
+
 /**
  * alarm_clock_getres - posix getres interface
  * @which_clock: clockid
@@ -908,6 +929,7 @@ const struct k_clock alarm_clock = {
 	.timer_set		= alarm_timer_set,
 	.timer_del		= alarm_timer_del,
 	.timer_get		= alarm_timer_get,
+	.timer_arm		= alarm_timer_arm,
 	.timer_rearm		= alarm_timer_rearm,
 	.timer_forward		= alarm_timer_forward,
 	.timer_remaining	= alarm_timer_remaining,
-- 
cgit v1.3-14-g43fede


From f2c45807d3992fe0f173f34af9c347d907c31686 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 30 May 2017 23:15:59 +0200
Subject: alarmtimer: Switch over to generic set/get/rearm routine

All required callbacks are in place. Switch the alarm timer based posix
interval timer callbacks to the common implementation and remove the
incorrect private implementation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170530211657.825471962@linutronix.de
---
 kernel/time/alarmtimer.c   | 121 +++++++--------------------------------------
 kernel/time/posix-timers.c |  12 ++---
 kernel/time/posix-timers.h |   6 +++
 3 files changed, 29 insertions(+), 110 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c618a44bb054..d8a7a7e214de 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -515,20 +515,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
 static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
 							ktime_t now)
 {
-	unsigned long flags;
 	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
-						it.alarm.alarmtimer);
+					    it.alarm.alarmtimer);
 	enum alarmtimer_restart result = ALARMTIMER_NORESTART;
+	unsigned long flags;
+	int si_private = 0;
 
 	spin_lock_irqsave(&ptr->it_lock, flags);
-	if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
-		if (posix_timer_event(ptr, 0))
-			ptr->it_overrun++;
-	}
 
-	/* Re-add periodic timers */
-	if (ptr->it_interval) {
-		ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval);
+	ptr->it_active = 0;
+	if (ptr->it_interval)
+		si_private = ++ptr->it_requeue_pending;
+
+	if (posix_timer_event(ptr, si_private) && ptr->it_interval) {
+		/*
+		 * Handle ignored signals and rearm the timer. This will go
+		 * away once we handle ignored signals proper.
+		 */
+		ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval);
+		++ptr->it_requeue_pending;
+		ptr->it_active = 1;
 		result = ALARMTIMER_RESTART;
 	}
 	spin_unlock_irqrestore(&ptr->it_lock, flags);
@@ -658,97 +664,6 @@ static int alarm_timer_create(struct k_itimer *new_timer)
 	return 0;
 }
 
-/**
- * alarm_timer_get - posix timer_get interface
- * @timr: k_itimer pointer
- * @cur_setting: itimerspec data to fill
- *
- * Copies out the current itimerspec data
- */
-static void alarm_timer_get(struct k_itimer *timr,
-			    struct itimerspec64 *cur_setting)
-{
-	ktime_t relative_expiry_time =
-		alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
-
-	if (ktime_to_ns(relative_expiry_time) > 0) {
-		cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
-	} else {
-		cur_setting->it_value.tv_sec = 0;
-		cur_setting->it_value.tv_nsec = 0;
-	}
-
-	cur_setting->it_interval = ktime_to_timespec64(timr->it_interval);
-}
-
-/**
- * alarm_timer_del - posix timer_del interface
- * @timr: k_itimer pointer to be deleted
- *
- * Cancels any programmed alarms for the given timer.
- */
-static int alarm_timer_del(struct k_itimer *timr)
-{
-	if (!rtcdev)
-		return -ENOTSUPP;
-
-	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
-		return TIMER_RETRY;
-
-	return 0;
-}
-
-/**
- * alarm_timer_set - posix timer_set interface
- * @timr: k_itimer pointer to be deleted
- * @flags: timer flags
- * @new_setting: itimerspec to be used
- * @old_setting: itimerspec being replaced
- *
- * Sets the timer to new_setting, and starts the timer.
- */
-static int alarm_timer_set(struct k_itimer *timr, int flags,
-			   struct itimerspec64 *new_setting,
-			   struct itimerspec64 *old_setting)
-{
-	ktime_t exp;
-
-	if (!rtcdev)
-		return -ENOTSUPP;
-
-	if (flags & ~TIMER_ABSTIME)
-		return -EINVAL;
-
-	if (old_setting)
-		alarm_timer_get(timr, old_setting);
-
-	/* If the timer was already set, cancel it */
-	if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0)
-		return TIMER_RETRY;
-
-	/* start the timer */
-	timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
-
-	/*
-	 * Rate limit to the tick as a hot fix to prevent DOS. Will be
-	 * mopped up later.
-	 */
-	if (timr->it_interval < TICK_NSEC)
-		timr->it_interval = TICK_NSEC;
-
-	exp = timespec64_to_ktime(new_setting->it_value);
-	/* Convert (if necessary) to absolute time */
-	if (flags != TIMER_ABSTIME) {
-		ktime_t now;
-
-		now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
-		exp = ktime_add_safe(now, exp);
-	}
-
-	alarm_start(&timr->it.alarm.alarmtimer, exp);
-	return 0;
-}
-
 /**
  * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
  * @alarm: ptr to alarm that fired
@@ -926,9 +841,9 @@ const struct k_clock alarm_clock = {
 	.clock_getres		= alarm_clock_getres,
 	.clock_get		= alarm_clock_get,
 	.timer_create		= alarm_timer_create,
-	.timer_set		= alarm_timer_set,
-	.timer_del		= alarm_timer_del,
-	.timer_get		= alarm_timer_get,
+	.timer_set		= common_timer_set,
+	.timer_del		= common_timer_del,
+	.timer_get		= common_timer_get,
 	.timer_arm		= alarm_timer_arm,
 	.timer_rearm		= alarm_timer_rearm,
 	.timer_forward		= alarm_timer_forward,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 8acc9ee2c2d6..6e7a70b1bf37 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -637,8 +637,7 @@ static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now)
  * it is the same as a requeue pending timer WRT to what we should
  * report.
  */
-static void
-common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
+void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
 	const struct k_clock *kc = timr->kclock;
 	ktime_t now, remaining, iv;
@@ -768,10 +767,9 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr)
 }
 
 /* Set a POSIX.1b interval timer. */
-static int
-common_timer_set(struct k_itimer *timr, int flags,
-		 struct itimerspec64 *new_setting,
-		 struct itimerspec64 *old_setting)
+int common_timer_set(struct k_itimer *timr, int flags,
+		     struct itimerspec64 *new_setting,
+		     struct itimerspec64 *old_setting)
 {
 	const struct k_clock *kc = timr->kclock;
 	bool sigev_none;
@@ -855,7 +853,7 @@ retry:
 	return error;
 }
 
-static int common_timer_del(struct k_itimer *timer)
+int common_timer_del(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
 
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index b0ad77e18886..b086f5ba2f5b 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -33,3 +33,9 @@ extern const struct k_clock clock_thread;
 extern const struct k_clock alarm_clock;
 
 int posix_timer_event(struct k_itimer *timr, int si_private);
+
+void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting);
+int common_timer_set(struct k_itimer *timr, int flags,
+		     struct itimerspec64 *new_setting,
+		     struct itimerspec64 *old_setting);
+int common_timer_del(struct k_itimer *timer);
-- 
cgit v1.3-14-g43fede


From f91840a32deef5cb1bf73338bc5010f843b01426 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 2 Jun 2017 21:03:52 -0700
Subject: perf, bpf: Add BPF support to all perf_event types

Allow BPF_PROG_TYPE_PERF_EVENT program types to attach to all
perf_event types, including HW_CACHE, RAW, and dynamic pmu events.
Only tracepoint/kprobe events are treated differently: they require the
BPF_PROG_TYPE_TRACEPOINT/BPF_PROG_TYPE_KPROBE program types respectively.

Also add support for reading all event counters using
bpf_perf_event_read() helper.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h |  7 +++++--
 kernel/bpf/arraymap.c      | 28 +++++++--------------------
 kernel/events/core.c       | 47 +++++++++++++++++++++++++++-------------------
 kernel/trace/bpf_trace.c   | 22 ++++++++--------------
 4 files changed, 48 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24a635887f28..8fc5f0fada5e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -896,7 +896,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
-extern u64 perf_event_read_local(struct perf_event *event);
+int perf_event_read_local(struct perf_event *event, u64 *value);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1301,7 +1301,10 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline u64 perf_event_read_local(struct perf_event *event)	{ return -EINVAL; }
+static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+{
+	return -EINVAL;
+}
 static inline void perf_event_print_debug(void)				{ }
 static inline int perf_event_task_disable(void)				{ return -EINVAL; }
 static inline int perf_event_task_enable(void)				{ return -EINVAL; }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..ecb43542246e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -452,38 +452,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 					 struct file *map_file, int fd)
 {
-	const struct perf_event_attr *attr;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 	struct file *perf_file;
+	u64 value;
 
 	perf_file = perf_event_get(fd);
 	if (IS_ERR(perf_file))
 		return perf_file;
 
+	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	ee = ERR_PTR(-EINVAL);
-
-	attr = perf_event_attrs(event);
-	if (IS_ERR(attr) || attr->inherit)
+	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
 		goto err_out;
 
-	switch (attr->type) {
-	case PERF_TYPE_SOFTWARE:
-		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
-			goto err_out;
-		/* fall-through */
-	case PERF_TYPE_RAW:
-	case PERF_TYPE_HARDWARE:
-		ee = bpf_event_entry_gen(perf_file, map_file);
-		if (ee)
-			return ee;
-		ee = ERR_PTR(-ENOMEM);
-		/* fall-through */
-	default:
-		break;
-	}
-
+	ee = bpf_event_entry_gen(perf_file, map_file);
+	if (ee)
+		return ee;
+	ee = ERR_PTR(-ENOMEM);
 err_out:
 	fput(perf_file);
 	return ee;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6e75a5c9412d..51e40e4876c0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3636,10 +3636,10 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
 {
 	unsigned long flags;
-	u64 val;
+	int ret = 0;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3647,25 +3647,37 @@ u64 perf_event_read_local(struct perf_event *event)
 	 */
 	local_irq_save(flags);
 
-	/* If this is a per-task event, it must be for current */
-	WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
-		     event->hw.target != current);
-
-	/* If this is a per-CPU event, it must be for this CPU */
-	WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
-		     event->cpu != smp_processor_id());
-
 	/*
 	 * It must not be an event with inherit set, we cannot read
 	 * all child counters from atomic context.
 	 */
-	WARN_ON_ONCE(event->attr.inherit);
+	if (event->attr.inherit) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	/*
 	 * It must not have a pmu::count method, those are not
 	 * NMI safe.
 	 */
-	WARN_ON_ONCE(event->pmu->count);
+	if (event->pmu->count) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* If this is a per-task event, it must be for current */
+	if ((event->attach_state & PERF_ATTACH_TASK) &&
+	    event->hw.target != current) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If this is a per-CPU event, it must be for this CPU */
+	if (!(event->attach_state & PERF_ATTACH_TASK) &&
+	    event->cpu != smp_processor_id()) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
@@ -3675,10 +3687,11 @@ u64 perf_event_read_local(struct perf_event *event)
 	if (event->oncpu == smp_processor_id())
 		event->pmu->read(event);
 
-	val = local64_read(&event->count);
+	*value = local64_read(&event->count);
+out:
 	local_irq_restore(flags);
 
-	return val;
+	return ret;
 }
 
 static int perf_event_read(struct perf_event *event, bool group)
@@ -8037,12 +8050,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;
 
-	if (event->attr.type == PERF_TYPE_HARDWARE ||
-	    event->attr.type == PERF_TYPE_SOFTWARE)
-		return perf_event_set_bpf_handler(event, prog_fd);
-
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
-		return -EINVAL;
+		return perf_event_set_bpf_handler(event, prog_fd);
 
 	if (event->tp_event->prog)
 		return -EEXIST;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..08eb072430b9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -234,7 +234,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	struct perf_event *event;
+	u64 value = 0;
+	int err;
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -247,21 +248,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	event = ee->event;
-	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
-		     event->attr.type != PERF_TYPE_RAW))
-		return -EINVAL;
-
-	/* make sure event is local and doesn't have pmu::count */
-	if (unlikely(event->oncpu != cpu || event->pmu->count))
-		return -EINVAL;
-
+	err = perf_event_read_local(ee->event, &value);
 	/*
-	 * we don't know if the function is run successfully by the
-	 * return value. It can be judged in other places, such as
-	 * eBPF programs.
+	 * this api is ugly since we miss [-22..-2] range of valid
+	 * counter values, but that's uapi
 	 */
-	return perf_event_read_local(event);
+	if (err)
+		return err;
+	return value;
 }
 
 static const struct bpf_func_proto bpf_perf_event_read_proto = {
-- 
cgit v1.3-14-g43fede


From f99973e18b65ca1fff8c81532e3132b8f622aea8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 1 Jun 2017 16:47:09 +0200
Subject: nohz: Fix buggy tick delay on IRQ storms

When the tick is stopped and we reach the dynticks evaluation code on
IRQ exit, we perform a soft tick restart if we observe an expired timer
from there. It means we program the nearest possible tick but we stay in
dynticks mode (ts->tick_stopped = 1) because we may need to stop the tick
again after that expired timer is handled.

Now this solution works most of the time but if we suffer an IRQ storm
and those interrupts trigger faster than the hardware clockevents min
delay, our tick won't fire until that IRQ storm is finished.

Here is the problem: on IRQ exit we reprog the timer to at least
NOW() + min_clockevents_delay. Another IRQ fires before the tick so we
reschedule again to NOW() + min_clockevents_delay, etc... The tick
is eternally rescheduled min_clockevents_delay ahead.

A solution is to simply remove this soft tick restart. After all
the normal dynticks evaluation path can handle 0 delay just fine. And
by doing that we benefit from the optimization branch which avoids
clock reprogramming if the clockevents deadline hasn't changed since
the last reprogramming. This fixes our issue because we no longer do
repetitive clock reprogramming that always adds the hardware min delay.

As a side effect it should even optimize the 0 delay path in general.

Reported-and-tested-by: Octavian Purdila <octavian.purdila@nxp.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1496328429-13317-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e3043873fcdc..9d31f1e0067b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -713,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 */
 	delta = next_tick - basemono;
 	if (delta <= (u64)TICK_NSEC) {
-		tick = 0;
-
 		/*
 		 * Tell the timer code that the base is not idle, i.e. undo
 		 * the effect of get_next_timer_interrupt():
@@ -724,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		 * We've not stopped the tick yet, and there's a timer in the
 		 * next period, so no point in stopping it either, bail.
 		 */
-		if (!ts->tick_stopped)
-			goto out;
-
-		/*
-		 * If, OTOH, we did stop it, but there's a pending (expired)
-		 * timer reprogram the timer hardware to fire now.
-		 *
-		 * We will not restart the tick proper, just prod the timer
-		 * hardware into firing an interrupt to process the pending
-		 * timers. Just like tick_irq_exit() will not restart the tick
-		 * for 'normal' interrupts.
-		 *
-		 * Only once we exit the idle loop will we re-enable the tick,
-		 * see tick_nohz_idle_exit().
-		 */
-		if (delta == 0) {
-			tick_nohz_restart(ts, now);
+		if (!ts->tick_stopped) {
+			tick = 0;
 			goto out;
 		}
 	}
-- 
cgit v1.3-14-g43fede


From 680895d6efe47332d25e49817d2d6781295c1614 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 17 May 2017 09:51:09 +0200
Subject: sysctl: switch to use uuid_t

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 kernel/sysctl_binary.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index ece4b177052b..939a158eab11 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file,
 	/* Only supports reads */
 	if (oldval && oldlen) {
 		char buf[UUID_STRING_LEN + 1];
-		uuid_be uuid;
+		uuid_t uuid;
 
 		result = kernel_read(file, 0, buf, sizeof(buf) - 1);
 		if (result < 0)
@@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file,
 		buf[result] = '\0';
 
 		result = -EIO;
-		if (uuid_be_to_bin(buf, &uuid))
+		if (uuid_parse(buf, &uuid))
 			goto out;
 
 		if (oldlen > 16)
-- 
cgit v1.3-14-g43fede


From dc4bb0e2356149aee4cdae061936f3bbdd45595c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:46 -0700
Subject: bpf: Introduce bpf_prog ID

This patch generates a unique ID for each BPF_PROG_LOAD-ed prog.
It is worth noting that each BPF_PROG_LOAD-ed prog will have
a different ID even if they have the same bpf instructions.

The ID is generated by the existing idr_alloc_cyclic().
The ID is in the range [1, INT_MAX).  It is allocated in a cyclic manner,
so an ID will get reused every 2 billion BPF_PROG_LOAD.

The bpf_prog_alloc_id() is done after bpf_prog_select_runtime()
because the jit process may have allocated a new prog.  Hence,
we need to ensure the value of pointer 'prog' will not be changed
any more before storing the prog to the prog_idr.

After bpf_prog_select_runtime(), the prog is read-only.  Hence,
the id is stored in 'struct bpf_prog_aux'.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  |  1 +
 kernel/bpf/syscall.c | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fcc80ca11045..c5946d19f2ca 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -172,6 +172,7 @@ struct bpf_prog_aux {
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
 	u32 stack_depth;
+	u32 id;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_verifier_ops *ops;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 59da103adb85..2a1b32b470f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,11 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 #include <linux/kernel.h>
+#include <linux/idr.h>
 
 DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -650,6 +653,34 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
 	free_uid(user);
 }
 
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+	int id;
+
+	spin_lock_bh(&prog_idr_lock);
+	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		prog->aux->id = id;
+	spin_unlock_bh(&prog_idr_lock);
+
+	/* id is in [1, INT_MAX) */
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog)
+{
+	/* cBPF to eBPF migrations are currently not in the idr store. */
+	if (!prog->aux->id)
+		return;
+
+	spin_lock_bh(&prog_idr_lock);
+	idr_remove(&prog_idr, prog->aux->id);
+	spin_unlock_bh(&prog_idr_lock);
+}
+
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -663,6 +694,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		bpf_prog_free_id(prog);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -857,15 +889,21 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_used_maps;
 
+	err = bpf_prog_alloc_id(prog);
+	if (err)
+		goto free_used_maps;
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0)
 		/* failed to allocate fd */
-		goto free_used_maps;
+		goto free_id;
 
 	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
+free_id:
+	bpf_prog_free_id(prog);
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
-- 
cgit v1.3-14-g43fede


From f3f1c054c288bb6e503005e6d73611151ed20e91 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:47 -0700
Subject: bpf: Introduce bpf_map ID

This patch generates a unique ID for each created bpf_map.
The approach is similar to the earlier patch for bpf_prog ID.

It is worth noting that the bpf_map's ID and bpf_prog's ID
are in two independent ID spaces and both have the same valid range:
[1, INT_MAX).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  |  1 +
 kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c5946d19f2ca..c32bace66d3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -46,6 +46,7 @@ struct bpf_map {
 	u32 max_entries;
 	u32 map_flags;
 	u32 pages;
+	u32 id;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a1b32b470f1..4c3075b5d840 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,8 @@
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -117,6 +119,29 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
 	free_uid(user);
 }
 
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+	int id;
+
+	spin_lock_bh(&map_idr_lock);
+	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		map->id = id;
+	spin_unlock_bh(&map_idr_lock);
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map)
+{
+	spin_lock_bh(&map_idr_lock);
+	idr_remove(&map_idr, map->id);
+	spin_unlock_bh(&map_idr_lock);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -141,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
 void bpf_map_put(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
+		bpf_map_free_id(map);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -239,14 +265,20 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_nouncharge;
 
+	err = bpf_map_alloc_id(map);
+	if (err)
+		goto free_map;
+
 	err = bpf_map_new_fd(map);
 	if (err < 0)
 		/* failed to allocate fd */
-		goto free_map;
+		goto free_id;
 
 	trace_bpf_map_create(map, err);
 	return err;
 
+free_id:
+	bpf_map_free_id(map);
 free_map:
 	bpf_map_uncharge_memlock(map);
 free_map_nouncharge:
-- 
cgit v1.3-14-g43fede


From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:48 -0700
Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command

This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID
to allow userspace to iterate all bpf_prog IDs and bpf_map IDs.

The API is trying to be consistent with the existing
BPF_MAP_GET_NEXT_KEY.

It is currently limited to CAP_SYS_ADMIN, a restriction which we can
consider lifting in follow-up patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  7 +++++++
 kernel/bpf/syscall.c     | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e78aece03628..629747a3f273 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,6 +82,8 @@ enum bpf_cmd {
 	BPF_PROG_ATTACH,
 	BPF_PROG_DETACH,
 	BPF_PROG_TEST_RUN,
+	BPF_PROG_GET_NEXT_ID,
+	BPF_MAP_GET_NEXT_ID,
 };
 
 enum bpf_map_type {
@@ -209,6 +211,11 @@ union bpf_attr {
 		__u32		repeat;
 		__u32		duration;
 	} test;
+
+	struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */
+		__u32		start_id;
+		__u32		next_id;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4c3075b5d840..2405feedb8c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
 void bpf_map_put(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
+		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
@@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	return ret;
 }
 
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+			       union bpf_attr __user *uattr,
+			       struct idr *idr,
+			       spinlock_t *lock)
+{
+	u32 next_id = attr->start_id;
+	int err = 0;
+
+	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	next_id++;
+	spin_lock_bh(lock);
+	if (!idr_get_next(idr, &next_id))
+		err = -ENOENT;
+	spin_unlock_bh(lock);
+
+	if (!err)
+		err = put_user(next_id, &uattr->next_id);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
 		break;
+	case BPF_PROG_GET_NEXT_ID:
+		err = bpf_obj_get_next_id(&attr, uattr,
+					  &prog_idr, &prog_idr_lock);
+		break;
+	case BPF_MAP_GET_NEXT_ID:
+		err = bpf_obj_get_next_id(&attr, uattr,
+					  &map_idr, &map_idr_lock);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:49 -0700
Subject: bpf: Add BPF_PROG_GET_FD_BY_ID

Add BPF_PROG_GET_FD_BY_ID command to allow the user to get an fd
from a bpf_prog's ID.

bpf_prog_inc_not_zero() is added and is called with prog_idr_lock
held.

__bpf_prog_put() is also added which has the 'bool do_idr_lock'
param to decide if the prog_idr_lock should be acquired when
freeing the prog->id.

In the error path of bpf_prog_inc_not_zero(), it may have to
call __bpf_prog_put(prog, false) which does not need
to take the prog_idr_lock when freeing the prog->id.

It is currently limited to CAP_SYS_ADMIN, a restriction which we
can consider lifting in follow-up patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  8 +++--
 kernel/bpf/syscall.c     | 91 ++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 87 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 629747a3f273..d70cfed19d5e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -84,6 +84,7 @@ enum bpf_cmd {
 	BPF_PROG_TEST_RUN,
 	BPF_PROG_GET_NEXT_ID,
 	BPF_MAP_GET_NEXT_ID,
+	BPF_PROG_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -212,8 +213,11 @@ union bpf_attr {
 		__u32		duration;
 	} test;
 
-	struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */
-		__u32		start_id;
+	struct { /* anonymous struct used by BPF_*_GET_*_ID */
+		union {
+			__u32		start_id;
+			__u32		prog_id;
+		};
 		__u32		next_id;
 	};
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2405feedb8c1..dc6253bb8ebb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_prog_free_id(struct bpf_prog *prog)
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 {
 	/* cBPF to eBPF migrations are currently not in the idr store. */
 	if (!prog->aux->id)
 		return;
 
-	spin_lock_bh(&prog_idr_lock);
+	if (do_idr_lock)
+		spin_lock_bh(&prog_idr_lock);
+	else
+		__acquire(&prog_idr_lock);
+
 	idr_remove(&prog_idr, prog->aux->id);
-	spin_unlock_bh(&prog_idr_lock);
+
+	if (do_idr_lock)
+		spin_unlock_bh(&prog_idr_lock);
+	else
+		__release(&prog_idr_lock);
 }
 
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	bpf_prog_free(aux->prog);
 }
 
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
 		/* bpf_prog_free_id() must be called first */
-		bpf_prog_free_id(prog);
+		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+	__bpf_prog_put(prog, true);
+}
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
 static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc);
 
+/* prog_idr_lock should have been held */
+static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+	int refold;
+
+	refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+	if (refold >= BPF_MAX_REFCNT) {
+		__bpf_prog_put(prog, false);
+		return ERR_PTR(-EBUSY);
+	}
+
+	if (!refold)
+		return ERR_PTR(-ENOENT);
+
+	return prog;
+}
+
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
 	struct fd f = fdget(ufd);
@@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_used_maps;
 
 	err = bpf_prog_new_fd(prog);
-	if (err < 0)
-		/* failed to allocate fd */
-		goto free_id;
+	if (err < 0) {
+		/* failed to allocate fd.
+		 * bpf_prog_put() is needed because the above
+		 * bpf_prog_alloc_id() has published the prog
+		 * to the userspace and the userspace may
+		 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+		 */
+		bpf_prog_put(prog);
+		return err;
+	}
 
 	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
-free_id:
-	bpf_prog_free_id(prog);
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
@@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	u32 id = attr->prog_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	spin_lock_bh(&prog_idr_lock);
+	prog = idr_find(&prog_idr, id);
+	if (prog)
+		prog = bpf_prog_inc_not_zero(prog);
+	else
+		prog = ERR_PTR(-ENOENT);
+	spin_unlock_bh(&prog_idr_lock);
+
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	fd = bpf_prog_new_fd(prog);
+	if (fd < 0)
+		bpf_prog_put(prog);
+
+	return fd;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_obj_get_next_id(&attr, uattr,
 					  &map_idr, &map_idr_lock);
 		break;
+	case BPF_PROG_GET_FD_BY_ID:
+		err = bpf_prog_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:50 -0700
Subject: bpf: Add BPF_MAP_GET_FD_BY_ID

Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd
from a bpf_map's ID.

bpf_map_inc_not_zero() is added and is called with map_idr_lock
held.

__bpf_map_put() is also added which has the 'bool do_idr_lock'
param to decide if the map_idr_lock should be acquired when
freeing the map->id.

In the error path of bpf_map_inc_not_zero(), it may have to
call __bpf_map_put(map, false) which does not need
to take the map_idr_lock when freeing the map->id.

It is currently limited to CAP_SYS_ADMIN, a restriction which we
can consider lifting in follow-up patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  2 +
 kernel/bpf/syscall.c     | 95 +++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 87 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d70cfed19d5e..dd23f47ff00c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -85,6 +85,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_NEXT_ID,
 	BPF_MAP_GET_NEXT_ID,
 	BPF_PROG_GET_FD_BY_ID,
+	BPF_MAP_GET_FD_BY_ID,
 };
 
 enum bpf_map_type {
@@ -217,6 +218,7 @@ union bpf_attr {
 		union {
 			__u32		start_id;
 			__u32		prog_id;
+			__u32		map_id;
 		};
 		__u32		next_id;
 	};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dc6253bb8ebb..1802bb9c47d9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map)
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
-	spin_lock_bh(&map_idr_lock);
+	if (do_idr_lock)
+		spin_lock_bh(&map_idr_lock);
+	else
+		__acquire(&map_idr_lock);
+
 	idr_remove(&map_idr, map->id);
-	spin_unlock_bh(&map_idr_lock);
+
+	if (do_idr_lock)
+		spin_unlock_bh(&map_idr_lock);
+	else
+		__release(&map_idr_lock);
 }
 
 /* called from workqueue */
@@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
 /* decrement map refcnt and schedule it for freeing via workqueue
  * (unrelying map implementation ops->map_free() might sleep)
  */
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
-		bpf_map_free_id(map);
+		bpf_map_free_id(map, do_idr_lock);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
 }
 
+void bpf_map_put(struct bpf_map *map)
+{
+	__bpf_map_put(map, true);
+}
+
 void bpf_map_put_with_uref(struct bpf_map *map)
 {
 	bpf_map_put_uref(map);
@@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr)
 		goto free_map;
 
 	err = bpf_map_new_fd(map);
-	if (err < 0)
-		/* failed to allocate fd */
-		goto free_id;
+	if (err < 0) {
+		/* failed to allocate fd.
+		 * bpf_map_put() is needed because the above
+		 * bpf_map_alloc_id() has published the map
+		 * to the userspace and the userspace may
+		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+		 */
+		bpf_map_put(map);
+		return err;
+	}
 
 	trace_bpf_map_create(map, err);
 	return err;
 
-free_id:
-	bpf_map_free_id(map);
 free_map:
 	bpf_map_uncharge_memlock(map);
 free_map_nouncharge:
@@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	return map;
 }
 
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+					    bool uref)
+{
+	int refold;
+
+	refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+	if (refold >= BPF_MAX_REFCNT) {
+		__bpf_map_put(map, false);
+		return ERR_PTR(-EBUSY);
+	}
+
+	if (!refold)
+		return ERR_PTR(-ENOENT);
+
+	if (uref)
+		atomic_inc(&map->usercnt);
+
+	return map;
+}
+
 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
 	return -ENOTSUPP;
@@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_map *map;
+	u32 id = attr->map_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	spin_lock_bh(&map_idr_lock);
+	map = idr_find(&map_idr, id);
+	if (map)
+		map = bpf_map_inc_not_zero(map, true);
+	else
+		map = ERR_PTR(-ENOENT);
+	spin_unlock_bh(&map_idr_lock);
+
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	fd = bpf_map_new_fd(map);
+	if (fd < 0)
+		bpf_map_put(map);
+
+	return fd;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_GET_FD_BY_ID:
 		err = bpf_prog_get_fd_by_id(&attr);
 		break;
+	case BPF_MAP_GET_FD_BY_ID:
+		err = bpf_map_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:52 -0700
Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD

A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info
for both bpf_prog and bpf_map.  The kernel can figure out whether
the fd is associated with a bpf_prog or a bpf_map.

The suggested struct bpf_prog_info and struct bpf_map_info are
not meant to be complete lists; completeness is not the goal of this
patch.  New fields can be added in future patches.

The focus of this patch is to create the interface,
BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and
bpf_map's info.

The obj's info, which will be extended (and get bigger) over time, is
separated from the bpf_attr to avoid bloating the bpf_attr.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h   |   2 -
 include/uapi/linux/bpf.h |  28 ++++++++
 kernel/bpf/syscall.c     | 163 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 174 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1e2dddf21f3b..1fa26dc562ce 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -69,8 +69,6 @@ struct bpf_prog_aux;
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
-#define BPF_TAG_SIZE	8
-
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index dd23f47ff00c..9b2c10b45733 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -86,6 +86,7 @@ enum bpf_cmd {
 	BPF_MAP_GET_NEXT_ID,
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
+	BPF_OBJ_GET_INFO_BY_FD,
 };
 
 enum bpf_map_type {
@@ -222,6 +223,12 @@ union bpf_attr {
 		};
 		__u32		next_id;
 	};
+
+	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
+		__u32		bpf_fd;
+		__u32		info_len;
+		__aligned_u64	info;
+	} info;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
@@ -686,4 +693,25 @@ struct xdp_md {
 	__u32 data_end;
 };
 
+#define BPF_TAG_SIZE	8
+
+struct bpf_prog_info {
+	__u32 type;
+	__u32 id;
+	__u8  tag[BPF_TAG_SIZE];
+	__u32 jited_prog_len;
+	__u32 xlated_prog_len;
+	__aligned_u64 jited_prog_insns;
+	__aligned_u64 xlated_prog_insns;
+} __attribute__((aligned(8)));
+
+struct bpf_map_info {
+	__u32 type;
+	__u32 id;
+	__u32 key_size;
+	__u32 value_size;
+	__u32 max_entries;
+	__u32 map_flags;
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1802bb9c47d9..8942c820d620 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+static int check_uarg_tail_zero(void __user *uaddr,
+				size_t expected_size,
+				size_t actual_size)
+{
+	unsigned char __user *addr;
+	unsigned char __user *end;
+	unsigned char val;
+	int err;
+
+	if (actual_size <= expected_size)
+		return 0;
+
+	addr = uaddr + expected_size;
+	end  = uaddr + actual_size;
+
+	for (; addr < end; addr++) {
+		err = get_user(val, addr);
+		if (err)
+			return err;
+		if (val)
+			return -E2BIG;
+	}
+
+	return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+				   const union bpf_attr *attr,
+				   union bpf_attr __user *uattr)
+{
+	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	struct bpf_prog_info info = {};
+	u32 info_len = attr->info.info_len;
+	char __user *uinsns;
+	u32 ulen;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	if (err)
+		return err;
+	info_len = min_t(u32, sizeof(info), info_len);
+
+	if (copy_from_user(&info, uinfo, info_len))
+		return err;
+
+	info.type = prog->type;
+	info.id = prog->aux->id;
+
+	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		info.jited_prog_len = 0;
+		info.xlated_prog_len = 0;
+		goto done;
+	}
+
+	ulen = info.jited_prog_len;
+	info.jited_prog_len = prog->jited_len;
+	if (info.jited_prog_len && ulen) {
+		uinsns = u64_to_user_ptr(info.jited_prog_insns);
+		ulen = min_t(u32, info.jited_prog_len, ulen);
+		if (copy_to_user(uinsns, prog->bpf_func, ulen))
+			return -EFAULT;
+	}
+
+	ulen = info.xlated_prog_len;
+	info.xlated_prog_len = bpf_prog_size(prog->len);
+	if (info.xlated_prog_len && ulen) {
+		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+		ulen = min_t(u32, info.xlated_prog_len, ulen);
+		if (copy_to_user(uinsns, prog->insnsi, ulen))
+			return -EFAULT;
+	}
+
+done:
+	if (copy_to_user(uinfo, &info, info_len) ||
+	    put_user(info_len, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+				  const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	struct bpf_map_info info = {};
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	if (err)
+		return err;
+	info_len = min_t(u32, sizeof(info), info_len);
+
+	info.type = map->map_type;
+	info.id = map->id;
+	info.key_size = map->key_size;
+	info.value_size = map->value_size;
+	info.max_entries = map->max_entries;
+	info.map_flags = map->map_flags;
+
+	if (copy_to_user(uinfo, &info, info_len) ||
+	    put_user(info_len, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	int ufd = attr->info.bpf_fd;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	if (!f.file)
+		return -EBADFD;
+
+	if (f.file->f_op == &bpf_prog_fops)
+		err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+					      uattr);
+	else if (f.file->f_op == &bpf_map_fops)
+		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+					     uattr);
+	else
+		err = -EINVAL;
+
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	 * user-space does not rely on any kernel feature
 	 * extensions we dont know about yet.
 	 */
-	if (size > sizeof(attr)) {
-		unsigned char __user *addr;
-		unsigned char __user *end;
-		unsigned char val;
-
-		addr = (void __user *)uattr + sizeof(attr);
-		end  = (void __user *)uattr + size;
-
-		for (; addr < end; addr++) {
-			err = get_user(val, addr);
-			if (err)
-				return err;
-			if (val)
-				return -E2BIG;
-		}
-		size = sizeof(attr);
-	}
+	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+	if (err)
+		return err;
+	size = min_t(u32, size, sizeof(attr));
 
 	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
 	if (copy_from_user(&attr, uattr, size) != 0)
@@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_GET_FD_BY_ID:
 		err = bpf_map_get_fd_by_id(&attr);
 		break;
+	case BPF_OBJ_GET_INFO_BY_FD:
+		err = bpf_obj_get_info_by_fd(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From 92046578ac88e0a93f8ef03240e6c832b0189aa7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 6 Jun 2017 18:38:04 +0200
Subject: bpf: cgroup skb progs cannot access ld_abs/ind

Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to
access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB
type to use ld_abs/ind instructions. However, at this point,
we cannot use them, since offsets relative to SKF_LL_OFF will
end up pointing skb_mac_header(skb) out of bounds since in the
egress path it is not yet set at that point in time, but only
after __dev_queue_xmit() did a general reset on the mac header.
bpf_internal_load_pointer_neg_helper() will then end up reading
data from a wrong offset.

BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes()
already to access packet data, which is also more flexible than
the insns carried over from cBPF.

Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Chenbo Feng <fengc@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8acae64df255..14ccb0759fa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2426,7 +2426,6 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
-	case BPF_PROG_TYPE_CGROUP_SKB:
 		return true;
 	default:
 		return false;
-- 
cgit v1.3-14-g43fede


From c3ca46ef719580eb01994fc6032db470fde92d85 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 23 May 2017 15:05:50 +0900
Subject: ftrace/kprobes: selftests: Check kretprobe maxactive is supported

Check that kretprobe maxactive is supported by the kprobe_events
interface. So that the test can detect the kernel feature, this
changes the ftrace README to describe it.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
---
 kernel/trace/trace.c                                                | 3 ++-
 tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1122f151466f..dc3f91e70345 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4473,7 +4473,8 @@ static const char readme_msg[] =
 #endif
 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
 	"\t  accepts: event-definitions (one definition per line)\n"
-	"\t   Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+	"\t   Format: p[:[<group>/]<event>] <place> [<args>]\n"
+	"\t           r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
 	"\t           -:[<group>/]<event>\n"
 #ifdef CONFIG_KPROBE_EVENTS
 	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
index 57abdf1caabf..7ec6f2639ad6 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
@@ -2,6 +2,7 @@
 # description: Kretprobe dynamic event with maxactive
 
 [ -f kprobe_events ] || exit_unsupported # this is configurable
+grep -q 'r\[maxactive\]' README || exit_unsupported # this is older version
 
 echo > kprobe_events
 
-- 
cgit v1.3-14-g43fede


From ba5213ae6b88fb170c4771fef6553f759c7d8cdd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 30 May 2017 11:45:12 +0200
Subject: perf/core: Correct event creation with PERF_FORMAT_GROUP

Andi was asking about PERF_FORMAT_GROUP vs inherited events, which led
to the discovery of a bug from commit:

  3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff")

 -       PERF_SAMPLE_GROUP                       = 1U << 4,
 +       PERF_SAMPLE_READ                        = 1U << 4,

 -       if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 +       if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))

is a clear fail :/

While this changes user-visible behaviour (it was previously possible
to create an inherited event with PERF_SAMPLE_READ), this is deemed
acceptable because its results were always incorrect.

Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vince@deater.net>
Fixes:  3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff")
Link: http://lkml.kernel.org/r/20170530094512.dy2nljns2uq7qa3j@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3de0b98c4414..407dad6cf89a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5729,9 +5729,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	__output_copy(handle, values, n * sizeof(u64));
 }
 
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
 static void perf_output_read_group(struct perf_output_handle *handle,
 			    struct perf_event *event,
 			    u64 enabled, u64 running)
@@ -5776,6 +5773,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
 				 PERF_FORMAT_TOTAL_TIME_RUNNING)
 
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that its both hard and excessively expensive to iterate the
+ * child list, not to mention that its impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ */
 static void perf_output_read(struct perf_output_handle *handle,
 			     struct perf_event *event)
 {
@@ -9462,9 +9466,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	local64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_FORMAT_GROUP on inherited events
+	 * We currently do not support PERF_SAMPLE_READ on inherited events.
+	 * See perf_output_read().
 	 */
-	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
 		goto err_ns;
 
 	if (!has_branch_stack(event))
-- 
cgit v1.3-14-g43fede


From d0fabd1cb8b70073a0f44f1cf8b663b5e7241c74 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Tue, 23 May 2017 14:51:32 -0700
Subject: perf/core: Remove unused perf_cgroup_event_cgrp_time() function

The function was added by commit e5d1367f17ba ("perf: Add cgroup
support") in 2011 and hasn't been used since then. Removing it fixes the
following warning when building with Clang:

    kernel/events/core.c:696:19: error: unused function 'perf_cgroup_event_cgrp_time' [-Werror,-Wunused-function]

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170523215132.189049-1-mka@chromium.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 407dad6cf89a..bc63f8db1b0d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -925,11 +925,6 @@ static inline int is_cgroup_event(struct perf_event *event)
 	return 0;
 }
 
-static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
-{
-	return 0;
-}
-
 static inline void update_cgrp_time_from_event(struct perf_event *event)
 {
 }
-- 
cgit v1.3-14-g43fede


From 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 17 May 2017 12:53:50 +0200
Subject: sched/core: Implement new approach to scale select_idle_cpu()

Hackbench recently suffered a bunch of pain, first by commit:

  4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive")

and then by commit:

  c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()")

which fixed a bug in the initial for_each_cpu_wrap() implementation
that made select_idle_cpu() even more expensive. The bug was that it
would skip over CPUs when bits were consecutive in the bitmask.

This however gave me an idea to fix select_idle_cpu(); where the old
scheme was a cliff-edge throttle on idle scanning, this introduces a
more gradual approach. Instead of stopping to scan entirely, we limit
how many CPUs we scan.

Initial benchmarks show that it mostly recovers hackbench while not
hurting anything else, except Mason's schbench, but not as bad as the
old thing.

It also appears to recover the tbench high-end, which also suffered like
hackbench.

Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: hpa@zytor.com
Cc: kitsunyan <kitsunyan@inbox.ru>
Cc: linux-kernel@vger.kernel.org
Cc: lvenanci@redhat.com
Cc: riel@redhat.com
Cc: xiaolong.ye@intel.com
Link: http://lkml.kernel.org/r/20170517105350.hk5m4h4jb6dfr65a@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c     | 21 ++++++++++++++++-----
 kernel/sched/features.h |  1 +
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 47a0c552c77b..396bca9c7996 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5794,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	struct sched_domain *this_sd;
-	u64 avg_cost, avg_idle = this_rq()->avg_idle;
+	u64 avg_cost, avg_idle;
 	u64 time, cost;
 	s64 delta;
-	int cpu;
+	int cpu, nr = INT_MAX;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
 		return -1;
 
-	avg_cost = this_sd->avg_scan_cost;
-
 	/*
 	 * Due to large variance we need a large fuzz factor; hackbench in
 	 * particularly is sensitive here.
 	 */
-	if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
+	avg_idle = this_rq()->avg_idle / 512;
+	avg_cost = this_sd->avg_scan_cost + 1;
+
+	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
 		return -1;
 
+	if (sched_feat(SIS_PROP)) {
+		u64 span_avg = sd->span_weight * avg_idle;
+		if (span_avg > 4*avg_cost)
+			nr = div_u64(span_avg, avg_cost);
+		else
+			nr = 4;
+	}
+
 	time = local_clock();
 
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+		if (!--nr)
+			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
 		if (idle_cpu(cpu))
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index dc4d1483b038..d3fb15555291 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
+SCHED_FEAT(SIS_PROP, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
-- 
cgit v1.3-14-g43fede


From e36d8677bfa55054e4194ec3683189b882a538f6 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@unitn.it>
Date: Thu, 18 May 2017 22:13:28 +0200
Subject: sched/deadline: Track the active utilization

Active utilization is defined as the total utilization of active
(TASK_RUNNING) tasks queued on a runqueue. Hence, it is increased
when a task wakes up and is decreased when a task blocks.

When a task is migrated from CPUi to CPUj, immediately subtract the
task's utilization from CPUi and add it to CPUj. This mechanism is
implemented by modifying the pull and push functions.
Note: this is not fully correct from the theoretical point of view
(the utilization should be removed from CPUi only at the 0 lag
time); a more theoretically sound solution is presented in the
next patches.

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@unitn.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@arm.com>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-2-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |  6 +++++
 2 files changed, 67 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index df6c2912bd60..b36ecc2b1b10 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
 }
 
+static inline
+void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+	u64 old = dl_rq->running_bw;
+
+	lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+	dl_rq->running_bw += dl_bw;
+	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+}
+
+static inline
+void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+	u64 old = dl_rq->running_bw;
+
+	lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+	dl_rq->running_bw -= dl_bw;
+	SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+	if (dl_rq->running_bw > old)
+		dl_rq->running_bw = 0;
+}
+
 static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
@@ -83,6 +105,8 @@ void init_dl_rq(struct dl_rq *dl_rq)
 #else
 	init_dl_bw(&dl_rq->dl_bw);
 #endif
+
+	dl_rq->running_bw = 0;
 }
 
 #ifdef CONFIG_SMP
@@ -946,10 +970,14 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 	 * parameters of the task might need updating. Otherwise,
 	 * we want a replenishment of its runtime.
 	 */
-	if (flags & ENQUEUE_WAKEUP)
+	if (flags & ENQUEUE_WAKEUP) {
+		struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+		add_running_bw(dl_se->dl_bw, dl_rq);
 		update_dl_entity(dl_se, pi_se);
-	else if (flags & ENQUEUE_REPLENISH)
+	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se, pi_se);
+	}
 
 	__enqueue_dl_entity(dl_se);
 }
@@ -998,14 +1026,25 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
 		dl_check_constrained_dl(&p->dl);
 
+	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE)
+		add_running_bw(p->dl.dl_bw, &rq->dl);
+
 	/*
-	 * If p is throttled, we do nothing. In fact, if it exhausted
+	 * If p is throttled, we do not enqueue it. In fact, if it exhausted
 	 * its budget it needs a replenishment and, since it now is on
 	 * its rq, the bandwidth timer callback (which clearly has not
 	 * run yet) will take care of this.
+	 * However, the active utilization does not depend on the fact
+	 * that the task is on the runqueue or not (but depends on the
+	 * task's state - in GRUB parlance, "inactive" vs "active contending").
+	 * In other words, even if a task is throttled its utilization must
+	 * be counted in the active utilization; hence, we need to call
+	 * add_running_bw().
 	 */
-	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
+	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
+		add_running_bw(p->dl.dl_bw, &rq->dl);
 		return;
+	}
 
 	enqueue_dl_entity(&p->dl, pi_se, flags);
 
@@ -1023,6 +1062,20 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_curr_dl(rq);
 	__dequeue_task_dl(rq, p, flags);
+
+	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE)
+		sub_running_bw(p->dl.dl_bw, &rq->dl);
+
+	/*
+	 * This check allows to decrease the active utilization in two cases:
+	 * when the task blocks and when it is terminating
+	 * (p->state == TASK_DEAD). We can handle the two cases in the same
+	 * way, because from GRUB's point of view the same thing is happening
+	 * (the task moves from "active contending" to "active non contending"
+	 * or "inactive")
+	 */
+	if (flags & DEQUEUE_SLEEP)
+		sub_running_bw(p->dl.dl_bw, &rq->dl);
 }
 
 /*
@@ -1551,7 +1604,9 @@ retry:
 	}
 
 	deactivate_task(rq, next_task, 0);
+	sub_running_bw(next_task->dl.dl_bw, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
+	add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
@@ -1639,7 +1694,9 @@ static void pull_dl_task(struct rq *this_rq)
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
+			sub_running_bw(p->dl.dl_bw, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
+			add_running_bw(p->dl.dl_bw, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f8cf1d87f065..ee26867da339 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,6 +558,12 @@ struct dl_rq {
 #else
 	struct dl_bw dl_bw;
 #endif
+	/*
+	 * "Active utilization" for this runqueue: increased when a
+	 * task wakes up (becomes TASK_RUNNING) and decreased when a
+	 * task blocks
+	 */
+	u64 running_bw;
 };
 
 #ifdef CONFIG_SMP
-- 
cgit v1.3-14-g43fede


From 209a0cbda7a01d2ea32a8b631d35e873bee498e9 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:29 +0200
Subject: sched/deadline: Improve the tracking of active utilization

This patch implements a more theoretically sound algorithm for
tracking active utilization: instead of decreasing it when a
task blocks, use a timer (the "inactive timer", named after the
"Inactive" task state of the GRUB algorithm) to decrease the
active utilization at the so called "0-lag time".

Tested-by: Claudio Scordino <claudio@evidence.eu.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-3-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  17 +++
 kernel/sched/core.c     |   3 +
 kernel/sched/deadline.c | 269 +++++++++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |   2 +
 4 files changed, 276 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1abaa3728bf7..f1ead2e88d3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,16 +445,33 @@ struct sched_dl_entity {
 	 *
 	 * @dl_yielded tells if task gave up the CPU before consuming
 	 * all its available runtime during the last job.
+	 *
+	 * @dl_non_contending tells if the task is inactive while still
+	 * contributing to the active utilization. In other words, it
+	 * indicates if the inactive timer has been armed and its handler
+	 * has not been executed yet. This flag is useful to avoid race
+	 * conditions between the inactive timer handler and the wakeup
+	 * code.
 	 */
 	int				dl_throttled;
 	int				dl_boosted;
 	int				dl_yielded;
+	int				dl_non_contending;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
 	 * own bandwidth to be enforced, thus we need one timer per task.
 	 */
 	struct hrtimer			dl_timer;
+
+	/*
+	 * Inactive timer, responsible for decreasing the active utilization
+	 * at the "0-lag time". When a -deadline task blocks, it contributes
+	 * to GRUB's active utilization until the "0-lag time", hence a
+	 * timer is needed to decrease the active utilization at the correct
+	 * time.
+	 */
+	struct hrtimer inactive_timer;
 };
 
 union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c3e50cada84d..968c655ec5d9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2153,6 +2153,7 @@ void __dl_clear_params(struct task_struct *p)
 
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
+	dl_se->dl_non_contending = 0;
 }
 
 /*
@@ -2184,6 +2185,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 
 	RB_CLEAR_NODE(&p->dl.rb_node);
 	init_dl_task_timer(&p->dl);
+	init_dl_inactive_task_timer(&p->dl);
 	__dl_clear_params(p);
 
 	INIT_LIST_HEAD(&p->rt.run_list);
@@ -2506,6 +2508,7 @@ static int dl_overflow(struct task_struct *p, int policy,
 		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
 		__dl_clear(dl_b, p->dl.dl_bw);
 		__dl_add(dl_b, new_bw);
+		dl_change_utilization(p, new_bw);
 		err = 0;
 	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
 		__dl_clear(dl_b, p->dl.dl_bw);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b36ecc2b1b10..6480a929417c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -65,6 +65,161 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 		dl_rq->running_bw = 0;
 }
 
+void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+	if (task_on_rq_queued(p))
+		return;
+
+	if (!p->dl.dl_non_contending)
+		return;
+
+	sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl);
+	p->dl.dl_non_contending = 0;
+	/*
+	 * If the timer handler is currently running and the
+	 * timer cannot be cancelled, inactive_task_timer()
+	 * will see that dl_non_contending is not set, and
+	 * will not touch the rq's active utilization,
+	 * so we are still safe.
+	 */
+	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+		put_task_struct(p);
+}
+
+/*
+ * The utilization of a task cannot be immediately removed from
+ * the rq active utilization (running_bw) when the task blocks.
+ * Instead, we have to wait for the so called "0-lag time".
+ *
+ * If a task blocks before the "0-lag time", a timer (the inactive
+ * timer) is armed, and running_bw is decreased when the timer
+ * fires.
+ *
+ * If the task wakes up again before the inactive timer fires,
+ * the timer is cancelled, whereas if the task wakes up after the
+ * inactive timer fired (and running_bw has been decreased) the
+ * task's utilization has to be added to running_bw again.
+ * A flag in the deadline scheduling entity (dl_non_contending)
+ * is used to avoid race conditions between the inactive timer handler
+ * and task wakeups.
+ *
+ * The following diagram shows how running_bw is updated. A task is
+ * "ACTIVE" when its utilization contributes to running_bw; an
+ * "ACTIVE contending" task is in the TASK_RUNNING state, while an
+ * "ACTIVE non contending" task is a blocked task for which the "0-lag time"
+ * has not passed yet. An "INACTIVE" task is a task for which the "0-lag"
+ * time already passed, which does not contribute to running_bw anymore.
+ *                              +------------------+
+ *             wakeup           |    ACTIVE        |
+ *          +------------------>+   contending     |
+ *          | add_running_bw    |                  |
+ *          |                   +----+------+------+
+ *          |                        |      ^
+ *          |                dequeue |      |
+ * +--------+-------+                |      |
+ * |                |   t >= 0-lag   |      | wakeup
+ * |    INACTIVE    |<---------------+      |
+ * |                | sub_running_bw |      |
+ * +--------+-------+                |      |
+ *          ^                        |      |
+ *          |              t < 0-lag |      |
+ *          |                        |      |
+ *          |                        V      |
+ *          |                   +----+------+------+
+ *          | sub_running_bw    |    ACTIVE        |
+ *          +-------------------+                  |
+ *            inactive timer    |  non contending  |
+ *            fired             +------------------+
+ *
+ * The task_non_contending() function is invoked when a task
+ * blocks, and checks if the 0-lag time already passed or
+ * not (in the first case, it directly updates running_bw;
+ * in the second case, it arms the inactive timer).
+ *
+ * The task_contending() function is invoked when a task wakes
+ * up, and checks if the task is still in the "ACTIVE non contending"
+ * state or not (in the second case, it updates running_bw).
+ */
+static void task_non_contending(struct task_struct *p)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+	struct hrtimer *timer = &dl_se->inactive_timer;
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
+	s64 zerolag_time;
+
+	/*
+	 * If this is a non-deadline task that has been boosted,
+	 * do nothing
+	 */
+	if (dl_se->dl_runtime == 0)
+		return;
+
+	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
+	WARN_ON(dl_se->dl_non_contending);
+
+	zerolag_time = dl_se->deadline -
+		 div64_long((dl_se->runtime * dl_se->dl_period),
+			dl_se->dl_runtime);
+
+	/*
+	 * Using relative times instead of the absolute "0-lag time"
+	 * simplifies the code
+	 */
+	zerolag_time -= rq_clock(rq);
+
+	/*
+	 * If the "0-lag time" already passed, decrease the active
+	 * utilization now, instead of starting a timer
+	 */
+	if (zerolag_time < 0) {
+		if (dl_task(p))
+			sub_running_bw(dl_se->dl_bw, dl_rq);
+		if (!dl_task(p) || p->state == TASK_DEAD)
+			__dl_clear_params(p);
+
+		return;
+	}
+
+	dl_se->dl_non_contending = 1;
+	get_task_struct(p);
+	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+}
+
+static void task_contending(struct sched_dl_entity *dl_se)
+{
+	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+
+	/*
+	 * If this is a non-deadline task that has been boosted,
+	 * do nothing
+	 */
+	if (dl_se->dl_runtime == 0)
+		return;
+
+	if (dl_se->dl_non_contending) {
+		dl_se->dl_non_contending = 0;
+		/*
+		 * If the timer handler is currently running and the
+		 * timer cannot be cancelled, inactive_task_timer()
+		 * will see that dl_non_contending is not set, and
+		 * will not touch the rq's active utilization,
+		 * so we are still safe.
+		 */
+		if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
+			put_task_struct(dl_task_of(dl_se));
+	} else {
+		/*
+		 * Since "dl_non_contending" is not set, the
+		 * task's utilization has already been removed from
+		 * active utilization (either when the task blocked, or
+		 * when the "inactive timer" fired).
+		 * So, add it back.
+		 */
+		add_running_bw(dl_se->dl_bw, dl_rq);
+	}
+}
+
 static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
@@ -617,10 +772,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 * The task might have changed its scheduling policy to something
 	 * different than SCHED_DEADLINE (through switched_from_dl()).
 	 */
-	if (!dl_task(p)) {
-		__dl_clear_params(p);
+	if (!dl_task(p))
 		goto unlock;
-	}
 
 	/*
 	 * The task might have been boosted by someone else and might be in the
@@ -839,6 +992,49 @@ throttle:
 	}
 }
 
+static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
+{
+	struct sched_dl_entity *dl_se = container_of(timer,
+						     struct sched_dl_entity,
+						     inactive_timer);
+	struct task_struct *p = dl_task_of(dl_se);
+	struct rq_flags rf;
+	struct rq *rq;
+
+	rq = task_rq_lock(p, &rf);
+
+	if (!dl_task(p) || p->state == TASK_DEAD) {
+		if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
+			sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+			dl_se->dl_non_contending = 0;
+		}
+		__dl_clear_params(p);
+
+		goto unlock;
+	}
+	if (dl_se->dl_non_contending == 0)
+		goto unlock;
+
+	sched_clock_tick();
+	update_rq_clock(rq);
+
+	sub_running_bw(dl_se->dl_bw, &rq->dl);
+	dl_se->dl_non_contending = 0;
+unlock:
+	task_rq_unlock(rq, p, &rf);
+	put_task_struct(p);
+
+	return HRTIMER_NORESTART;
+}
+
+void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
+{
+	struct hrtimer *timer = &dl_se->inactive_timer;
+
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	timer->function = inactive_task_timer;
+}
+
 #ifdef CONFIG_SMP
 
 static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -971,9 +1167,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 	 * we want a replenishment of its runtime.
 	 */
 	if (flags & ENQUEUE_WAKEUP) {
-		struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
-
-		add_running_bw(dl_se->dl_bw, dl_rq);
+		task_contending(dl_se);
 		update_dl_entity(dl_se, pi_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se, pi_se);
@@ -1042,7 +1236,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	 * add_running_bw().
 	 */
 	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
-		add_running_bw(p->dl.dl_bw, &rq->dl);
+		if (flags & ENQUEUE_WAKEUP)
+			task_contending(&p->dl);
+
 		return;
 	}
 
@@ -1067,7 +1263,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		sub_running_bw(p->dl.dl_bw, &rq->dl);
 
 	/*
-	 * This check allows to decrease the active utilization in two cases:
+	 * This check allows to start the inactive timer (or to immediately
+	 * decrease the active utilization, if needed) in two cases:
 	 * when the task blocks and when it is terminating
 	 * (p->state == TASK_DEAD). We can handle the two cases in the same
 	 * way, because from GRUB's point of view the same thing is happening
@@ -1075,7 +1272,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	 * or "inactive")
 	 */
 	if (flags & DEQUEUE_SLEEP)
-		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		task_non_contending(p);
 }
 
 /*
@@ -1153,6 +1350,35 @@ out:
 	return cpu;
 }
 
+static void migrate_task_rq_dl(struct task_struct *p)
+{
+	struct rq *rq;
+
+	if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending))
+		return;
+
+	rq = task_rq(p);
+	/*
+	 * Since p->state == TASK_WAKING, set_task_cpu() has been called
+	 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
+	 * rq->lock is not... So, lock it
+	 */
+	raw_spin_lock(&rq->lock);
+	sub_running_bw(p->dl.dl_bw, &rq->dl);
+	p->dl.dl_non_contending = 0;
+	/*
+	 * If the timer handler is currently running and the
+	 * timer cannot be cancelled, inactive_task_timer()
+	 * will see that dl_non_contending is not set, and
+	 * will not touch the rq's active utilization,
+	 * so we are still safe.
+	 */
+	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+		put_task_struct(p);
+
+	raw_spin_unlock(&rq->lock);
+}
+
 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 {
 	/*
@@ -1794,13 +2020,23 @@ void __init init_sched_dl_class(void)
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
 	/*
-	 * Start the deadline timer; if we switch back to dl before this we'll
-	 * continue consuming our current CBS slice. If we stay outside of
-	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
-	 * task.
+	 * task_non_contending() can start the "inactive timer" (if the 0-lag
+	 * time is in the future). If the task switches back to dl before
+	 * the "inactive timer" fires, it can continue to consume its current
+	 * runtime using its current deadline. If it stays outside of
+	 * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer()
+	 * will reset the task parameters.
 	 */
-	if (!start_dl_timer(p))
-		__dl_clear_params(p);
+	if (task_on_rq_queued(p) && p->dl.dl_runtime)
+		task_non_contending(p);
+
+	/*
+	 * We cannot use inactive_task_timer() to invoke sub_running_bw()
+	 * at the 0-lag time, because the task could have been migrated
+	 * while SCHED_OTHER in the meanwhile.
+	 */
+	if (p->dl.dl_non_contending)
+		p->dl.dl_non_contending = 0;
 
 	/*
 	 * Since this might be the only -deadline task on the rq,
@@ -1819,6 +2055,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
+	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+		put_task_struct(p);
 
 	/* If p is not queued we will update its parameters at next wakeup. */
 	if (!task_on_rq_queued(p))
@@ -1893,6 +2131,7 @@ const struct sched_class dl_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_dl,
+	.migrate_task_rq	= migrate_task_rq_dl,
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ee26867da339..c58f38905e0a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }
 
+void dl_change_utilization(struct task_struct *p, u64 new_bw);
 extern void init_dl_bw(struct dl_bw *dl_b);
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -1493,6 +1494,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 extern struct dl_bandwidth def_dl_bandwidth;
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 
 unsigned long to_ratio(u64 period, u64 runtime);
 
-- 
cgit v1.3-14-g43fede


From 387e31300b5760169e6d3f7a9e1eeed12cc5a30b Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:30 +0200
Subject: sched/deadline: Fix the update of the total -deadline utilization

Now that the inactive timer can be armed to fire at the 0-lag time,
it is possible to use inactive_task_timer() to update the total
-deadline utilization (dl_b->total_bw) at the correct time, fixing
dl_overflow() and __setparam_dl().

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-4-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 38 ++++++++++++++------------------------
 kernel/sched/deadline.c | 28 +++++++++++++---------------
 2 files changed, 27 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 968c655ec5d9..126339daebd7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2475,9 +2475,6 @@ static inline int dl_bw_cpus(int i)
  * allocated bandwidth to reflect the new situation.
  *
  * This function is called while holding p's rq->lock.
- *
- * XXX we should delay bw change until the task's 0-lag point, see
- * __setparam_dl().
  */
 static int dl_overflow(struct task_struct *p, int policy,
 		       const struct sched_attr *attr)
@@ -2502,16 +2499,29 @@ static int dl_overflow(struct task_struct *p, int policy,
 	cpus = dl_bw_cpus(task_cpu(p));
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		if (hrtimer_active(&p->dl.inactive_timer))
+			__dl_clear(dl_b, p->dl.dl_bw);
 		__dl_add(dl_b, new_bw);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
 		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		/*
+		 * XXX this is slightly incorrect: when the task
+		 * utilization decreases, we should delay the total
+		 * utilization change until the task's 0-lag point.
+		 * But this would require to set the task's "inactive
+		 * timer" when the task is not inactive.
+		 */
 		__dl_clear(dl_b, p->dl.dl_bw);
 		__dl_add(dl_b, new_bw);
 		dl_change_utilization(p, new_bw);
 		err = 0;
 	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
-		__dl_clear(dl_b, p->dl.dl_bw);
+		/*
+		 * Do not decrease the total deadline utilization here,
+		 * switched_from_dl() will take care to do it at the correct
+		 * (0-lag) time.
+		 */
 		err = 0;
 	}
 	raw_spin_unlock(&dl_b->lock);
@@ -4020,26 +4030,6 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-
-	/*
-	 * Changing the parameters of a task is 'tricky' and we're not doing
-	 * the correct thing -- also see task_dead_dl() and switched_from_dl().
-	 *
-	 * What we SHOULD do is delay the bandwidth release until the 0-lag
-	 * point. This would include retaining the task_struct until that time
-	 * and change dl_overflow() to not immediately decrement the current
-	 * amount.
-	 *
-	 * Instead we retain the current runtime/deadline and let the new
-	 * parameters take effect after the current reservation period lapses.
-	 * This is safe (albeit pessimistic) because the 0-lag point is always
-	 * before the current scheduling deadline.
-	 *
-	 * We can still have temporary overloads because we do not delay the
-	 * change in bandwidth until that time; so admission control is
-	 * not on the safe side. It does however guarantee tasks will never
-	 * consume more than promised.
-	 */
 }
 
 /*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6480a929417c..add9cba1253c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -175,8 +175,14 @@ static void task_non_contending(struct task_struct *p)
 	if (zerolag_time < 0) {
 		if (dl_task(p))
 			sub_running_bw(dl_se->dl_bw, dl_rq);
-		if (!dl_task(p) || p->state == TASK_DEAD)
+		if (!dl_task(p) || p->state == TASK_DEAD) {
+			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
+			raw_spin_lock(&dl_b->lock);
+			__dl_clear(dl_b, p->dl.dl_bw);
 			__dl_clear_params(p);
+			raw_spin_unlock(&dl_b->lock);
+		}
 
 		return;
 	}
@@ -1004,10 +1010,16 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 	rq = task_rq_lock(p, &rf);
 
 	if (!dl_task(p) || p->state == TASK_DEAD) {
+		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+
 		if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
 			sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
 			dl_se->dl_non_contending = 0;
 		}
+
+		raw_spin_lock(&dl_b->lock);
+		__dl_clear(dl_b, p->dl.dl_bw);
+		raw_spin_unlock(&dl_b->lock);
 		__dl_clear_params(p);
 
 		goto unlock;
@@ -1534,19 +1546,6 @@ static void task_fork_dl(struct task_struct *p)
 	 */
 }
 
-static void task_dead_dl(struct task_struct *p)
-{
-	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-
-	/*
-	 * Since we are TASK_DEAD we won't slip out of the domain!
-	 */
-	raw_spin_lock_irq(&dl_b->lock);
-	/* XXX we should retain the bw until 0-lag */
-	dl_b->total_bw -= p->dl.dl_bw;
-	raw_spin_unlock_irq(&dl_b->lock);
-}
-
 static void set_curr_task_dl(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
@@ -2141,7 +2140,6 @@ const struct sched_class dl_sched_class = {
 	.set_curr_task		= set_curr_task_dl,
 	.task_tick		= task_tick_dl,
 	.task_fork              = task_fork_dl,
-	.task_dead		= task_dead_dl,
 
 	.prio_changed           = prio_changed_dl,
 	.switched_from		= switched_from_dl,
-- 
cgit v1.3-14-g43fede


From c52f14d384628db0217a7a9080ab800d5ffb2d72 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:31 +0200
Subject: sched/deadline: Implement GRUB accounting

According to the GRUB (Greedy Reclamation of Unused Bandwidth)
reclaiming algorithm, the runtime is not decreased as "dq = -dt",
but as "dq = -Uact dt" (where Uact is the per-runqueue active
utilization).
Hence, this commit modifies the runtime accounting rule in
update_curr_dl() to implement the GRUB rule.

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-5-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     |  4 ++--
 kernel/sched/deadline.c | 17 +++++++++++++++++
 kernel/sched/sched.h    |  2 ++
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 126339daebd7..b68a1fa05244 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2423,7 +2423,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 20;
+		return BW_UNIT;
 
 	/*
 	 * Doing this here saves a lot of checks in all
@@ -2433,7 +2433,7 @@ unsigned long to_ratio(u64 period, u64 runtime)
 	if (period == 0)
 		return 0;
 
-	return div64_u64(runtime << 20, period);
+	return div64_u64(runtime << BW_SHIFT, period);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index add9cba1253c..0bee537554f6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -917,6 +917,22 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 
+/*
+ * This function implements the GRUB accounting rule:
+ * according to the GRUB reclaiming algorithm, the runtime is
+ * not decreased as "dq = -dt", but as "dq = -Uact dt", where
+ * Uact is the (per-runqueue) active utilization.
+ * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result
+ * has to be shifted right by BW_SHIFT.
+ */
+u64 grub_reclaim(u64 delta, struct rq *rq)
+{
+	delta *= rq->dl.running_bw;
+	delta >>= BW_SHIFT;
+
+	return delta;
+}
+
 /*
  * Update the current task's runtime statistics (provided it is still
  * a -deadline task and has not been removed from the dl_rq).
@@ -959,6 +975,7 @@ static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
+	delta_exec = grub_reclaim(delta_exec, rq);
 	dl_se->runtime -= delta_exec;
 
 throttle:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c58f38905e0a..bb409ef40120 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1496,6 +1496,8 @@ extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
 
+#define BW_SHIFT	20
+#define BW_UNIT		(1 << BW_SHIFT)
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
-- 
cgit v1.3-14-g43fede


From 4da3abcefe178c650033f371e94fa10e80bce167 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:32 +0200
Subject: sched/deadline: Do not reclaim the whole CPU bandwidth

Original GRUB tends to reclaim 100% of the CPU time... And this
allows a CPU hog to starve non-deadline tasks.
To address this issue, allow the scheduler to reclaim only a
specified fraction of CPU time, stored in the new "bw_ratio"
field of the dl runqueue structure.

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-6-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 11 +++++++++++
 kernel/sched/deadline.c | 12 +++++++++++-
 kernel/sched/sched.h    |  8 ++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b68a1fa05244..7abd06400a98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6759,6 +6759,16 @@ static int sched_dl_global_validate(void)
 	return ret;
 }
 
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+	if (global_rt_runtime() == RUNTIME_INF) {
+		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+	} else {
+		dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+			  global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+	}
+}
+
 static void sched_dl_do_global(void)
 {
 	u64 new_bw = -1;
@@ -6784,6 +6794,7 @@ static void sched_dl_do_global(void)
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
 		rcu_read_unlock_sched();
+		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
 	}
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0bee537554f6..6a0614b9c8d7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -268,6 +268,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
 #endif
 
 	dl_rq->running_bw = 0;
+	init_dl_rq_bw_ratio(dl_rq);
 }
 
 #ifdef CONFIG_SMP
@@ -924,11 +925,20 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  * Uact is the (per-runqueue) active utilization.
  * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result
  * has to be shifted right by BW_SHIFT.
+ * To reclaim only a fraction Umax of the CPU time, the
+ * runtime accounting rule is modified as
+ * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains
+ * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted
+ * right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
  */
 u64 grub_reclaim(u64 delta, struct rq *rq)
 {
 	delta *= rq->dl.running_bw;
-	delta >>= BW_SHIFT;
+	delta *= rq->dl.bw_ratio;
+	delta >>= BW_SHIFT + RATIO_SHIFT;
 
 	return delta;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb409ef40120..878fe757d6ad 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -565,6 +565,12 @@ struct dl_rq {
 	 * task blocks
 	 */
 	u64 running_bw;
+
+	/*
+	 * Inverse of the fraction of CPU utilization that can be reclaimed
+	 * by the GRUB algorithm.
+	 */
+	u64 bw_ratio;
 };
 
 #ifdef CONFIG_SMP
@@ -1495,9 +1501,11 @@ extern struct dl_bandwidth def_dl_bandwidth;
 extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
 extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
+extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
 
 #define BW_SHIFT	20
 #define BW_UNIT		(1 << BW_SHIFT)
+#define RATIO_SHIFT	8
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
-- 
cgit v1.3-14-g43fede


From 2d4283e9d583a3ee8cfb1cbb9c1270614df4c29d Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:33 +0200
Subject: sched/deadline: Make GRUB a task's flag

This patch introduces the SCHED_FLAG_RECLAIM flag to specify
that a DL task is allowed to reclaim unused CPU time (using
the GRUB algorithm).

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-7-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/sched.h | 1 +
 kernel/sched/core.c        | 3 ++-
 kernel/sched/deadline.c    | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe019a720..e2a6c7b3510b 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,5 +47,6 @@
  * For the sched_{set,get}attr() calls
  */
 #define SCHED_FLAG_RESET_ON_FORK	0x01
+#define SCHED_FLAG_RECLAIM		0x02
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7abd06400a98..8d1a5a625814 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4195,7 +4195,8 @@ recheck:
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+	if (attr->sched_flags &
+		~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
 		return -EINVAL;
 
 	/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6a0614b9c8d7..61ea3039cdc1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -985,7 +985,8 @@ static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
-	delta_exec = grub_reclaim(delta_exec, rq);
+	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
+		delta_exec = grub_reclaim(delta_exec, rq);
 	dl_se->runtime -= delta_exec;
 
 throttle:
-- 
cgit v1.3-14-g43fede


From 8fd27231c3302e0c7e1907df1252db97b65eb241 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:34 +0200
Subject: sched/deadline: Track the "total rq utilization" too

The total rq utilization is defined as the sum of the utilizations of
tasks that are "assigned" to a runqueue, independently of their state
(TASK_RUNNING or blocked).

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Claudio Scordino <claudio@evidence.eu.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-8-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++--------------
 kernel/sched/sched.h    |  11 +++++
 2 files changed, 95 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 61ea3039cdc1..6c6a1f099d61 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -51,6 +51,7 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
 	dl_rq->running_bw += dl_bw;
 	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
+	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
 }
 
 static inline
@@ -65,25 +66,52 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 		dl_rq->running_bw = 0;
 }
 
+static inline
+void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+	u64 old = dl_rq->this_bw;
+
+	lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+	dl_rq->this_bw += dl_bw;
+	SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+}
+
+static inline
+void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+{
+	u64 old = dl_rq->this_bw;
+
+	lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
+	dl_rq->this_bw -= dl_bw;
+	SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+	if (dl_rq->this_bw > old)
+		dl_rq->this_bw = 0;
+	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+}
+
 void dl_change_utilization(struct task_struct *p, u64 new_bw)
 {
-	if (task_on_rq_queued(p))
-		return;
+	struct rq *rq;
 
-	if (!p->dl.dl_non_contending)
+	if (task_on_rq_queued(p))
 		return;
 
-	sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl);
-	p->dl.dl_non_contending = 0;
-	/*
-	 * If the timer handler is currently running and the
-	 * timer cannot be cancelled, inactive_task_timer()
-	 * will see that dl_not_contending is not set, and
-	 * will not touch the rq's active utilization,
-	 * so we are still safe.
-	 */
-	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
-		put_task_struct(p);
+	rq = task_rq(p);
+	if (p->dl.dl_non_contending) {
+		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		p->dl.dl_non_contending = 0;
+		/*
+		 * If the timer handler is currently running and the
+		 * timer cannot be cancelled, inactive_task_timer()
+		 * will see that dl_not_contending is not set, and
+		 * will not touch the rq's active utilization,
+		 * so we are still safe.
+		 */
+		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+			put_task_struct(p);
+	}
+	sub_rq_bw(p->dl.dl_bw, &rq->dl);
+	add_rq_bw(new_bw, &rq->dl);
 }
 
 /*
@@ -178,6 +206,8 @@ static void task_non_contending(struct task_struct *p)
 		if (!dl_task(p) || p->state == TASK_DEAD) {
 			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
+			if (p->state == TASK_DEAD)
+				sub_rq_bw(p->dl.dl_bw, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
 			__dl_clear(dl_b, p->dl.dl_bw);
 			__dl_clear_params(p);
@@ -192,7 +222,7 @@ static void task_non_contending(struct task_struct *p)
 	hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
 }
 
-static void task_contending(struct sched_dl_entity *dl_se)
+static void task_contending(struct sched_dl_entity *dl_se, int flags)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 
@@ -203,6 +233,9 @@ static void task_contending(struct sched_dl_entity *dl_se)
 	if (dl_se->dl_runtime == 0)
 		return;
 
+	if (flags & ENQUEUE_MIGRATED)
+		add_rq_bw(dl_se->dl_bw, dl_rq);
+
 	if (dl_se->dl_non_contending) {
 		dl_se->dl_non_contending = 0;
 		/*
@@ -268,6 +301,7 @@ void init_dl_rq(struct dl_rq *dl_rq)
 #endif
 
 	dl_rq->running_bw = 0;
+	dl_rq->this_bw = 0;
 	init_dl_rq_bw_ratio(dl_rq);
 }
 
@@ -1042,6 +1076,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 
 		if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
 			sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+			sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
 			dl_se->dl_non_contending = 0;
 		}
 
@@ -1207,7 +1242,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 	 * we want a replenishment of its runtime.
 	 */
 	if (flags & ENQUEUE_WAKEUP) {
-		task_contending(dl_se);
+		task_contending(dl_se, flags);
 		update_dl_entity(dl_se, pi_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se, pi_se);
@@ -1260,8 +1295,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
 		dl_check_constrained_dl(&p->dl);
 
-	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE)
+	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
+		add_rq_bw(p->dl.dl_bw, &rq->dl);
 		add_running_bw(p->dl.dl_bw, &rq->dl);
+	}
 
 	/*
 	 * If p is throttled, we do not enqueue it. In fact, if it exhausted
@@ -1277,7 +1314,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
 		if (flags & ENQUEUE_WAKEUP)
-			task_contending(&p->dl);
+			task_contending(&p->dl, flags);
 
 		return;
 	}
@@ -1299,8 +1336,10 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	update_curr_dl(rq);
 	__dequeue_task_dl(rq, p, flags);
 
-	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE)
+	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
 		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		sub_rq_bw(p->dl.dl_bw, &rq->dl);
+	}
 
 	/*
 	 * This check allows to start the inactive timer (or to immediately
@@ -1394,7 +1433,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
 {
 	struct rq *rq;
 
-	if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending))
+	if (p->state != TASK_WAKING)
 		return;
 
 	rq = task_rq(p);
@@ -1404,18 +1443,20 @@ static void migrate_task_rq_dl(struct task_struct *p)
 	 * rq->lock is not... So, lock it
 	 */
 	raw_spin_lock(&rq->lock);
-	sub_running_bw(p->dl.dl_bw, &rq->dl);
-	p->dl.dl_non_contending = 0;
-	/*
-	 * If the timer handler is currently running and the
-	 * timer cannot be cancelled, inactive_task_timer()
-	 * will see that dl_not_contending is not set, and
-	 * will not touch the rq's active utilization,
-	 * so we are still safe.
-	 */
-	if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
-		put_task_struct(p);
-
+	if (p->dl.dl_non_contending) {
+		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		p->dl.dl_non_contending = 0;
+		/*
+		 * If the timer handler is currently running and the
+		 * timer cannot be cancelled, inactive_task_timer()
+		 * will see that dl_not_contending is not set, and
+		 * will not touch the rq's active utilization,
+		 * so we are still safe.
+		 */
+		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
+			put_task_struct(p);
+	}
+	sub_rq_bw(p->dl.dl_bw, &rq->dl);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1858,7 +1899,9 @@ retry:
 
 	deactivate_task(rq, next_task, 0);
 	sub_running_bw(next_task->dl.dl_bw, &rq->dl);
+	sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
+	add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
 	add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
@@ -1948,7 +1991,9 @@ static void pull_dl_task(struct rq *this_rq)
 
 			deactivate_task(src_rq, p, 0);
 			sub_running_bw(p->dl.dl_bw, &src_rq->dl);
+			sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
+			add_rq_bw(p->dl.dl_bw, &this_rq->dl);
 			add_running_bw(p->dl.dl_bw, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
@@ -2057,6 +2102,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 	if (task_on_rq_queued(p) && p->dl.dl_runtime)
 		task_non_contending(p);
 
+	if (!task_on_rq_queued(p))
+		sub_rq_bw(p->dl.dl_bw, &rq->dl);
+
 	/*
 	 * We cannot use inactive_task_timer() to invoke sub_running_bw()
 	 * at the 0-lag time, because the task could have been migrated
@@ -2086,9 +2134,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 		put_task_struct(p);
 
 	/* If p is not queued we will update its parameters at next wakeup. */
-	if (!task_on_rq_queued(p))
-		return;
+	if (!task_on_rq_queued(p)) {
+		add_rq_bw(p->dl.dl_bw, &rq->dl);
 
+		return;
+	}
 	/*
 	 * If p is boosted we already updated its params in
 	 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 878fe757d6ad..b7321dac03c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -566,6 +566,17 @@ struct dl_rq {
 	 */
 	u64 running_bw;
 
+	/*
+	 * Utilization of the tasks "assigned" to this runqueue (including
+	 * the tasks that are in runqueue and the tasks that executed on this
+	 * CPU and blocked). Increased when a task moves to this runqueue, and
+	 * decreased when the task moves away (migrates, changes scheduling
+	 * policy, or terminates).
+	 * This is needed to compute the "inactive utilization" for the
+	 * runqueue (inactive utilization = this_bw - running_bw).
+	 */
+	u64 this_bw;
+
 	/*
 	 * Inverse of the fraction of CPU utilization that can be reclaimed
 	 * by the GRUB algorithm.
-- 
cgit v1.3-14-g43fede


From 9f0d1a5077399143aad7e1244bb031e29116074e Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:35 +0200
Subject: sched/deadline: Base GRUB reclaiming on the inactive utilization

Instead of decreasing the runtime as "dq = -Uact dt" (possibly
divided by the maximum utilization available for deadline tasks),
decrease it as "dq = -max{u, (1 - Uinact)} dt", where u is the task
utilization and Uinact is the "inactive utilization".
In this way, the maximum fraction of CPU time that can be reclaimed
is given by the total utilization of deadline tasks.
This approach solves a fairness issue with "traditional" global GRUB
reclaiming: using the traditional GRUB algorithm, if tasks are
allocated to the various cores in a non-uniform way, the
reclaiming mechanism allows some tasks to reclaim more time than
others. This issue is visible when starting 11 time-consuming tasks with
runtime 10ms and period 30ms (total utilization 3.666) on a 4-cores
system: some tasks will receive much more than the reserved runtime
(thanks to the reclaiming mechanism), while other tasks will receive
less than the reserved runtime.

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-9-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6c6a1f099d61..7d2f05778060 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -955,26 +955,30 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 /*
  * This function implements the GRUB accounting rule:
  * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as "dq = -Uact dt", where
- * Uact is the (per-runqueue) active utilization.
- * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result
- * has to be shifted right by BW_SHIFT.
- * To reclaim only a fraction Umax of the CPU time, the
- * runtime accounting rule is modified as
- * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains
- * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted
- * right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt",
+ * where u is the utilization of the task and Uinact is the
+ * (per-runqueue) inactive utilization, computed as the difference
+ * between the "total runqueue utilization" and the runqueue
+ * active utilization.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
  */
-u64 grub_reclaim(u64 delta, struct rq *rq)
+u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 {
-	delta *= rq->dl.running_bw;
-	delta *= rq->dl.bw_ratio;
-	delta >>= BW_SHIFT + RATIO_SHIFT;
+	u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
+	u64 u_act;
 
-	return delta;
+	/*
+	 * Instead of computing max{u, (1 - u_inact)}, we compare
+	 * u_inact with 1 - u, because u_inact can be larger than 1
+	 * (so, 1 - u_inact would be negative leading to wrong results)
+	 */
+	if (u_inact > BW_UNIT - dl_se->dl_bw)
+		u_act = dl_se->dl_bw;
+	else
+		u_act = BW_UNIT - u_inact;
+
+	return (delta * u_act) >> BW_SHIFT;
 }
 
 /*
@@ -1020,7 +1024,7 @@ static void update_curr_dl(struct rq *rq)
 	sched_rt_avg_update(rq, delta_exec);
 
 	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
-		delta_exec = grub_reclaim(delta_exec, rq);
+		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
 	dl_se->runtime -= delta_exec;
 
 throttle:
-- 
cgit v1.3-14-g43fede


From daec5798367012951cdb54fdb5c006e4379c9ae9 Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 18 May 2017 22:13:36 +0200
Subject: sched/deadline: Reclaim bandwidth not used by dl tasks

This commit introduces a per-runqueue "extra utilization" that can be
reclaimed by deadline tasks. In this way, the maximum fraction of CPU
time that can be reclaimed by deadline tasks is fixed (and configurable)
and does not depend on the total deadline utilization.
The GRUB accounting rule is modified to add this "extra utilization"
to the inactive utilization of the runqueue, and to avoid reclaiming
more than a maximum fraction of the CPU time.

Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Claudio Scordino <claudio@evidence.eu.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/1495138417-6203-10-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 17 ++++++++++-------
 kernel/sched/deadline.c | 42 +++++++++++++++++++++++++++---------------
 kernel/sched/sched.h    | 37 +++++++++++++++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d1a5a625814..799647927c4c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2444,7 +2444,7 @@ inline struct dl_bw *dl_bw_of(int i)
 	return &cpu_rq(i)->rd->dl_bw;
 }
 
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
 {
 	struct root_domain *rd = cpu_rq(i)->rd;
 	int cpus = 0;
@@ -2462,7 +2462,7 @@ inline struct dl_bw *dl_bw_of(int i)
 	return &cpu_rq(i)->dl.dl_bw;
 }
 
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
 {
 	return 1;
 }
@@ -2500,8 +2500,8 @@ static int dl_overflow(struct task_struct *p, int policy,
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
 		if (hrtimer_active(&p->dl.inactive_timer))
-			__dl_clear(dl_b, p->dl.dl_bw);
-		__dl_add(dl_b, new_bw);
+			__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_add(dl_b, new_bw, cpus);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
 		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
@@ -2512,8 +2512,8 @@ static int dl_overflow(struct task_struct *p, int policy,
 		 * But this would require to set the task's "inactive
 		 * timer" when the task is not inactive.
 		 */
-		__dl_clear(dl_b, p->dl.dl_bw);
-		__dl_add(dl_b, new_bw);
+		__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_add(dl_b, new_bw, cpus);
 		dl_change_utilization(p, new_bw);
 		err = 0;
 	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
@@ -5515,7 +5515,7 @@ int task_can_attach(struct task_struct *p,
 			 * We will free resources in the source root_domain
 			 * later on (see set_cpus_allowed_dl()).
 			 */
-			__dl_add(dl_b, p->dl.dl_bw);
+			__dl_add(dl_b, p->dl.dl_bw, cpus);
 		}
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 		rcu_read_unlock_sched();
@@ -6764,9 +6764,12 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
 {
 	if (global_rt_runtime() == RUNTIME_INF) {
 		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+		dl_rq->extra_bw = 1 << BW_SHIFT;
 	} else {
 		dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
 			  global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+		dl_rq->extra_bw = to_ratio(global_rt_period(),
+						    global_rt_runtime());
 	}
 }
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7d2f05778060..e3b25dfb74f3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -209,7 +209,7 @@ static void task_non_contending(struct task_struct *p)
 			if (p->state == TASK_DEAD)
 				sub_rq_bw(p->dl.dl_bw, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
-			__dl_clear(dl_b, p->dl.dl_bw);
+			__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 			__dl_clear_params(p);
 			raw_spin_unlock(&dl_b->lock);
 		}
@@ -955,28 +955,40 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 /*
  * This function implements the GRUB accounting rule:
  * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt",
- * where u is the utilization of the task and Uinact is the
- * (per-runqueue) inactive utilization, computed as the difference
- * between the "total runqueue utilization" and the runqueue
- * active utilization.
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
  * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
  */
 u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
 {
 	u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
 	u64 u_act;
+	u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
 
 	/*
-	 * Instead of computing max{u, (1 - u_inact)}, we compare
-	 * u_inact with 1 - u, because u_inact can be larger than 1
-	 * (so, 1 - u_inact would be negative leading to wrong results)
+	 * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+	 * we compare u_inact + rq->dl.extra_bw with
+	 * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+	 * u_inact + rq->dl.extra_bw can be larger than
+	 * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
+	 * leading to wrong results)
 	 */
-	if (u_inact > BW_UNIT - dl_se->dl_bw)
-		u_act = dl_se->dl_bw;
+	if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+		u_act = u_act_min;
 	else
-		u_act = BW_UNIT - u_inact;
+		u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
 
 	return (delta * u_act) >> BW_SHIFT;
 }
@@ -1085,7 +1097,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 		}
 
 		raw_spin_lock(&dl_b->lock);
-		__dl_clear(dl_b, p->dl.dl_bw);
+		__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&dl_b->lock);
 		__dl_clear_params(p);
 
@@ -2054,7 +2066,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		 * until we complete the update.
 		 */
 		raw_spin_lock(&src_dl_b->lock);
-		__dl_clear(src_dl_b, p->dl.dl_bw);
+		__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b7321dac03c1..f1e400c6403c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void)
 }
 
 extern struct dl_bw *dl_bw_of(int i);
+extern int dl_bw_cpus(int i);
 
 struct dl_bw {
 	raw_spinlock_t lock;
 	u64 bw, total_bw;
 };
 
+static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
+
 static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 {
 	dl_b->total_bw -= tsk_bw;
+	__dl_update(dl_b, (s32)tsk_bw / cpus);
 }
 
 static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 {
 	dl_b->total_bw += tsk_bw;
+	__dl_update(dl_b, -((s32)tsk_bw / cpus));
 }
 
 static inline
@@ -576,6 +581,7 @@ struct dl_rq {
 	 * runqueue (inactive utilization = this_bw - running_bw).
 	 */
 	u64 this_bw;
+	u64 extra_bw;
 
 	/*
 	 * Inverse of the fraction of CPU utilization that can be reclaimed
@@ -1958,6 +1964,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu);
 static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 #endif
 
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+	struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+	int i;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+	for_each_cpu_and(i, rd->span, cpu_active_mask) {
+		struct rq *rq = cpu_rq(i);
+
+		rq->dl.extra_bw += bw;
+	}
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+	dl->extra_bw += bw;
+}
+#endif
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
 	u64			total;
-- 
cgit v1.3-14-g43fede


From ae83b56a56f8d9643dedbee86b457fa1c5d42f59 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Wed, 10 May 2017 21:03:37 +0800
Subject: sched/deadline: Zero out positive runtime after throttling
 constrained tasks

When a constrained task is throttled by dl_check_constrained_dl(),
it may carry the remaining positive runtime. As a result, when
dl_task_timer() fires and calls replenish_dl_entity(), it will
not be replenished correctly due to the positive dl_se->runtime.

This patch assigns its runtime to 0 if positive after throttling.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luca Abeni <luca.abeni@santannapisa.it>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline")
Link: http://lkml.kernel.org/r/1494421417-27550-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e3b25dfb74f3..54302cf68bb9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -941,6 +941,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
 		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
 			return;
 		dl_se->dl_throttled = 1;
+		if (dl_se->runtime > 0)
+			dl_se->runtime = 0;
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 3effcb4247e74a51f5d8b775a1ee4abf87cc089a Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Mon, 29 May 2017 16:24:03 +0200
Subject: sched/deadline: Use the revised wakeup rule for suspending
 constrained dl tasks

We have been facing some problems with self-suspending constrained
deadline tasks. The main reason is that the original CBS was not
designed for such sort of tasks.

One problem reported by Xunlei Pang takes place when a task
suspends, and then is awakened before the deadline, but so close
to the deadline that its remaining runtime can cause the task
to have an absolute density higher than allowed. In such situation,
the original CBS assumes that the task is facing an early activation,
and so it replenishes the task and set another deadline, one deadline
in the future. This rule works fine for implicit deadline tasks.
Moreover, it allows the system to adapt the period of a task in which
the external event source suffered from a clock drift.

However, this opens the window for bandwidth leakage for constrained
deadline tasks. For instance, a task with the following parameters:

  runtime   = 5 ms
  deadline  = 7 ms
  [density] = 5 / 7 = 0.71
  period    = 1000 ms

If the task runs for 1 ms, and then suspends for another 1ms,
it will be awakened with the following parameters:

  remaining runtime = 4
  laxity = 5

presenting an absolute density of 4 / 5 = 0.80.

In this case, the original CBS would assume the task had an early
wakeup. Then, CBS will reset the runtime, and the absolute deadline will
be postponed by one relative deadline, allowing the task to run.

The problem is that, if the task runs this pattern forever, it will keep
receiving bandwidth, being able to run 1ms every 2ms. Following this
behavior, the task would be able to run 500 ms in 1 sec, thus running
more than the 5 ms / 1 sec the admission control allowed it to run.

Trying to address the self-suspending case, Luca Abeni, Giuseppe
Lipari, and Juri Lelli [1] revisited the CBS in order to deal with
self-suspending tasks. In the new approach, rather than
replenishing/postponing the absolute deadline, the revised wakeup rule
adjusts the remaining runtime, reducing it to fit into the allowed
density.

A revised version of the idea is:

At a given time t, the maximum absolute density of a task cannot be
higher than its relative density, that is:

  runtime / (deadline - t) <= dl_runtime / dl_deadline

Knowing the laxity of a task (deadline - t), it is possible to move
it to the other side of the inequality, which gives the maximum
remaining runtime a task can use within the absolute deadline, without
over-running the allowed density:

  runtime = (dl_runtime / dl_deadline) * (deadline - t)

For instance, in our previous example, the task could still run:

  runtime = ( 5 / 7 ) * 5
  runtime = 3.57 ms

Without causing damage to other deadline tasks. It is noteworthy
that the laxity cannot be negative because that would cause a negative
runtime. Thus, this patch depends on the patch:

  df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline")

Which throttles a constrained deadline task activated after the
deadline.

Finally, it is also possible to use the revised wakeup rule for
all other tasks, but that would require some more discussions
about pros and cons.

Reported-by: Xunlei Pang <xpang@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
[peterz: replaced dl_is_constrained with dl_is_implicit]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luca Abeni <luca.abeni@santannapisa.it>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Romulo Silva de Oliveira <romulo.deoliveira@ufsc.br>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.1495803804.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  1 +
 kernel/sched/core.c     |  2 +
 kernel/sched/deadline.c | 98 +++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 89 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3113c828483b..1f0f427e0292 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -422,6 +422,7 @@ struct sched_dl_entity {
 	u64				dl_deadline;	/* Relative deadline of each instance	*/
 	u64				dl_period;	/* Separation of two instances (period) */
 	u64				dl_bw;		/* dl_runtime / dl_period		*/
+	u64				dl_density;	/* dl_runtime / dl_deadline		*/
 
 	/*
 	 * Actual scheduling parameters. Initialized with the values above,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 799647927c4c..e5bd587e87f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2150,6 +2150,7 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_period = 0;
 	dl_se->flags = 0;
 	dl_se->dl_bw = 0;
+	dl_se->dl_density = 0;
 
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
@@ -4030,6 +4031,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
 }
 
 /*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 54302cf68bb9..e12f85975857 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -704,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
 }
 
 /*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather then
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
  *
- * The policy here is that we update the deadline of the entity only if:
- *  - the current deadline is in the past,
- *  - using the remaining runtime with the current deadline would make
- *    the entity exceed its bandwidth.
+ * Reasoning: a task may overrun the density if:
+ *    runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ *     runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+	u64 laxity = dl_se->deadline - rq_clock(rq);
+
+	/*
+	 * If the task has deadline < period, and the deadline is in the past,
+	 * it should already be throttled before this check.
+	 *
+	 * See update_dl_entity() comments for further details.
+	 */
+	WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+	dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
+ *
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+	return dl_se->dl_deadline == dl_se->dl_period;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
  */
 static void update_dl_entity(struct sched_dl_entity *dl_se,
 			     struct sched_dl_entity *pi_se)
@@ -720,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
 	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+		if (unlikely(!dl_is_implicit(dl_se) &&
+			     !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+			     !dl_se->dl_boosted)){
+			update_dl_revised_wakeup(dl_se, rq);
+			return;
+		}
+
 		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
 		dl_se->runtime = pi_se->dl_runtime;
 	}
@@ -1274,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
 	__dequeue_dl_entity(dl_se);
 }
 
-static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
-{
-	return dl_se->dl_deadline < dl_se->dl_period;
-}
-
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -1310,7 +1384,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	 * If that is the case, the task will be throttled and
 	 * the replenishment timer will be set to the next period.
 	 */
-	if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+	if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
 		dl_check_constrained_dl(&p->dl);
 
 	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
-- 
cgit v1.3-14-g43fede


From f5832c1998af2ca8d9947792d1c8e1816ab58e57 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Mon, 29 May 2017 17:02:57 -0400
Subject: sched/core: Omit building stop_sched_class when !SMP

The stop class is invoked through stop_machine only.
This is dead code on UP builds.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170529210302.26868-3-nicolas.pitre@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/Makefile |  4 ++--
 kernel/sched/core.c   | 60 +++++++++++++++++++++++++--------------------------
 kernel/sched/sched.h  |  4 ++++
 3 files changed, 36 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 89ab6758667b..5e4c2e7a632b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
+obj-y += idle_task.o fair.o rt.o deadline.o
 obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e5bd587e87f8..c343b8135774 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -788,36 +788,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 	dequeue_task(rq, p, flags);
 }
 
-void sched_set_stop_task(int cpu, struct task_struct *stop)
-{
-	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
-	struct task_struct *old_stop = cpu_rq(cpu)->stop;
-
-	if (stop) {
-		/*
-		 * Make it appear like a SCHED_FIFO task, its something
-		 * userspace knows about and won't get confused about.
-		 *
-		 * Also, it will make PI more or less work without too
-		 * much confusion -- but then, stop work should not
-		 * rely on PI working anyway.
-		 */
-		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
-
-		stop->sched_class = &stop_sched_class;
-	}
-
-	cpu_rq(cpu)->stop = stop;
-
-	if (old_stop) {
-		/*
-		 * Reset it back to a normal scheduling class so that
-		 * it can die in pieces.
-		 */
-		old_stop->sched_class = &rt_sched_class;
-	}
-}
-
 /*
  * __normal_prio - return the priority that is based on the static prio
  */
@@ -1588,6 +1558,36 @@ static void update_avg(u64 *avg, u64 sample)
 	*avg += diff >> 3;
 }
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, its something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+		stop->sched_class = &stop_sched_class;
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling class so that
+		 * it can die in pieces.
+		 */
+		old_stop->sched_class = &rt_sched_class;
+	}
+}
+
 #else
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f1e400c6403c..f2ef759a4cb6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1453,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
 	curr->sched_class->set_curr_task(rq);
 }
 
+#ifdef CONFIG_SMP
 #define sched_class_highest (&stop_sched_class)
+#else
+#define sched_class_highest (&dl_sched_class)
+#endif
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-- 
cgit v1.3-14-g43fede


From ebfa4c02fa4806bfef189e88152b833f2a732bff Mon Sep 17 00:00:00 2001
From: Aubrey Li <aubrey.li@intel.com>
Date: Wed, 7 Jun 2017 10:49:02 +0800
Subject: sched/idle: Add deferrable vmstat_updater back

Deferrable vmstat_updater was missing in commit:

  c1de45ca831a ("sched/idle: Add support for tasks that inject idle")

Add it back.

Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Aubrey Li <aubrey.li@intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1496803742-38274-1-git-send-email-aubrey.li@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ef63adce0c9c..6c23e30c0e5c 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@ static void do_idle(void)
 	 */
 
 	__current_set_polling();
+	quiet_vmstat();
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
-- 
cgit v1.3-14-g43fede


From f5694788ad8da5da41b501f3d6d2ae22379c4ef9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 19 Sep 2016 12:15:37 +0200
Subject: rt_mutex: Add lockdep annotations

Now that (PI) futexes have their own private RT-mutex interface and
implementation we can easily add lockdep annotations to the existing
RT-mutex interface.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rtmutex.h        | 25 +++++++++++++++++++++----
 kernel/locking/rtmutex-debug.c |  6 +++++-
 kernel/locking/rtmutex-debug.h |  2 +-
 kernel/locking/rtmutex.c       | 36 +++++++++++++++++++++++++++++-------
 kernel/locking/rtmutex.h       |  2 +-
 lib/Kconfig.debug              |  3 +++
 6 files changed, 60 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 1abba5ce2a2f..44fd002f7cd5 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -37,6 +37,9 @@ struct rt_mutex {
 	int			line;
 	void			*magic;
 #endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	dep_map;
+#endif
 };
 
 struct rt_mutex_waiter;
@@ -58,19 +61,33 @@ struct hrtimer_sleeper;
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
 	, .name = #mutexname, .file = __FILE__, .line = __LINE__
-# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, __func__)
+
+# define rt_mutex_init(mutex) \
+do { \
+	static struct lock_class_key __key; \
+	__rt_mutex_init(mutex, __func__, &__key); \
+} while (0)
+
  extern void rt_mutex_debug_task_free(struct task_struct *tsk);
 #else
 # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname)
-# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL)
+# define rt_mutex_init(mutex)			__rt_mutex_init(mutex, NULL, NULL)
 # define rt_mutex_debug_task_free(t)			do { } while (0)
 #endif
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \
+	, .dep_map = { .name = #mutexname }
+#else
+#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)
+#endif
+
 #define __RT_MUTEX_INITIALIZER(mutexname) \
 	{ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \
 	, .waiters = RB_ROOT \
 	, .owner = NULL \
-	__DEBUG_RT_MUTEX_INITIALIZER(mutexname)}
+	__DEBUG_RT_MUTEX_INITIALIZER(mutexname) \
+	__DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)}
 
 #define DEFINE_RT_MUTEX(mutexname) \
 	struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname)
@@ -86,7 +103,7 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock)
 	return lock->owner != NULL;
 }
 
-extern void __rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
 extern void rt_mutex_destroy(struct rt_mutex *lock);
 
 extern void rt_mutex_lock(struct rt_mutex *lock);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 58e366ad36f4..ac35e648b0e5 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
 	memset(waiter, 0x22, sizeof(*waiter));
 }
 
-void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
+void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key)
 {
 	/*
 	 * Make sure we are not reinitializing a held lock:
 	 */
 	debug_check_no_locks_freed((void *)lock, sizeof(*lock));
 	lock->name = name;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
 }
 
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index b585af9a1b50..5078c6ddf4a5 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -11,7 +11,7 @@
 
 extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
-extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key);
 extern void debug_rt_mutex_lock(struct rt_mutex *lock);
 extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
 extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 28cd09e635ed..43123533e9b1 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
 {
 	might_sleep();
 
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock);
@@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
  */
 int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 {
+	int ret;
+
 	might_sleep();
 
-	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
@@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 int
 rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
 {
+	int ret;
+
 	might_sleep();
 
-	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+	mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+	ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
 				       RT_MUTEX_MIN_CHAINWALK,
 				       rt_mutex_slowlock);
+	if (ret)
+		mutex_release(&lock->dep_map, 1, _RET_IP_);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
 
@@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
  */
 int __sched rt_mutex_trylock(struct rt_mutex *lock)
 {
+	int ret;
+
 	if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq()))
 		return 0;
 
-	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+	ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+	if (ret)
+		mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(rt_mutex_trylock);
 
@@ -1561,6 +1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock);
  */
 void __sched rt_mutex_unlock(struct rt_mutex *lock)
 {
+	mutex_release(&lock->dep_map, 1, _RET_IP_);
 	rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
@@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock)
 	lock->magic = NULL;
 #endif
 }
-
 EXPORT_SYMBOL_GPL(rt_mutex_destroy);
 
 /**
@@ -1632,14 +1653,15 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
  *
  * Initializing of a locked rt lock is not allowed
  */
-void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+void __rt_mutex_init(struct rt_mutex *lock, const char *name,
+		     struct lock_class_key *key)
 {
 	lock->owner = NULL;
 	raw_spin_lock_init(&lock->wait_lock);
 	lock->waiters = RB_ROOT;
 	lock->waiters_leftmost = NULL;
 
-	debug_rt_mutex_init(lock, name);
+	debug_rt_mutex_init(lock, name, key);
 }
 EXPORT_SYMBOL_GPL(__rt_mutex_init);
 
@@ -1660,7 +1682,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
 void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				struct task_struct *proxy_owner)
 {
-	__rt_mutex_init(lock, NULL);
+	__rt_mutex_init(lock, NULL, NULL);
 	debug_rt_mutex_proxy_lock(lock, proxy_owner);
 	rt_mutex_set_owner(lock, proxy_owner);
 }
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index 6607802efa8b..5c253caffe91 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -17,7 +17,7 @@
 #define debug_rt_mutex_proxy_lock(l,p)			do { } while (0)
 #define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
 #define debug_rt_mutex_unlock(l)			do { } while (0)
-#define debug_rt_mutex_init(m, n)			do { } while (0)
+#define debug_rt_mutex_init(m, n, k)			do { } while (0)
 #define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
 #define debug_rt_mutex_print_deadlock(w)		do { } while (0)
 #define debug_rt_mutex_reset_waiter(w)			do { } while (0)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e4587ebe52c7..ca615129aec5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1052,6 +1052,7 @@ config DEBUG_LOCK_ALLOC
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
+	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select LOCKDEP
 	help
 	 This feature will check whether any held lock (spinlock, rwlock,
@@ -1067,6 +1068,7 @@ config PROVE_LOCKING
 	select LOCKDEP
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
+	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
 	select TRACE_IRQFLAGS
 	default n
@@ -1121,6 +1123,7 @@ config LOCK_STAT
 	select LOCKDEP
 	select DEBUG_SPINLOCK
 	select DEBUG_MUTEXES
+	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
 	default n
 	help
-- 
cgit v1.3-14-g43fede


From f92c734f02cbf10e40569facff82059ae9b61920 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 10 Apr 2017 15:40:35 -0700
Subject: rcu: Prevent rcu_barrier() from starting needless grace periods

Currently rcu_barrier() uses call_rcu() to enqueue new callbacks
on each CPU with a non-empty callback list.  This works, but means
that rcu_barrier() forces grace periods that are not otherwise needed.
The key point is that rcu_barrier() never needs to wait for a grace
period, but instead only for all pre-existing callbacks to be invoked.
This means that rcu_barrier()'s new callbacks should be placed in
the callback-list segment containing the last pre-existing callback.

This commit makes this change using the new rcu_segcblist_entrain()
function.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h |  1 +
 kernel/rcu/tree.c          | 10 ++++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index e3facb356838..91dc089d65b7 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -742,6 +742,7 @@ TRACE_EVENT(rcu_torture_read,
  *	"OnlineQ": _rcu_barrier() found online CPU with callbacks.
  *	"OnlineNQ": _rcu_barrier() found online CPU, no callbacks.
  *	"IRQ": An rcu_barrier_callback() callback posted on remote CPU.
+ *	"IRQNQ": An rcu_barrier_callback() callback found no callbacks.
  *	"CB": An rcu_barrier_callback() invoked a callback, not the last.
  *	"LastCB": An rcu_barrier_callback() invoked the last callback.
  *	"Inc2": _rcu_barrier() piggyback check counter incremented.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e354e475e645..657056c3e0cd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3578,8 +3578,14 @@ static void rcu_barrier_func(void *type)
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
 	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
-	atomic_inc(&rsp->barrier_cpu_count);
-	rsp->call(&rdp->barrier_head, rcu_barrier_callback);
+	rdp->barrier_head.func = rcu_barrier_callback;
+	debug_rcu_head_queue(&rdp->barrier_head);
+	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
+		atomic_inc(&rsp->barrier_cpu_count);
+	} else {
+		debug_rcu_head_unqueue(&rdp->barrier_head);
+		_rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+	}
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 881ec9d209d5371c21db89ca1bb19afd3fcadab3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 15:16:50 -0700
Subject: srcu: Eliminate possibility of destructive counter overflow

Earlier versions of Tree SRCU were subject to a counter overflow bug that
could theoretically result in too-short grace periods.  This commit
eliminates this problem by adding an update-side memory barrier.
The short explanation is that if the updater sums the unlock counts
too late to see a given __srcu_read_unlock() increment, that CPU's
next __srcu_read_lock() must see the new value of ->srcu_idx, thus
incrementing the other bank of counters.  This eliminates the possibility
of destructive counter overflow as long as the srcu_read_lock() nesting
level does not exceed floor(ULONG_MAX/NR_CPUS/2), which should be an
eminently reasonable nesting limit, especially on 64-bit systems.

Reported-by: Lance Roy <ldr709@gmail.com>
Suggested-by: Lance Roy <ldr709@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 157654fa436a..fceca84df6b0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -275,15 +275,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 	 * not mean that there are no more readers, as one could have read
 	 * the current index but not have incremented the lock counter yet.
 	 *
-	 * Possible bug: There is no guarantee that there haven't been
-	 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
-	 * counted, meaning that this could return true even if there are
-	 * still active readers.  Since there are no memory barriers around
-	 * srcu_flip(), the CPU is not required to increment ->srcu_idx
-	 * before running srcu_readers_unlock_idx(), which means that there
-	 * could be an arbitrarily large number of critical sections that
-	 * execute after srcu_readers_unlock_idx() but use the old value
-	 * of ->srcu_idx.
+	 * So suppose that the updater is preempted here for so long
+	 * that more than ULONG_MAX non-nested readers come and go in
+	 * the meantime.  It turns out that this cannot result in overflow
+	 * because if a reader modifies its unlock count after we read it
+	 * above, then that reader's next load of ->srcu_idx is guaranteed
+	 * to get the new value, which will cause it to operate on the
+	 * other bank of counters, where it cannot contribute to the
+	 * overflow of these counters.  This means that there is a maximum
+	 * of 2*NR_CPUS increments, which cannot overflow given current
+	 * systems, especially not on 64-bit systems.
+	 *
+	 * OK, how about nesting?  This does impose a limit on nesting
+	 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
+	 * especially on 64-bit systems.
 	 */
 	return srcu_readers_lock_idx(sp, idx) == unlocks;
 }
@@ -671,6 +676,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
  */
 static void srcu_flip(struct srcu_struct *sp)
 {
+	/*
+	 * Ensure that if this updater saw a given reader's increment
+	 * from __srcu_read_lock(), that reader was using an old value
+	 * of ->srcu_idx.  Also ensure that if a given reader sees the
+	 * new value of ->srcu_idx, this updater's earlier scans cannot
+	 * have seen that reader's increments (which is OK, because this
+	 * grace period need not wait on that reader).
+	 */
+	smp_mb(); /* E */  /* Pairs with B and C. */
+
 	WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
 
 	/*
-- 
cgit v1.3-14-g43fede


From 5b72f9643b52a5148bb8ced126e20563adfa3466 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 15:29:14 -0700
Subject: rcu: Complain if blocking in preemptible RCU read-side critical
 section

Although preemptible RCU allows its read-side critical sections to be
preempted, general blocking is forbidden.  The reason for this is that
excessive preemption times can be handled by CONFIG_RCU_BOOST=y, but a
voluntarily blocked task doesn't care how high you boost its priority.
Because preemptible RCU is a global mechanism, one ill-behaved reader
hurts everyone.  Hence the prohibition against general blocking in
RCU-preempt read-side critical sections.  Preemption yes, blocking no.

This commit enforces this prohibition.

There is a special exception for the -rt patchset (which they kindly
volunteered to implement):  It is OK to block (as opposed to merely being
preempted) within an RCU-preempt read-side critical section, but only if
the blocking is subject to priority inheritance.  This exception permits
CONFIG_RCU_BOOST=y to get -rt RCU readers out of trouble.

Why doesn't this exception also apply to mainline's rt_mutex?  Because
of the possibility that someone does general blocking while holding
an rt_mutex.  Yes, the priority boosting will affect the rt_mutex,
but it won't help with the task doing general blocking while holding
that rt_mutex.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 2 +-
 kernel/rcu/tree.h        | 2 +-
 kernel/rcu/tree_plugin.h | 5 +++--
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 657056c3e0cd..9ce682242e99 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -478,7 +478,7 @@ void rcu_note_context_switch(bool preempt)
 	barrier(); /* Avoid RCU read-side critical sections leaking down. */
 	trace_rcu_utilization(TPS("Start context switch"));
 	rcu_sched_qs();
-	rcu_preempt_note_context_switch();
+	rcu_preempt_note_context_switch(preempt);
 	/* Load rcu_urgent_qs before other flags. */
 	if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs)))
 		goto out;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ba38262c3554..0fa7aee9ef55 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -477,7 +477,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
 
 /* Forward declarations for rcutree_plugin.h */
 static void rcu_bootup_announce(void);
-static void rcu_preempt_note_context_switch(void);
+static void rcu_preempt_note_context_switch(bool preempt);
 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
 static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c9a48657512a..a421753e8e9c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -286,12 +286,13 @@ static void rcu_preempt_qs(void)
  *
  * Caller must disable interrupts.
  */
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
 {
 	struct task_struct *t = current;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
+	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
 	if (t->rcu_read_lock_nesting > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
 
@@ -738,7 +739,7 @@ static void __init rcu_bootup_announce(void)
  * Because preemptible RCU does not exist, we never have to check for
  * CPUs being in quiescent states.
  */
-static void rcu_preempt_note_context_switch(void)
+static void rcu_preempt_note_context_switch(bool preempt)
 {
 }
 
-- 
cgit v1.3-14-g43fede


From 9683937df9ebf4eb62cdedb09c5f20a0760c7d80 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 14 Apr 2017 16:12:52 -0700
Subject: rcuperf: Defer expedited/normal check to end of test

Current rcuperf startup checks to see if the user asked to measure
only expedited grace periods, yet constrained all grace periods to be
normal, or if the user asked to measure only normal grace periods, yet
constrained all grace periods to be expedited.  Useless tests of this
sort are aborted.

Unfortunately, making RCU work through the mid-boot dead zone [1] puts
RCU into expedited-only mode during that zone.  Which happens to also
be the exact time that rcuperf carries out the aforementioned check.
So if the user asks rcuperf to measure only normal grace periods (the
default), rcuperf will now always complain and terminate the test.

This commit therefore moves the checks to rcu_perf_cleanup().  This has
the disadvantage of failing to abort useless tests, but avoids the need to
create yet another kthread and the need to do fiddly checks involving the
holdoff time.  (Yes, another approach is to do the checks in a late-stage
init function, but that would require some way to communicate badness
to rcuperf's kthreads, and seems not worth the bother.)

[1] https://lwn.net/Articles/716148/

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index a4a86fb47e4a..ef5b1faac495 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -452,6 +452,15 @@ rcu_perf_cleanup(void)
 	u64 *wdp;
 	u64 *wdpp;
 
+	/*
+	 * Would like warning at start, but everything is expedited
+	 * during the mid-boot phase, so have to wait till the end.
+	 */
+	if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp)
+		VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
+	if (rcu_gp_is_normal() && gp_exp)
+		VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+
 	if (torture_cleanup_begin())
 		return;
 
@@ -624,16 +633,6 @@ rcu_perf_init(void)
 		firsterr = -ENOMEM;
 		goto unwind;
 	}
-	if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) {
-		VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
-		firsterr = -EINVAL;
-		goto unwind;
-	}
-	if (rcu_gp_is_normal() && gp_exp) {
-		VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
-		firsterr = -EINVAL;
-		goto unwind;
-	}
 	for (i = 0; i < nrealwriters; i++) {
 		writer_durations[i] =
 			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
-- 
cgit v1.3-14-g43fede


From e28371c891db29c892d85322ea27ad997cc50f72 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 17 Apr 2017 09:59:53 -0700
Subject: rcu: Remove obsolete reference to synchronize_kernel()

The synchronize_kernel() primitive was removed in favor of
synchronize_sched() more than a decade ago, and it seems likely that
rather few kernel hackers are familiar with it.  Its continued presence
is therefore providing more confusion than enlightenment.  This commit
therefore removes the reference from the synchronize_sched() header
comment, and adds the corresponding information to the synchronize_rcu()
header comment.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 6 ------
 kernel/rcu/tree_plugin.h | 9 +++++++--
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9ce682242e99..3bee58fc23b1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3280,12 +3280,6 @@ static inline int rcu_blocking_is_gp(void)
  * to have executed a full memory barrier during the execution of
  * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
  * again only if the system has more than one CPU).
- *
- * This primitive provides the guarantees made by the (now removed)
- * synchronize_kernel() API.  In contrast, synchronize_rcu() only
- * guarantees that rcu_read_lock() sections will have completed.
- * In "classic RCU", these two guarantees happen to be one and
- * the same, but can differ in realtime RCU implementations.
  */
 void synchronize_sched(void)
 {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a421753e8e9c..3b432fa4c45b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -664,8 +664,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * synchronize_rcu() was waiting.  RCU read-side critical sections are
  * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
  *
- * See the description of synchronize_sched() for more detailed information
- * on memory ordering guarantees.
+ * See the description of synchronize_sched() for more detailed
+ * information on memory-ordering guarantees.  However, please note
+ * that -only- the memory-ordering guarantees apply.  For example,
+ * synchronize_rcu() is -not- guaranteed to wait on things like code
+ * protected by preempt_disable(), instead, synchronize_rcu() is -only-
+ * guaranteed to wait on RCU read-side critical sections, that is, sections
+ * of code protected by rcu_read_lock().
  */
 void synchronize_rcu(void)
 {
-- 
cgit v1.3-14-g43fede


From 881ed593a323c832c2e9383effeb6a0c99859210 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 17 Apr 2017 12:47:10 -0700
Subject: rcuperf: Add ability to performance-test call_rcu() and friends

This commit upgrades rcuperf so that it can do performance testing on
asynchronous grace-period primitives such as call_srcu().  There is
a new rcuperf.gp_async module parameter that specifies this new behavior,
with the pre-existing rcuperf.gp_exp testing expedited grace periods such as
synchronize_rcu_expedited, and with the default being to test synchronous
non-expedited grace periods such as synchronize_rcu().

There is also a new rcuperf.gp_async_max module parameter that specifies
the maximum number of outstanding callbacks per writer kthread, defaulting
to 1,000.  When this limit is exceeded, the writer thread invokes the
appropriate flavor of rcu_barrier() to wait for callbacks to drain.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Removed the redundant initialization noted by Arnd Bergmann. ]
---
 Documentation/admin-guide/kernel-parameters.txt | 11 ++++
 kernel/rcu/rcuperf.c                            | 69 +++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 15f79c27748d..3598464ca8ed 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3319,6 +3319,17 @@
 			This wake_up() will be accompanied by a
 			WARN_ONCE() splat and an ftrace_dump().
 
+	rcuperf.gp_async= [KNL]
+			Measure performance of asynchronous
+			grace-period primitives such as call_rcu().
+
+	rcuperf.gp_async_max= [KNL]
+			Specify the maximum number of outstanding
+			callbacks per writer thread.  When a writer
+			thread exceeds this limit, it invokes the
+			corresponding flavor of rcu_barrier() to allow
+			previously posted callbacks to drain.
+
 	rcuperf.gp_exp= [KNL]
 			Measure performance of expedited synchronous
 			grace-period primitives.
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index ef5b1faac495..e1ce97bead94 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -59,6 +59,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
 #define VERBOSE_PERFOUT_ERRSTRING(s) \
 	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
 
+torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
+torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
 torture_param(int, nreaders, -1, "Number of RCU reader threads");
@@ -86,13 +88,16 @@ static u64 t_rcu_perf_writer_started;
 static u64 t_rcu_perf_writer_finished;
 static unsigned long b_rcu_perf_writer_started;
 static unsigned long b_rcu_perf_writer_finished;
+static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 static int rcu_perf_writer_state;
 #define RTWS_INIT		0
-#define RTWS_EXP_SYNC		1
-#define RTWS_SYNC		2
-#define RTWS_IDLE		2
-#define RTWS_STOPPING		3
+#define RTWS_ASYNC		1
+#define RTWS_BARRIER		2
+#define RTWS_EXP_SYNC		3
+#define RTWS_SYNC		4
+#define RTWS_IDLE		5
+#define RTWS_STOPPING		6
 
 #define MAX_MEAS 10000
 #define MIN_MEAS 100
@@ -114,6 +119,8 @@ struct rcu_perf_ops {
 	unsigned long (*started)(void);
 	unsigned long (*completed)(void);
 	unsigned long (*exp_completed)(void);
+	void (*async)(struct rcu_head *head, rcu_callback_t func);
+	void (*gp_barrier)(void);
 	void (*sync)(void);
 	void (*exp_sync)(void);
 	const char *name;
@@ -153,6 +160,8 @@ static struct rcu_perf_ops rcu_ops = {
 	.started	= rcu_batches_started,
 	.completed	= rcu_batches_completed,
 	.exp_completed	= rcu_exp_batches_completed,
+	.async		= call_rcu,
+	.gp_barrier	= rcu_barrier,
 	.sync		= synchronize_rcu,
 	.exp_sync	= synchronize_rcu_expedited,
 	.name		= "rcu"
@@ -181,6 +190,8 @@ static struct rcu_perf_ops rcu_bh_ops = {
 	.started	= rcu_batches_started_bh,
 	.completed	= rcu_batches_completed_bh,
 	.exp_completed	= rcu_exp_batches_completed_sched,
+	.async		= call_rcu_bh,
+	.gp_barrier	= rcu_barrier_bh,
 	.sync		= synchronize_rcu_bh,
 	.exp_sync	= synchronize_rcu_bh_expedited,
 	.name		= "rcu_bh"
@@ -208,6 +219,16 @@ static unsigned long srcu_perf_completed(void)
 	return srcu_batches_completed(srcu_ctlp);
 }
 
+static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	call_srcu(srcu_ctlp, head, func);
+}
+
+static void srcu_rcu_barrier(void)
+{
+	srcu_barrier(srcu_ctlp);
+}
+
 static void srcu_perf_synchronize(void)
 {
 	synchronize_srcu(srcu_ctlp);
@@ -226,6 +247,8 @@ static struct rcu_perf_ops srcu_ops = {
 	.started	= NULL,
 	.completed	= srcu_perf_completed,
 	.exp_completed	= srcu_perf_completed,
+	.async		= srcu_call_rcu,
+	.gp_barrier	= srcu_rcu_barrier,
 	.sync		= srcu_perf_synchronize,
 	.exp_sync	= srcu_perf_synchronize_expedited,
 	.name		= "srcu"
@@ -254,6 +277,8 @@ static struct rcu_perf_ops sched_ops = {
 	.started	= rcu_batches_started_sched,
 	.completed	= rcu_batches_completed_sched,
 	.exp_completed	= rcu_exp_batches_completed_sched,
+	.async		= call_rcu_sched,
+	.gp_barrier	= rcu_barrier_sched,
 	.sync		= synchronize_sched,
 	.exp_sync	= synchronize_sched_expedited,
 	.name		= "sched"
@@ -281,6 +306,8 @@ static struct rcu_perf_ops tasks_ops = {
 	.readunlock	= tasks_perf_read_unlock,
 	.started	= rcu_no_completed,
 	.completed	= rcu_no_completed,
+	.async		= call_rcu_tasks,
+	.gp_barrier	= rcu_barrier_tasks,
 	.sync		= synchronize_rcu_tasks,
 	.exp_sync	= synchronize_rcu_tasks,
 	.name		= "tasks"
@@ -343,6 +370,15 @@ rcu_perf_reader(void *arg)
 	return 0;
 }
 
+/*
+ * Callback function for asynchronous grace periods from rcu_perf_writer().
+ */
+static void rcu_perf_async_cb(struct rcu_head *rhp)
+{
+	atomic_dec(this_cpu_ptr(&n_async_inflight));
+	kfree(rhp);
+}
+
 /*
  * RCU perf writer kthread.  Repeatedly does a grace period.
  */
@@ -352,6 +388,7 @@ rcu_perf_writer(void *arg)
 	int i = 0;
 	int i_max;
 	long me = (long)arg;
+	struct rcu_head *rhp = NULL;
 	struct sched_param sp;
 	bool started = false, done = false, alldone = false;
 	u64 t;
@@ -382,7 +419,23 @@ rcu_perf_writer(void *arg)
 	do {
 		wdp = &wdpp[i];
 		*wdp = ktime_get_mono_fast_ns();
-		if (gp_exp) {
+		if (gp_async) {
+retry:
+			if (!rhp)
+				rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
+			if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) {
+				rcu_perf_writer_state = RTWS_ASYNC;
+				atomic_inc(this_cpu_ptr(&n_async_inflight));
+				cur_ops->async(rhp, rcu_perf_async_cb);
+				rhp = NULL;
+			} else if (!kthread_should_stop()) {
+				rcu_perf_writer_state = RTWS_BARRIER;
+				cur_ops->gp_barrier();
+				goto retry;
+			} else {
+				kfree(rhp); /* Because we are stopping. */
+			}
+		} else if (gp_exp) {
 			rcu_perf_writer_state = RTWS_EXP_SYNC;
 			cur_ops->exp_sync();
 		} else {
@@ -429,6 +482,10 @@ rcu_perf_writer(void *arg)
 			i++;
 		rcu_perf_wait_shutdown();
 	} while (!torture_must_stop());
+	if (gp_async) {
+		rcu_perf_writer_state = RTWS_BARRIER;
+		cur_ops->gp_barrier();
+	}
 	rcu_perf_writer_state = RTWS_STOPPING;
 	writer_n_durations[me] = i_max;
 	torture_kthread_stopping("rcu_perf_writer");
@@ -460,6 +517,8 @@ rcu_perf_cleanup(void)
 		VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!");
 	if (rcu_gp_is_normal() && gp_exp)
 		VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!");
+	if (gp_exp && gp_async)
+		VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!");
 
 	if (torture_cleanup_begin())
 		return;
-- 
cgit v1.3-14-g43fede


From dcfc315b7b7a5e7668dd9cb2474708b51ab1cdb1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 18 Apr 2017 09:53:07 -0700
Subject: rcu: Make sync_rcu_preempt_exp_done() return bool

The sync_rcu_preempt_exp_done() function returns a logical expression,
but its return type is nevertheless int.  This commit therefore changes
the return type to bool.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index e513b4ab1197..dd21ca47e4b4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
  *
  * Caller must hold the rcu_state's exp_mutex.
  */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
 	return rnp->exp_tasks == NULL &&
 	       READ_ONCE(rnp->expmask) == 0;
-- 
cgit v1.3-14-g43fede


From f60cb4d4c8e568c2d63d01d72e589c4f75ee4140 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 19 Apr 2017 13:43:21 -0700
Subject: rcuperf: Add test for dynamically initialized srcu_struct

This commit adds a perf_type of "srcud", which specifies that rcuperf
test SRCU on a dynamically initialized srcu_struct.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index e1ce97bead94..5158ddba6716 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -254,6 +254,35 @@ static struct rcu_perf_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+static struct srcu_struct srcud;
+
+static void srcu_sync_perf_init(void)
+{
+	srcu_ctlp = &srcud;
+	init_srcu_struct(srcu_ctlp);
+}
+
+static void srcu_sync_perf_cleanup(void)
+{
+	cleanup_srcu_struct(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcud_ops = {
+	.ptype		= SRCU_FLAVOR,
+	.init		= srcu_sync_perf_init,
+	.cleanup	= srcu_sync_perf_cleanup,
+	.readlock	= srcu_perf_read_lock,
+	.readunlock	= srcu_perf_read_unlock,
+	.started	= NULL,
+	.completed	= srcu_perf_completed,
+	.exp_completed	= srcu_perf_completed,
+	.async		= srcu_call_rcu,
+	.gp_barrier	= srcu_rcu_barrier,
+	.sync		= srcu_perf_synchronize,
+	.exp_sync	= srcu_perf_synchronize_expedited,
+	.name		= "srcud"
+};
+
 /*
  * Definitions for sched perf testing.
  */
@@ -622,7 +651,7 @@ rcu_perf_init(void)
 	long i;
 	int firsterr = 0;
 	static struct rcu_perf_ops *perf_ops[] = {
-		&rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+		&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
 		RCUPERF_TASKS_OPS
 	};
 
-- 
cgit v1.3-14-g43fede


From 1f4f6da1c80905830337c3ff46a2d3260dabb864 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 21 Apr 2017 11:16:32 -0700
Subject: srcu: Make Classic and Tree SRCU announce themselves at bootup

Currently, the only way to tell whether a given kernel is running
Classic, Tiny, or Tree SRCU is to look at the .config file, which
can easily be lost or associated with the wrong kernel.  This commit
therefore has Classic and Tree SRCU identify themselves at boot time.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcu.c     | 7 +++++++
 kernel/rcu/srcutree.c | 7 +++++++
 2 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index dea03614263f..4e3f558409a0 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -659,3 +659,10 @@ void process_srcu(struct work_struct *work)
 	srcu_reschedule(sp);
 }
 EXPORT_SYMBOL_GPL(process_srcu);
+
+static int __init srcu_bootup_announce(void)
+{
+	pr_info("Classic SRCU implementation.\n");
+	return 0;
+}
+early_initcall(srcu_bootup_announce);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index fceca84df6b0..03d57fe9f094 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1167,3 +1167,10 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 	*gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
 }
 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+
+static int __init srcu_bootup_announce(void)
+{
+	pr_info("Hierarchical SRCU implementation.\n");
+	return 0;
+}
+early_initcall(srcu_bootup_announce);
-- 
cgit v1.3-14-g43fede


From 3ddf20c953520203c42dbed1f091ed52080e1cd2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 21 Apr 2017 13:33:20 -0700
Subject: srcu: Shrink Tiny SRCU a bit more

This commit rearranges Tiny SRCU's srcu_struct structure, substitutes
u8 for bool, and shrinks counters down to short.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcutiny.h | 8 ++++----
 kernel/rcu/rcutorture.c  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 42311ee0334f..b8859179b001 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -27,15 +27,15 @@
 #include <linux/swait.h>
 
 struct srcu_struct {
-	int srcu_lock_nesting[2];	/* srcu_read_lock() nesting depth. */
+	short srcu_lock_nesting[2];	/* srcu_read_lock() nesting depth. */
+	short srcu_idx;			/* Current reader array element. */
+	u8 srcu_gp_running;		/* GP workqueue running? */
+	u8 srcu_gp_waiting;		/* GP waiting for readers? */
 	struct swait_queue_head srcu_wq;
 					/* Last srcu_read_unlock() wakes GP. */
 	unsigned long srcu_gp_seq;	/* GP seq # for callback tagging. */
 	struct rcu_segcblist srcu_cblist;
 					/* Pending SRCU callbacks. */
-	int srcu_idx;			/* Current reader array element. */
-	bool srcu_gp_running;		/* GP workqueue running? */
-	bool srcu_gp_waiting;		/* GP waiting for readers? */
 	struct work_struct srcu_work;	/* For driving grace periods. */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index ae6e574d4cf5..a58592b73f19 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -609,7 +609,7 @@ static void srcu_torture_stats(void)
 	pr_cont("\n");
 #elif defined(CONFIG_TINY_SRCU)
 	idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
-	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n",
+	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
 		 torture_type, TORTURE_FLAG, idx,
 		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
 		 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-- 
cgit v1.3-14-g43fede


From 492b95e59735998312f678d77a2d5fe20af6b0b9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 21 Apr 2017 16:09:15 -0700
Subject: rcuperf: Set more user-friendly defaults

Common-case use of rcuperf must set rcuperf.nreaders=0 and, if not built
as a module, rcuperf.shutdown.  This commit therefore sets the default
for rcuperf.nreaders to zero and sets the default for rcuperf.shutdown
to zero if rcuperf is built as a module and to one otherwise.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 5158ddba6716..49c8ed6bd2fd 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -63,9 +63,10 @@ torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives");
 torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
-torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nreaders, 0, "Number of RCU reader threads");
 torture_param(int, nwriters, -1, "Number of RCU updater threads");
-torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, shutdown, !IS_ENABLED(MODULE),
+	      "Shutdown at end of performance tests.");
 torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
 
 static char *perf_type = "rcu";
-- 
cgit v1.3-14-g43fede


From 820687a7b98a5031207893ff265f97c0a0ad403e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 25 Apr 2017 15:12:56 -0700
Subject: rcuperf: Add writer_holdoff boot parameter

This commit adds a writer_holdoff boot parameter to rcuperf, which is
intended to be used to test Tree SRCU's auto-expediting.  This
boot parameter is in microseconds, and defaults to zero (that is,
disabled).  Set it to a bit larger than srcutree.exp_holdoff,
keeping the nanosecond/microsecond conversion, to force Tree SRCU
to auto-expedite more aggressively.

This commit also adds documentation for this parameter, and fixes some
alphabetization while in the neighborhood.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 11 ++++++++---
 kernel/rcu/rcuperf.c                            |  3 +++
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3598464ca8ed..01b5ab92d251 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3357,17 +3357,22 @@
 	rcuperf.perf_runnable= [BOOT]
 			Start rcuperf running at boot time.
 
+	rcuperf.perf_type= [KNL]
+			Specify the RCU implementation to test.
+
 	rcuperf.shutdown= [KNL]
 			Shut the system down after performance tests
 			complete.  This is useful for hands-off automated
 			testing.
 
-	rcuperf.perf_type= [KNL]
-			Specify the RCU implementation to test.
-
 	rcuperf.verbose= [KNL]
 			Enable additional printk() statements.
 
+	rcuperf.writer_holdoff= [KNL]
+			Write-side holdoff between grace periods,
+			in microseconds.  The default of zero says
+			no holdoff.
+
 	rcutorture.cbflood_inter_holdoff= [KNL]
 			Set holdoff time (jiffies) between successive
 			callback-flood tests.
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 49c8ed6bd2fd..d80f11d9f8bd 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -68,6 +68,7 @@ torture_param(int, nwriters, -1, "Number of RCU updater threads");
 torture_param(bool, shutdown, !IS_ENABLED(MODULE),
 	      "Shutdown at end of performance tests.");
 torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
@@ -447,6 +448,8 @@ rcu_perf_writer(void *arg)
 	}
 
 	do {
+		if (writer_holdoff)
+			udelay(writer_holdoff);
 		wdp = &wdpp[i];
 		*wdp = ktime_get_mono_fast_ns();
 		if (gp_async) {
-- 
cgit v1.3-14-g43fede


From f4687d2637a4016b2eedfdb777105c95e8d6fe52 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 27 Apr 2017 16:13:53 -0700
Subject: rcu: Add preemptibility checks in rcu_sched_qs() and rcu_bh_qs()

This commit adds WARN_ON_ONCE() calls that trigger if either
rcu_sched_qs() or rcu_bh_qs() are invoked with preemption enabled.
In the immortal words of Peter Zijlstra: "these are much harder to ignore
than comments".

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3bee58fc23b1..b01a02e7a0b7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -250,6 +250,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(void)
 {
+	RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!");
 	if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
 		return;
 	trace_rcu_grace_period(TPS("rcu_sched"),
@@ -265,6 +266,7 @@ void rcu_sched_qs(void)
 
 void rcu_bh_qs(void)
 {
+	RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!");
 	if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_bh"),
 				       __this_cpu_read(rcu_bh_data.gpnum),
-- 
cgit v1.3-14-g43fede


From 59d80fd8351b7b9a5dc7bbfa8bc4ca19f6ff3dad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 10:20:28 -0700
Subject: rcu: Print out rcupdate.c non-default boot-time settings

This commit adds a rcupdate_announce_bootup_oddness() function to
print out non-default values of significant kernel boot parameter
settings to aid in debugging.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  1 +
 kernel/rcu/tree_plugin.h |  1 +
 kernel/rcu/update.c      | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e1e5d002fdb9..393e461d3ea8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -76,6 +76,7 @@ bool rcu_gp_is_normal(void);     /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
 void rcu_expedite_gp(void);
 void rcu_unexpedite_gp(void);
+void rcupdate_announce_bootup_oddness(void);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 enum rcutorture_type {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3b432fa4c45b..eb5ebdce25ff 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -92,6 +92,7 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
 	if (IS_ENABLED(CONFIG_RCU_BOOST))
 		pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+	rcupdate_announce_bootup_oddness();
 }
 
 #ifdef CONFIG_PREEMPT_RCU
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 273e869ca21d..82a5aa10dbc5 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -560,7 +560,8 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
 DEFINE_SRCU(tasks_rcu_exit_srcu);
 
 /* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
-static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
+#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
+static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
 module_param(rcu_task_stall_timeout, int, 0644);
 
 static void rcu_spawn_tasks_kthread(void);
@@ -851,6 +852,23 @@ static void rcu_spawn_tasks_kthread(void)
 
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * With CONFIG_TASKS_RCU, print a non-default stall timeout, else note Tasks RCU is enabled.
+ */
+static void __init rcu_tasks_bootup_oddness(void)
+{
+#ifdef CONFIG_TASKS_RCU
+	if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
+		pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
+	else
+		pr_info("\tTasks RCU enabled.\n");
+#endif /* #ifdef CONFIG_TASKS_RCU */
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
 #ifdef CONFIG_PROVE_RCU
 
 /*
@@ -935,3 +953,25 @@ late_initcall(rcu_verify_early_boot_tests);
 #else
 void rcu_early_boot_tests(void) {}
 #endif /* CONFIG_PROVE_RCU */
+
+#ifndef CONFIG_TINY_RCU
+
+/*
+ * Print significant non-default boot-time settings: expediting, stall warnings, Tasks RCU.
+ */
+void __init rcupdate_announce_bootup_oddness(void)
+{
+	if (rcu_normal)
+		pr_info("\tNo expedited grace period (rcu_normal).\n");
+	else if (rcu_normal_after_boot)
+		pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
+	else if (rcu_expedited)
+		pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
+	if (rcu_cpu_stall_suppress)
+		pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
+	if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
+		pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
+	rcu_tasks_bootup_oddness();
+}
+
+#endif /* #ifndef CONFIG_TINY_RCU */
-- 
cgit v1.3-14-g43fede


From 17c7798bea64b9d35c7b4cc14d564e6feff73ac3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 11:12:34 -0700
Subject: rcu: Update rcu_bootup_announce_oddness()

This commit updates rcu_bootup_announce_oddness() to check additional
Kconfig options and module/boot parameters.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        |  9 ++++++---
 kernel/rcu/tree_plugin.h | 31 ++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b01a02e7a0b7..ac8dce15fd74 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -536,9 +536,12 @@ void rcu_all_qs(void)
 }
 EXPORT_SYMBOL_GPL(rcu_all_qs);
 
-static long blimit = 10;	/* Maximum callbacks per rcu_do_batch. */
-static long qhimark = 10000;	/* If this many pending, ignore blimit. */
-static long qlowmark = 100;	/* Once only this many pending, use blimit. */
+#define DEFAULT_RCU_BLIMIT 10     /* Maximum callbacks per rcu_do_batch. */
+static long blimit = DEFAULT_RCU_BLIMIT;
+#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
+static long qhimark = DEFAULT_RCU_QHIMARK;
+#define DEFAULT_RCU_QLOMARK 100   /* Once only this many pending, use blimit. */
+static long qlowmark = DEFAULT_RCU_QLOMARK;
 
 module_param(blimit, long, 0444);
 module_param(qhimark, long, 0444);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index eb5ebdce25ff..9cb3dff78b6f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,7 +79,9 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
 	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-	if (IS_ENABLED(CONFIG_PROVE_RCU))
+	if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY))
+		pr_info("\tRCU lockdep checking is permanently enabled.\n");
+	else if (IS_ENABLED(CONFIG_PROVE_RCU))
 		pr_info("\tRCU lockdep checking is enabled.\n");
 	if (RCU_NUM_LVLS >= 4)
 		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
@@ -90,8 +92,31 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
 	if (nr_cpu_ids != NR_CPUS)
 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-	if (IS_ENABLED(CONFIG_RCU_BOOST))
-		pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#ifdef CONFIG_RCU_BOOST
+	pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY);
+#endif
+	if (blimit != DEFAULT_RCU_BLIMIT)
+		pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
+	if (qhimark != DEFAULT_RCU_QHIMARK)
+		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
+	if (qlowmark != DEFAULT_RCU_QLOMARK)
+		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
+	if (jiffies_till_first_fqs != ULONG_MAX)
+		pr_info("\tBoot-time adjustment of first FQS scan delay to %lu jiffies.\n", jiffies_till_first_fqs);
+	if (jiffies_till_next_fqs != ULONG_MAX)
+		pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %lu jiffies.\n", jiffies_till_next_fqs);
+	if (rcu_kick_kthreads)
+		pr_info("\tKick kthreads if too-long grace period.\n");
+	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
+		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
+	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT))
+		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
+	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT))
+		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
+	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP))
+		pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
+		pr_info("\tRCU debug extended QS entry/exit.\n");
 	rcupdate_announce_bootup_oddness();
 }
 
-- 
cgit v1.3-14-g43fede


From b5815e6cd3b747cacf7628a32a275aa2c0f61e06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 11:20:29 -0700
Subject: srcu: Make exp_holdoff module parameter be static

Because exp_holdoff is not used outside of srcutree.c, it can be static.
This commit therefore makes this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 03d57fe9f094..08d43736f72a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -40,7 +40,7 @@
 #include "rcu.h"
 #include "rcu_segcblist.h"
 
-ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
+static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
 module_param(exp_holdoff, ulong, 0444);
 
 static void srcu_invoke_callbacks(struct work_struct *work);
-- 
cgit v1.3-14-g43fede


From 0c8e0e3c37955d17cced37222a10c00ab47efd4b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 11:24:22 -0700
Subject: srcu: Print non-default exp_holdoff values at boot time

This commit makes srcu_bootup_announce() check for non-default values
of the auto-expedite holdoff time exp_holdoff and print a message if so.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 08d43736f72a..0b6ea105d9f8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -40,7 +40,9 @@
 #include "rcu.h"
 #include "rcu_segcblist.h"
 
-static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */
+/* Holdoff in nanoseconds for auto-expediting. */
+#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
+static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
 module_param(exp_holdoff, ulong, 0444);
 
 static void srcu_invoke_callbacks(struct work_struct *work);
@@ -1171,6 +1173,8 @@ EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
 static int __init srcu_bootup_announce(void)
 {
 	pr_info("Hierarchical SRCU implementation.\n");
+	if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
+		pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
 	return 0;
 }
 early_initcall(srcu_bootup_announce);
-- 
cgit v1.3-14-g43fede


From c0b334c5bfa98ab104bde38da330a113a6c7dd56 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 12:32:15 -0700
Subject: rcu: Add lockdep_assert_held() teeth to tree.c

Comments can be helpful, but assertions carry more force.  This
commit therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN()
calls to enforce lock-held and interrupt-disabled preconditions.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ac8dce15fd74..121c1436a7f3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -762,6 +762,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
 	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
 	int *fp = &rnp->need_future_gp[idx];
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
 	return READ_ONCE(*fp);
 }
 
@@ -773,6 +774,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
 static bool
 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 {
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
 	if (rcu_gp_in_progress(rsp))
 		return false;  /* No, a grace period is already in progress. */
 	if (rcu_future_needs_gp(rsp))
@@ -799,6 +801,7 @@ static void rcu_eqs_enter_common(bool user)
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
 	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
 	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 	    !user && !is_idle_task(current)) {
@@ -972,6 +975,7 @@ static void rcu_eqs_exit(bool user)
 	struct rcu_dynticks *rdtp;
 	long long oldval;
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -1679,6 +1683,8 @@ void rcu_cpu_stall_reset(void)
 static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
 				       struct rcu_node *rnp)
 {
+	lockdep_assert_held(&rnp->lock);
+
 	/*
 	 * If RCU is idle, we just wait for the next grace period.
 	 * But we can only be sure that RCU is idle if we are looking
@@ -1724,6 +1730,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 	bool ret = false;
 	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
 
+	lockdep_assert_held(&rnp->lock);
+
 	/*
 	 * Pick up grace-period number for new callbacks.  If this
 	 * grace period is already marked as needed, return to the caller.
@@ -1850,6 +1858,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 {
 	bool ret = false;
 
+	lockdep_assert_held(&rnp->lock);
+
 	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
 	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
 		return false;
@@ -1888,6 +1898,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			    struct rcu_data *rdp)
 {
+	lockdep_assert_held(&rnp->lock);
+
 	/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
 	if (!rcu_segcblist_pend_cbs(&rdp->cblist))
 		return false;
@@ -1914,6 +1926,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 	bool ret;
 	bool need_gp;
 
+	lockdep_assert_held(&rnp->lock);
+
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed &&
 	    !unlikely(READ_ONCE(rdp->gpwrap))) {
@@ -2346,6 +2360,7 @@ static bool
 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		      struct rcu_data *rdp)
 {
+	lockdep_assert_held(&rnp->lock);
 	if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
 		/*
 		 * Either we have not yet spawned the grace-period
@@ -2407,6 +2422,7 @@ static bool rcu_start_gp(struct rcu_state *rsp)
 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
+	lockdep_assert_held(&rcu_get_root(rsp)->lock);
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
@@ -2431,6 +2447,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 	unsigned long oldmask = 0;
 	struct rcu_node *rnp_c;
 
+	lockdep_assert_held(&rnp->lock);
+
 	/* Walk up the rcu_node hierarchy. */
 	for (;;) {
 		if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
@@ -2491,6 +2509,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 	unsigned long mask;
 	struct rcu_node *rnp_p;
 
+	lockdep_assert_held(&rnp->lock);
 	if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
 	    rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2604,6 +2623,8 @@ static void
 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
+	lockdep_assert_held(&rsp->orphan_lock);
+
 	/* No-CBs CPUs do not have orphanable callbacks. */
 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
 		return;
@@ -2644,6 +2665,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
 {
 	struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
 
+	lockdep_assert_held(&rsp->orphan_lock);
+
 	/* No-CBs CPUs are handled specially. */
 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
 	    rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2710,6 +2733,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
+	lockdep_assert_held(&rnp->lock);
 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
 	    rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
 		return;
@@ -3703,6 +3727,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
+	lockdep_assert_held(&rnp->lock);
 	for (;;) {
 		mask = rnp->grpmask;
 		rnp = rnp->parent;
-- 
cgit v1.3-14-g43fede


From ea9b0c8a26a2cadfe49382d679eee88d3c4de79c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 13:19:28 -0700
Subject: rcu: Add lockdep_assert_held() teeth to tree_plugin.h

Comments can be helpful, but assertions carry more force.  This commit
therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN() calls to
enforce lock-held and interrupt-disabled preconditions.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 9cb3dff78b6f..ee7cea75273e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -181,6 +181,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
 			 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
 	struct task_struct *t = current;
 
+	lockdep_assert_held(&rnp->lock);
+
 	/*
 	 * Decide where to queue the newly blocked task.  In theory,
 	 * this could be an if-statement.  In practice, when I tried
@@ -289,6 +291,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
  */
 static void rcu_preempt_qs(void)
 {
+	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!");
 	if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_preempt"),
 				       __this_cpu_read(rcu_data_p->gpnum),
@@ -318,6 +321,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!");
 	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
 	if (t->rcu_read_lock_nesting > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
@@ -634,6 +638,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
  */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 {
+	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!");
 	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
 	if (rcu_preempt_has_tasks(rnp))
 		rnp->gp_tasks = rnp->blkd_tasks.next;
@@ -1024,6 +1029,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 {
 	struct task_struct *t;
 
+	lockdep_assert_held(&rnp->lock);
 	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
 		rnp->n_balk_exp_gp_tasks++;
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1404,6 +1410,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 	unsigned long dj;
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
 	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
 		*nextevt = KTIME_MAX;
 		return 0;
@@ -1456,6 +1463,7 @@ static void rcu_prepare_for_idle(void)
 	struct rcu_state *rsp;
 	int tne;
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
 	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
 	    rcu_is_nocb_cpu(smp_processor_id()))
 		return;
@@ -1511,6 +1519,7 @@ static void rcu_prepare_for_idle(void)
  */
 static void rcu_cleanup_after_idle(void)
 {
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
 	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
 	    rcu_is_nocb_cpu(smp_processor_id()))
 		return;
@@ -2544,6 +2553,8 @@ static void rcu_sysidle_enter(int irq)
 	unsigned long j;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!");
+
 	/* If there are no nohz_full= CPUs, no need to track this. */
 	if (!tick_nohz_full_enabled())
 		return;
@@ -2615,6 +2626,8 @@ static void rcu_sysidle_exit(int irq)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!");
+
 	/* If there are no nohz_full= CPUs, no need to track this. */
 	if (!tick_nohz_full_enabled())
 		return;
@@ -2674,6 +2687,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 	unsigned long j;
 	struct rcu_dynticks *rdtp = rdp->dynticks;
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!");
+
 	/* If there are no nohz_full= CPUs, don't check system-wide idleness. */
 	if (!tick_nohz_full_enabled())
 		return;
@@ -2842,6 +2857,8 @@ bool rcu_sys_is_idle(void)
 	static struct rcu_sysidle_head rsh;
 	int rss = READ_ONCE(full_sysidle_state);
 
+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!");
+
 	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
 		return false;
 
-- 
cgit v1.3-14-g43fede


From d4efe6c5ad91f9a1f2f1d66b7fbfc87e320b2abc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 14:16:16 -0700
Subject: srcu: Shrink Tiny SRCU a bit

In Tiny SRCU, __srcu_read_lock() is a trivial function, outweighed by
its EXPORT_SYMBOL_GPL(), and on many architectures, its call sequence.
This commit therefore moves it to srcutiny.h so that it can be inlined.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcutiny.h | 15 +++++++++++++++
 kernel/rcu/srcutiny.c    | 16 ----------------
 2 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index b8859179b001..b6edd9c8fdce 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -63,6 +63,21 @@ void srcu_drive_gp(struct work_struct *wp);
 
 void synchronize_srcu(struct srcu_struct *sp);
 
+/*
+ * Counts the new reader in the ->srcu_lock_nesting[] element selected by
+ * ->srcu_idx.  Can be invoked from irq/bh handlers, but the matching
+ * __srcu_read_unlock() must be in the same handler instance.  Returns an
+ * index that must be passed to the matching srcu_read_unlock().
+ */
+static inline int __srcu_read_lock(struct srcu_struct *sp)
+{
+	int idx;
+
+	idx = READ_ONCE(sp->srcu_idx);
+	WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
+	return idx;
+}
+
 static inline void synchronize_srcu_expedited(struct srcu_struct *sp)
 {
 	synchronize_srcu(sp);
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 32798eb14853..988543721d5d 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -95,22 +95,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.  Can be invoked from irq/bh handlers, but the matching
- * __srcu_read_unlock() must be in the same handler instance.  Returns an
- * index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
-	int idx;
-
-	idx = READ_ONCE(sp->srcu_idx);
-	WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1);
-	return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
 /*
  * Removes the count for the old reader from the appropriate element of
  * the srcu_struct.
-- 
cgit v1.3-14-g43fede


From a602538e46c9c62e75f1f0be9806495c79bcda9f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 15:39:34 -0700
Subject: srcu: Add DEBUG_OBJECTS_RCU_HEAD functionality

This commit adds DEBUG_OBJECTS_RCU_HEAD checking to detect call_srcu()
counterparts to double-free bugs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcutree.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 0b6ea105d9f8..c6e2a4a1628b 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -761,6 +761,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp)
 	return true; /* With reasonable probability, idle! */
 }
 
+/*
+ * No-op SRCU callback that __call_srcu() installs to leak a suspected duplicate callback.
+ */
+static void srcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
 /*
  * Enqueue an SRCU callback on the srcu_data structure associated with
  * the current CPU and the specified srcu_struct structure, initiating
@@ -799,6 +806,12 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	struct srcu_data *sdp;
 
 	check_init_srcu_struct(sp);
+	if (debug_rcu_head_queue(rhp)) {
+		/* Probable double call_srcu(), so leak the callback. */
+		WRITE_ONCE(rhp->func, srcu_leak_callback);
+		WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
+		return;
+	}
 	rhp->func = func;
 	local_irq_save(flags);
 	sdp = this_cpu_ptr(sp->sda);
@@ -973,9 +986,12 @@ void srcu_barrier(struct srcu_struct *sp)
 		spin_lock_irq(&sdp->lock);
 		atomic_inc(&sp->srcu_barrier_cpu_cnt);
 		sdp->srcu_barrier_head.func = srcu_barrier_cb;
+		debug_rcu_head_queue(&sdp->srcu_barrier_head);
 		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
-					   &sdp->srcu_barrier_head, 0))
+					   &sdp->srcu_barrier_head, 0)) {
+			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
 			atomic_dec(&sp->srcu_barrier_cpu_cnt);
+		}
 		spin_unlock_irq(&sdp->lock);
 	}
 
@@ -1100,6 +1116,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	spin_unlock_irq(&sdp->lock);
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
+		debug_rcu_head_unqueue(rhp);
 		local_bh_disable();
 		rhp->func(rhp);
 		local_bh_enable();
-- 
cgit v1.3-14-g43fede


From 68ab0b4263224157f4d0c0e42854169a183d7534 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 16:19:07 -0700
Subject: rcu: Make synchronize_rcu_mult() check for duplicates

Currently, doing synchronize_rcu_mult(call_rcu, call_rcu) might
(or might not) wait for two RCU grace periods.  One approach is
of course "don't do that!", but in CONFIG_PREEMPT=n kernels,
synchronize_rcu_mult(call_rcu, call_rcu_sched) does exactly that.
This results in an ugly #ifdef in sched_cpu_deactivate().

This commit therefore makes __wait_rcu_gp() check for duplicates,
which in turn allows duplicates to be passed to synchronize_rcu_mult()
without risk of waiting twice on the same type of grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/update.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 82a5aa10dbc5..123a9c4b5055 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -379,6 +379,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 		   struct rcu_synchronize *rs_array)
 {
 	int i;
+	int j;
 
 	/* Initialize and register callbacks for each flavor specified. */
 	for (i = 0; i < n; i++) {
@@ -390,7 +391,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 		}
 		init_rcu_head_on_stack(&rs_array[i].head);
 		init_completion(&rs_array[i].completion);
-		(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+		for (j = 0; j < i; j++)
+			if (crcu_array[j] == crcu_array[i])
+				break;
+		if (j == i)
+			(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
 	}
 
 	/* Wait for all callbacks to be invoked. */
@@ -399,7 +404,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 		    (crcu_array[i] == call_rcu ||
 		     crcu_array[i] == call_rcu_bh))
 			continue;
-		wait_for_completion(&rs_array[i].completion);
+		for (j = 0; j < i; j++)
+			if (crcu_array[j] == crcu_array[i])
+				break;
+		if (j == i)
+			wait_for_completion(&rs_array[i].completion);
 		destroy_rcu_head_on_stack(&rs_array[i].head);
 	}
 }
-- 
cgit v1.3-14-g43fede


From d7d34d5e46140a13f86379c87a196dc8a0b9b585 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 16:33:07 -0700
Subject: sched: Rely on synchronize_rcu_mult() de-duplication

The synchronize_rcu_mult() function now detects duplicate requests
for the same grace-period flavor and waits only once for each flavor.
This commit therefore removes the ugly #ifdef from sched_cpu_deactivate()
because synchronize_rcu_mult(call_rcu, call_rcu_sched) now does what
the #ifdef used to be needed for.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 803c3bc274c4..e91138fcde86 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5874,15 +5874,9 @@ int sched_cpu_deactivate(unsigned int cpu)
 	 * users of this state to go away such that all new such users will
 	 * observe it.
 	 *
-	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
-	 * not imply sync_sched(), so wait for both.
-	 *
 	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPT))
-		synchronize_rcu_mult(call_rcu, call_rcu_sched);
-	else
-		synchronize_rcu();
+	synchronize_rcu_mult(call_rcu, call_rcu_sched);
 
 	if (!sched_smp_initialized)
 		return 0;
-- 
cgit v1.3-14-g43fede


From 511324e462a12ea8be1a7e5fc63a992134db80d7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 17:04:09 -0700
Subject: rcu: Use RCU_NOCB_WAKE rather than RCU_NOGP_WAKE

The RCU_NOGP_WAKE_NOT, RCU_NOGP_WAKE, and RCU_NOGP_WAKE_FORCE flags
are used to mediate wakeups for the no-CBs CPU kthreads.  The "NOGP"
really doesn't make any sense, so this commit does s/NOGP/NOCB/.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.h        | 6 +++---
 kernel/rcu/tree_plugin.h | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0fa7aee9ef55..ddfa34d020ba 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -312,9 +312,9 @@ struct rcu_data {
 };
 
 /* Values for nocb_defer_wakeup field in struct rcu_data. */
-#define RCU_NOGP_WAKE_NOT	0
-#define RCU_NOGP_WAKE		1
-#define RCU_NOGP_WAKE_FORCE	2
+#define RCU_NOCB_WAKE_NOT	0
+#define RCU_NOCB_WAKE		1
+#define RCU_NOCB_WAKE_FORCE	2
 
 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
 					/* For jiffies_till_first_fqs and */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ee7cea75273e..0b1042545116 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1901,7 +1901,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeEmpty"));
 		} else {
-			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE);
+			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
 			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
 			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -1915,7 +1915,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
 					    TPS("WakeOvf"));
 		} else {
-			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE);
+			WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
 			/* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
 			smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
 			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
@@ -2242,8 +2242,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
 	if (!rcu_nocb_need_deferred_wakeup(rdp))
 		return;
 	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
-	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT);
-	wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
+	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+	wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
 	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
 }
 
-- 
cgit v1.3-14-g43fede


From 6b5fc3a1331810db407c9e0e673dc1837afdc9d0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 28 Apr 2017 20:11:09 -0700
Subject: rcu: Add memory barriers for NOCB leader wakeup

Wait/wakeup operations do not guarantee ordering on their own.  Instead,
either locking or memory barriers are required.  This commit therefore
adds memory barriers to wake_nocb_leader() and nocb_leader_wait().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Krister Johansen <kjlx@templeofstupid.com>
Cc: <stable@vger.kernel.org> # 4.6.x
---
 kernel/rcu/tree_plugin.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0b1042545116..573fbe9640a0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1810,6 +1810,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
 	if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
 		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
 		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+		smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
 		swake_up(&rdp_leader->nocb_wq);
 	}
 }
@@ -2064,6 +2065,7 @@ wait_again:
 	 * nocb_gp_head, where they await a grace period.
 	 */
 	gotcbs = false;
+	smp_mb(); /* wakeup before ->nocb_head reads. */
 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
 		rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
 		if (!rdp->nocb_gp_head)
-- 
cgit v1.3-14-g43fede


From a68a2bb28bbf7a6dd4672a25bd87fd1b5db4fa7d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 08:34:57 -0700
Subject: rcu: Move docbook comments out of rcupdate.h

The include/linux/rcupdate.h file is included by more than 200
files, so shrinking it should provide some build-time benefits.
This commit therefore moves several docbook comments from rcupdate.h to
kernel/rcu/update.c, kernel/rcu/tree.c, and kernel/rcu/tree_plugin.h, thus
reducing the number of times that the compiler has to scan these comments.
This likely provides only a small benefit, but every little bit helps.

This commit also fixes a malformed bulleted list noted by the 0day
Test Robot.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 117 ++---------------------------------------------
 kernel/rcu/tree.c        |  42 +++++++++++++++--
 kernel/rcu/tree_plugin.h |  33 ++++++++++++-
 kernel/rcu/update.c      |  20 ++++++--
 4 files changed, 89 insertions(+), 123 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 393e461d3ea8..7a206f039fc2 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -140,115 +140,14 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
 /* Exported common interfaces */
 
 #ifdef CONFIG_PREEMPT_RCU
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed.  However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section.  On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu().  It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section.  Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- */
-void call_rcu(struct rcu_head *head,
-	      rcu_callback_t func);
-
+void call_rcu(struct rcu_head *head, rcu_callback_t func);
 #else /* #ifdef CONFIG_PREEMPT_RCU */
-
-/* In classic RCU, call_rcu() is just call_rcu_sched(). */
 #define	call_rcu	call_rcu_sched
-
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
-/**
- * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by :
- *  - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context.
- *  OR
- *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
- *  These may be nested.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_bh(struct rcu_head *head,
-		 rcu_callback_t func);
-
-/**
- * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_sched() assumes
- * that the read-side critical sections end on enabling of preemption
- * or on voluntary preemption.
- * RCU read-side critical sections are delimited by :
- *  - rcu_read_lock_sched() and  rcu_read_unlock_sched(),
- *  OR
- *  anything that disables preemption.
- *  These may be nested.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
-void call_rcu_sched(struct rcu_head *head,
-		    rcu_callback_t func);
-
+void call_rcu_bh(struct rcu_head *head, rcu_callback_t func);
+void call_rcu_sched(struct rcu_head *head, rcu_callback_t func);
 void synchronize_sched(void);
-
-/**
- * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_tasks() assumes
- * that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), entry into idle, or transition to usermode
- * execution.  As such, there are no read-side primitives analogous to
- * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
- * to determine that all tasks have passed through a safe state, not so
- * much for data-strcuture synchronization.
- *
- * See the description of call_rcu() for more detailed information on
- * memory ordering guarantees.
- */
 void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
 void synchronize_rcu_tasks(void);
 void rcu_barrier_tasks(void);
@@ -474,18 +373,8 @@ extern struct lockdep_map rcu_bh_lock_map;
 extern struct lockdep_map rcu_sched_lock_map;
 extern struct lockdep_map rcu_callback_map;
 int debug_lockdep_rcu_enabled(void);
-
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
-
-/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
- *
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
- * RCU-sched read-side critical section.  In absence of
- * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
- * critical section unless it can prove otherwise.
- */
 int rcu_read_lock_sched_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 121c1436a7f3..5ebc830297c1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3223,8 +3223,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
 	local_irq_restore(flags);
 }
 
-/*
- * Queue an RCU-sched callback for invocation after a grace period.
+/**
+ * call_rcu_sched() - Queue an RCU for invocation after sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_sched() assumes
+ * that the read-side critical sections end on enabling of preemption
+ * or on voluntary preemption.
+ * RCU read-side critical sections are delimited by :
+ *  - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR
+ *  - anything that disables preemption.
+ *
+ *  These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
  */
 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
 {
@@ -3232,8 +3248,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu_sched);
 
-/*
- * Queue an RCU callback for invocation after a quicker grace period.
+/**
+ * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by :
+ *  - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context.
+ *  OR
+ *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *  These may be nested.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
  */
 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
 {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 573fbe9640a0..116cf8339826 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -675,8 +675,37 @@ static void rcu_preempt_do_callbacks(void)
 
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
-/*
- * Queue a preemptible-RCU callback for invocation after a grace period.
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed.  However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section.  On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu().  It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section.  Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 123a9c4b5055..84dec2c8ad1b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -576,9 +576,23 @@ module_param(rcu_task_stall_timeout, int, 0644);
 static void rcu_spawn_tasks_kthread(void);
 static struct task_struct *rcu_tasks_kthread_ptr;
 
-/*
- * Post an RCU-tasks callback.  First call must be from process context
- * after the scheduler if fully operational.
+/**
+ * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
+ * @rhp: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution.  As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-structure synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
  */
 void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
 {
-- 
cgit v1.3-14-g43fede


From 3caec62fbb313946b9be53720bbf2280bb19ec28 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 09:27:15 -0700
Subject: rcu: Move rcu_expedited and rcu_normal externs from rcupdate.h

The rcu_expedited and rcu_normal variables are used only by sysctl
and kernel/rcu/update.c, so it does not make sense to have their extern
declarations in rcupdate.h.  This commit therefore moves these
extern declarations to update.c.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 7 -------
 kernel/rcu/update.c      | 2 ++
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 7a206f039fc2..6e7e930c1610 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -46,13 +46,6 @@
 #include <linux/ktime.h>
 #include <linux/irqflags.h>
 
-#include <asm/barrier.h>
-
-#ifndef CONFIG_TINY_RCU
-extern int rcu_expedited; /* for sysctl */
-extern int rcu_normal;    /* also for sysctl */
-#endif /* #ifndef CONFIG_TINY_RCU */
-
 #ifdef CONFIG_TINY_RCU
 /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
 static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84dec2c8ad1b..00e77c470017 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,7 +62,9 @@
 #define MODULE_PARAM_PREFIX "rcupdate."
 
 #ifndef CONFIG_TINY_RCU
+extern int rcu_expedited; /* from sysctl */
 module_param(rcu_expedited, int, 0);
+extern int rcu_normal; /* from sysctl */
 module_param(rcu_normal, int, 0);
 static int rcu_normal_after_boot;
 module_param(rcu_normal_after_boot, int, 0);
-- 
cgit v1.3-14-g43fede


From 25c36329a30c8cac090effe1fbae9bb916fa95fe Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 09:51:55 -0700
Subject: rcu: Move expediting-related access/control out of rcupdate.h

The rcu_gp_is_normal(), rcu_gp_is_expedited(), rcu_expedite_gp(), and
rcu_unexpedite_gp() functions are intended only for use within the
RCU implementation itself -- the sysfs access is what should be used
outside of RCU.  This commit therefore moves the declarations for
these functions to kernel/rcu/rcu.h, and also includes this file into
kernel/rcu/rcutorture.c and kernel/rcu/rcuperf.c.  This also has the
beneficial effect of shrinking rcupdate.h a bit.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 26 --------------------------
 kernel/rcu/rcu.h         | 26 ++++++++++++++++++++++++++
 kernel/rcu/rcuperf.c     |  2 ++
 kernel/rcu/rcutorture.c  |  2 ++
 4 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6e7e930c1610..049c62c59f1b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -46,32 +46,6 @@
 #include <linux/ktime.h>
 #include <linux/irqflags.h>
 
-#ifdef CONFIG_TINY_RCU
-/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */
-{
-	return true;
-}
-static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */
-{
-	return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
-#else /* #ifdef CONFIG_TINY_RCU */
-bool rcu_gp_is_normal(void);     /* Internal RCU use. */
-bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
-void rcu_expedite_gp(void);
-void rcu_unexpedite_gp(void);
-void rcupdate_announce_bootup_oddness(void);
-#endif /* #else #ifdef CONFIG_TINY_RCU */
-
 enum rcutorture_type {
 	RCU_FLAVOR,
 	RCU_BH_FLAVOR,
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 73e16ec4054b..ceb78110db1b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -293,4 +293,30 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
 
 #endif /* #if defined(SRCU) || !defined(TINY_RCU) */
 
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */
+{
+	return true;
+}
+static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */
+{
+	return false;
+}
+
+static inline void rcu_expedite_gp(void)
+{
+}
+
+static inline void rcu_unexpedite_gp(void)
+{
+}
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_normal(void);     /* Internal RCU use. */
+bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+void rcupdate_announce_bootup_oddness(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index d80f11d9f8bd..3cc18110b612 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -48,6 +48,8 @@
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
 
+#include "rcu.h"
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index a58592b73f19..03cdf79e73d4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -52,6 +52,8 @@
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
 
+#include "rcu.h"
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
 
-- 
cgit v1.3-14-g43fede


From cad7b3897279c869de61dc88133037b941f84233 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 10:22:57 -0700
Subject: rcu: Move torture-related definitions from rcupdate.h to rcu.h

The include/linux/rcupdate.h file contains a number of definitions that
are used only to communicate between rcutorture, rcuperf, and the RCU code
itself.  There is no point in having these definitions exposed globally
throughout the kernel, so this commit moves them to kernel/rcu/rcu.h.
This change has the added benefit of shrinking rcupdate.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h    | 52 ---------------------------
 include/linux/rcutiny.h     |  5 +++
 include/linux/rcutree.h     |  1 +
 include/linux/srcuclassic.h | 14 --------
 include/linux/srcutiny.h    | 12 -------
 include/linux/srcutree.h    |  4 ---
 kernel/rcu/rcu.h            | 85 +++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 91 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 049c62c59f1b..7557499d8e70 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -46,58 +46,6 @@
 #include <linux/ktime.h>
 #include <linux/irqflags.h>
 
-enum rcutorture_type {
-	RCU_FLAVOR,
-	RCU_BH_FLAVOR,
-	RCU_SCHED_FLAVOR,
-	RCU_TASKS_FLAVOR,
-	SRCU_FLAVOR,
-	INVALID_RCU_FLAVOR
-};
-
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
-void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
-			    unsigned long *gpnum, unsigned long *completed);
-void rcutorture_record_test_transition(void);
-void rcutorture_record_progress(unsigned long vernum);
-void do_trace_rcu_torture_read(const char *rcutorturename,
-			       struct rcu_head *rhp,
-			       unsigned long secs,
-			       unsigned long c_old,
-			       unsigned long c);
-bool rcu_irq_enter_disabled(void);
-#else
-static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
-					  int *flags,
-					  unsigned long *gpnum,
-					  unsigned long *completed)
-{
-	*flags = 0;
-	*gpnum = 0;
-	*completed = 0;
-}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
-static inline bool rcu_irq_enter_disabled(void)
-{
-	return false;
-}
-#ifdef CONFIG_RCU_TRACE
-void do_trace_rcu_torture_read(const char *rcutorturename,
-			       struct rcu_head *rhp,
-			       unsigned long secs,
-			       unsigned long c_old,
-			       unsigned long c);
-#else
-#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
-	do { } while (0)
-#endif
-#endif
-
 #define UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (a) - (b))
 #define UINT_CMP_LT(a, b)	(UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..ade360e0d58c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -202,6 +202,11 @@ static inline void rcu_irq_enter(void)
 {
 }
 
+static inline bool rcu_irq_enter_disabled(void)
+{
+	return false;
+}
+
 static inline void rcu_irq_exit_irqson(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..28af91a19573 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -101,6 +101,7 @@ void rcu_irq_enter(void);
 void rcu_irq_exit(void);
 void rcu_irq_enter_irqson(void);
 void rcu_irq_exit_irqson(void);
+bool rcu_irq_enter_disabled(void);
 
 void exit_rcu(void);
 
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
index 5753f7322262..41cf99930f34 100644
--- a/include/linux/srcuclassic.h
+++ b/include/linux/srcuclassic.h
@@ -98,18 +98,4 @@ void synchronize_srcu_expedited(struct srcu_struct *sp);
 void srcu_barrier(struct srcu_struct *sp);
 unsigned long srcu_batches_completed(struct srcu_struct *sp);
 
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
-					   struct srcu_struct *sp, int *flags,
-					   unsigned long *gpnum,
-					   unsigned long *completed)
-{
-	if (test_type != SRCU_FLAVOR)
-		return;
-	*flags = 0;
-	*completed = sp->completed;
-	*gpnum = *completed;
-	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head)
-		(*gpnum)++;
-}
-
 #endif
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index b6edd9c8fdce..85bddce6a7a6 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -93,16 +93,4 @@ static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
 	return 0;
 }
 
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
-					   struct srcu_struct *sp, int *flags,
-					   unsigned long *gpnum,
-					   unsigned long *completed)
-{
-	if (test_type != SRCU_FLAVOR)
-		return;
-	*flags = 0;
-	*completed = sp->srcu_gp_seq;
-	*gpnum = *completed;
-}
-
 #endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 32e86d85fd11..f4adfed17b51 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -143,8 +143,4 @@ void synchronize_srcu_expedited(struct srcu_struct *sp);
 void srcu_barrier(struct srcu_struct *sp);
 unsigned long srcu_batches_completed(struct srcu_struct *sp);
 
-void srcutorture_get_gp_data(enum rcutorture_type test_type,
-			     struct srcu_struct *sp, int *flags,
-			     unsigned long *gpnum, unsigned long *completed);
-
 #endif
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ceb78110db1b..f190fc1c8215 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -319,4 +319,89 @@ void rcu_unexpedite_gp(void);
 void rcupdate_announce_bootup_oddness(void);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
+enum rcutorture_type {
+	RCU_FLAVOR,
+	RCU_BH_FLAVOR,
+	RCU_SCHED_FLAVOR,
+	RCU_TASKS_FLAVOR,
+	SRCU_FLAVOR,
+	INVALID_RCU_FLAVOR
+};
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
+void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
+			    unsigned long *gpnum, unsigned long *completed);
+void rcutorture_record_test_transition(void);
+void rcutorture_record_progress(unsigned long vernum);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+			       struct rcu_head *rhp,
+			       unsigned long secs,
+			       unsigned long c_old,
+			       unsigned long c);
+#else
+static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
+					  int *flags,
+					  unsigned long *gpnum,
+					  unsigned long *completed)
+{
+	*flags = 0;
+	*gpnum = 0;
+	*completed = 0;
+}
+static inline void rcutorture_record_test_transition(void)
+{
+}
+static inline void rcutorture_record_progress(unsigned long vernum)
+{
+}
+#ifdef CONFIG_RCU_TRACE
+void do_trace_rcu_torture_read(const char *rcutorturename,
+			       struct rcu_head *rhp,
+			       unsigned long secs,
+			       unsigned long c_old,
+			       unsigned long c);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+	do { } while (0)
+#endif
+#endif
+
+#ifdef CONFIG_TINY_SRCU
+
+static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
+					   struct srcu_struct *sp, int *flags,
+					   unsigned long *gpnum,
+					   unsigned long *completed)
+{
+	if (test_type != SRCU_FLAVOR)
+		return;
+	*flags = 0;
+	*completed = sp->srcu_gp_seq;
+	*gpnum = *completed;
+}
+
+#elif defined(CONFIG_TREE_SRCU)
+
+void srcutorture_get_gp_data(enum rcutorture_type test_type,
+			     struct srcu_struct *sp, int *flags,
+			     unsigned long *gpnum, unsigned long *completed);
+
+#elif defined(CONFIG_CLASSIC_SRCU)
+
+static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
+					   struct srcu_struct *sp, int *flags,
+					   unsigned long *gpnum,
+					   unsigned long *completed)
+{
+	if (test_type != SRCU_FLAVOR)
+		return;
+	*flags = 0;
+	*completed = sp->completed;
+	*gpnum = *completed;
+	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head)
+		(*gpnum)++;
+}
+
+#endif
+
 #endif /* __LINUX_RCU_H */
-- 
cgit v1.3-14-g43fede


From 791875d16e2f6e2e5b90328ccac643f512ac76c4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 11:06:05 -0700
Subject: rcu: Eliminate the unused __rcu_is_watching() function

The __rcu_is_watching() function is currently not used, aside from
implementing the rcu_is_watching() function.  This commit therefore
eliminates __rcu_is_watching(), which has the beneficial side-effect
of shrinking include/linux/rcupdate.h a bit.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  4 ----
 include/linux/rcutiny.h  | 11 -----------
 kernel/rcu/tiny.c        | 13 -------------
 kernel/rcu/tree.c        | 19 ++++---------------
 4 files changed, 4 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 415633076cb1..b4edfe0966c6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -204,10 +204,6 @@ do { \
 		rcu_note_voluntary_context_switch(current); \
 } while (0)
 
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
-bool __rcu_is_watching(void);
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
-
 /*
  * Infrastructure to implement the synchronize_() primitives in
  * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ade360e0d58c..5ed6934152a6 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -232,22 +232,11 @@ static inline void rcu_scheduler_starting(void)
 }
 #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
 
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-static inline bool rcu_is_watching(void)
-{
-	return __rcu_is_watching();
-}
-
-#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
 static inline bool rcu_is_watching(void)
 {
 	return true;
 }
 
-#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
 static inline void rcu_request_urgent_qs_task(struct task_struct *t)
 {
 }
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index e5385731e391..2306cab2195d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -59,19 +59,6 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL(rcu_barrier_sched);
 
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
-
-/*
- * Test whether RCU thinks that the current CPU is idle.
- */
-bool notrace __rcu_is_watching(void)
-{
-	return true;
-}
-EXPORT_SYMBOL(__rcu_is_watching);
-
-#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
-
 /*
  * Helper function for rcu_sched_qs() and rcu_bh_qs().
  * Also irqs are disabled to avoid confusion due to interrupt handlers
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5ebc830297c1..61a97164abcc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1138,23 +1138,12 @@ void rcu_nmi_exit(void)
 	rcu_dynticks_eqs_enter();
 }
 
-/**
- * __rcu_is_watching - are RCU read-side critical sections safe?
- *
- * Return true if RCU is watching the running CPU, which means that
- * this CPU can safely enter RCU read-side critical sections.  Unlike
- * rcu_is_watching(), the caller of __rcu_is_watching() must have at
- * least disabled preemption.
- */
-bool notrace __rcu_is_watching(void)
-{
-	return !rcu_dynticks_curr_cpu_in_eqs();
-}
-
 /**
  * rcu_is_watching - see if RCU thinks that the current CPU is idle
  *
- * If the current CPU is in its idle loop and is neither in an interrupt
+ * Return true if RCU is watching the running CPU, which means that this
+ * CPU can safely enter RCU read-side critical sections.  In other words,
+ * if the current CPU is not in its idle loop or is in an interrupt
  * or NMI handler, return true.
  */
 bool notrace rcu_is_watching(void)
@@ -1162,7 +1151,7 @@ bool notrace rcu_is_watching(void)
 	bool ret;
 
 	preempt_disable_notrace();
-	ret = __rcu_is_watching();
+	ret = !rcu_dynticks_curr_cpu_in_eqs();
 	preempt_enable_notrace();
 	return ret;
 }
-- 
cgit v1.3-14-g43fede


From 82118249d0ca4078d56d5e43172ada1567fdf946 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 11:13:24 -0700
Subject: rcu: Move the RCU_SCHEDULER_ definitions from rcupdate.h

The RCU_SCHEDULER_INACTIVE, RCU_SCHEDULER_INIT, and RCU_SCHEDULER_RUNNING
definitions are used only within RCU, so this commit moves them from
include/linux/rcupdate.h to kernel/rcu/rcu.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 4 ----
 kernel/rcu/rcu.h         | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b4edfe0966c6..9206a28a2d44 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -217,10 +217,6 @@ do { \
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
-#define RCU_SCHEDULER_INACTIVE	0
-#define RCU_SCHEDULER_INIT	1
-#define RCU_SCHEDULER_RUNNING	2
-
 /*
  * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
  * initialization and destruction of rcu_head on the stack. rcu_head structures
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index f190fc1c8215..17fee2a667d9 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -319,6 +319,10 @@ void rcu_unexpedite_gp(void);
 void rcupdate_announce_bootup_oddness(void);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
+#define RCU_SCHEDULER_INACTIVE	0
+#define RCU_SCHEDULER_INIT	1
+#define RCU_SCHEDULER_RUNNING	2
+
 enum rcutorture_type {
 	RCU_FLAVOR,
 	RCU_BH_FLAVOR,
-- 
cgit v1.3-14-g43fede


From fa3c66476975abf00c97f27b6c2b3d223f7d57f5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 11:38:55 -0700
Subject: rcu: Improve __call_rcu() debug-objects error message

The "__call_rcu(): Leaked duplicate callback" error message from
__call_rcu() has proven to be unhelpful.  This commit therefore changes
it to "__call_rcu(): Double-freed CB" and adds the value of the pointer
passed in.  The value of the pointer improves debuggability by allowing
correlation with tracing output, for example, the rcu:rcu_callback trace
event.

Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 61a97164abcc..cac24f5d3fd2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3161,9 +3161,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func,
 	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
 
 	if (debug_rcu_head_queue(head)) {
-		/* Probable double call_rcu(), so leak the callback. */
+		/*
+		 * Probable double call_rcu(), so leak the callback.
+		 * Use rcu:rcu_callback trace event to find the previous
+		 * time callback was passed to __call_rcu().
+		 */
+		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
+			  head, head->func);
 		WRITE_ONCE(head->func, rcu_leak_callback);
-		WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
 		return;
 	}
 	head->func = func;
-- 
cgit v1.3-14-g43fede


From 3d54f7983f3e6ac9f444fa20970b1abc8f089b79 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 12:25:50 -0700
Subject: rcu: Move rcu_is_nocb_cpu() from rcupdate.h to rcu.h

The rcu_is_nocb_cpu() function is used only internally to RCU.  This
commit therefore moves its declaration from include/linux/rcupdate.h
to kernel/rcu/rcu.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 9 ---------
 kernel/rcu/rcu.h         | 8 ++++++++
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f105f0834bbe..003427425e27 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -867,15 +867,6 @@ static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 }
 #endif /* #ifdef CONFIG_TINY_RCU */
 
-#if defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
-#elif defined(CONFIG_RCU_NOCB_CPU)
-bool rcu_is_nocb_cpu(int cpu);
-#else
-static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
-#endif
-
-
 /* Only for use by adaptive-ticks code. */
 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
 bool rcu_sys_is_idle(void);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 17fee2a667d9..2f344662c568 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -408,4 +408,12 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
 
 #endif
 
+#if defined(CONFIG_RCU_NOCB_CPU_ALL)
+static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
+#elif defined(CONFIG_RCU_NOCB_CPU)
+bool rcu_is_nocb_cpu(int cpu);
+#else
+static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
+#endif
+
 #endif /* __LINUX_RCU_H */
-- 
cgit v1.3-14-g43fede


From b8989b76052eedc99b09322efd6f68816f191a1a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 12:28:59 -0700
Subject: rcu: Move rcu_ftrace_dump() from rcupdate.h to rcu.h

The rcu_ftrace_dump() function is used only internally to RCU.  This
commit therefore moves its declaration from include/linux/rcupdate.h
to kernel/rcu/rcu.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 12 ------------
 kernel/rcu/rcu.h         | 12 ++++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 003427425e27..ad5e6934dcf3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -883,18 +883,6 @@ static inline void rcu_sysidle_force_exit(void) { }
 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 
 
-/*
- * Dump the ftrace buffer, but only one time per callsite per boot.
- */
-#define rcu_ftrace_dump(oops_dump_mode) \
-do { \
-	static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
-	\
-	if (!atomic_read(&___rfd_beenhere) && \
-	    !atomic_xchg(&___rfd_beenhere, 1)) \
-		ftrace_dump(oops_dump_mode); \
-} while (0)
-
 /*
  * Place this after a lock-acquisition primitive to guarantee that
  * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 2f344662c568..cdbaa441bdac 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void);
  */
 #define TPS(x)  tracepoint_string(x)
 
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+	static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+	\
+	if (!atomic_read(&___rfd_beenhere) && \
+	    !atomic_xchg(&___rfd_beenhere, 1)) \
+		ftrace_dump(oops_dump_mode); \
+} while (0)
+
 void rcu_early_boot_tests(void);
 void rcu_test_sync_prims(void);
 
-- 
cgit v1.3-14-g43fede


From e3c8d51e1a58c73a557eb38a9a6afb4f704a3379 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 13:37:16 -0700
Subject: rcu: Move torture-related functions out of rcutiny.h and rcutree.h

The various functions similar to rcu_batches_started(), the
function show_rcu_gp_kthreads(), the various functions similar to
rcu_force_quiescent_state(), and the variables rcutorture_testseq and
rcutorture_vernum are used only within RCU.  There is therefore no point
in exporting them to the kernel at large from include/linux/rcutiny.h
and include/linux/rcutree.h.  This commit therefore moves all of these
to kernel/rcu/rcu.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcutiny.h | 80 ---------------------------------------
 include/linux/rcutree.h | 16 --------
 kernel/rcu/rcu.h        | 99 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 5ed6934152a6..0d9270913686 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -106,86 +106,6 @@ static inline void rcu_virt_note_context_switch(int cpu)
 {
 }
 
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
-	return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
-	return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
 static inline void rcu_cpu_stall_reset(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 28af91a19573..43113323ca09 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -79,22 +79,6 @@ void cond_synchronize_rcu(unsigned long oldstate);
 unsigned long get_state_synchronize_sched(void);
 void cond_synchronize_sched(unsigned long oldstate);
 
-extern unsigned long rcutorture_testseq;
-extern unsigned long rcutorture_vernum;
-unsigned long rcu_batches_started(void);
-unsigned long rcu_batches_started_bh(void);
-unsigned long rcu_batches_started_sched(void);
-unsigned long rcu_batches_completed(void);
-unsigned long rcu_batches_completed_bh(void);
-unsigned long rcu_batches_completed_sched(void);
-unsigned long rcu_exp_batches_completed(void);
-unsigned long rcu_exp_batches_completed_sched(void);
-void show_rcu_gp_kthreads(void);
-
-void rcu_force_quiescent_state(void);
-void rcu_bh_force_quiescent_state(void);
-void rcu_sched_force_quiescent_state(void);
-
 void rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index cdbaa441bdac..d849b371b32b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -420,6 +420,105 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
 
 #endif
 
+#ifdef CONFIG_TINY_RCU
+
+/*
+ * Return the number of grace periods started.
+ */
+static inline unsigned long rcu_batches_started(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods started.
+ */
+static inline unsigned long rcu_batches_started_bh(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of sched grace periods started.
+ */
+static inline unsigned long rcu_batches_started_sched(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of bottom-half grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_bh(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of sched grace periods completed.
+ */
+static inline unsigned long rcu_batches_completed_sched(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+	return 0;
+}
+
+static inline void rcu_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_bh_force_quiescent_state(void)
+{
+}
+
+static inline void rcu_sched_force_quiescent_state(void)
+{
+}
+
+static inline void show_rcu_gp_kthreads(void)
+{
+}
+
+#else /* #ifdef CONFIG_TINY_RCU */
+extern unsigned long rcutorture_testseq;
+extern unsigned long rcutorture_vernum;
+unsigned long rcu_batches_started(void);
+unsigned long rcu_batches_started_bh(void);
+unsigned long rcu_batches_started_sched(void);
+unsigned long rcu_batches_completed(void);
+unsigned long rcu_batches_completed_bh(void);
+unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
+void show_rcu_gp_kthreads(void);
+void rcu_force_quiescent_state(void);
+void rcu_bh_force_quiescent_state(void);
+void rcu_sched_force_quiescent_state(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
 #if defined(CONFIG_RCU_NOCB_CPU_ALL)
 static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
 #elif defined(CONFIG_RCU_NOCB_CPU)
-- 
cgit v1.3-14-g43fede


From fe21a27e8ca0937a5ac298de1f4b46382e9c5c88 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 13:45:51 -0700
Subject: rcu: Move rcu_request_urgent_qs_task() out of rcutiny.h and rcutree.h

The rcu_request_urgent_qs_task() function is used only within RCU,
so there is no point in exporting it to the rest of the kernel from
include/linux/rcutiny.h and include/linux/rcutree.h.  This commit therefore
moves this function to kernel/rcu/rcu.h.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcutiny.h | 4 ----
 include/linux/rcutree.h | 3 ---
 kernel/rcu/rcu.h        | 6 ++++++
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 0d9270913686..f5067941bc27 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -157,10 +157,6 @@ static inline bool rcu_is_watching(void)
 	return true;
 }
 
-static inline void rcu_request_urgent_qs_task(struct task_struct *t)
-{
-}
-
 static inline void rcu_all_qs(void)
 {
 	barrier(); /* Avoid RCU read-side critical sections leaking across. */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 43113323ca09..d6aa89d15d47 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -91,10 +91,7 @@ void exit_rcu(void);
 
 void rcu_scheduler_starting(void);
 extern int rcu_scheduler_active __read_mostly;
-
 bool rcu_is_watching(void);
-void rcu_request_urgent_qs_task(struct task_struct *t);
-
 void rcu_all_qs(void);
 
 /* RCUtree hotplug events */
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index d849b371b32b..5b76a5baff2e 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -335,6 +335,12 @@ void rcupdate_announce_bootup_oddness(void);
 #define RCU_SCHEDULER_INIT	1
 #define RCU_SCHEDULER_RUNNING	2
 
+#ifdef CONFIG_TINY_RCU
+static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
+#else /* #ifdef CONFIG_TINY_RCU */
+void rcu_request_urgent_qs_task(struct task_struct *t);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
 enum rcutorture_type {
 	RCU_FLAVOR,
 	RCU_BH_FLAVOR,
-- 
cgit v1.3-14-g43fede


From c350c008297643dad3c395c2fd92230142da5cf6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 15:35:32 -0700
Subject: srcu: Prevent sdp->srcu_gp_seq_needed counter wrap

If a given CPU never happens to start an SRCU grace period, the
grace-period sequence counter might wrap.  If this CPU were to decide to
finally start a grace period, the state of its sdp->srcu_gp_seq_needed
might make it appear that it has already requested this grace period,
which would prevent starting the grace period.  If no other CPU ever started
a grace period again, this would look like a grace-period hang.  Even
if some other CPU took pity and started the needed grace period, the
leaf srcu_node structure's ->srcu_data_have_cbs field won't have a record
of the fact that this CPU has a callback pending, which would look like
a very localized grace-period hang.

This might seem very unlikely, but SRCU grace periods can take less than
a microsecond on small systems, which means that overflow can happen
in much less than an hour on a 32-bit embedded system.  And embedded
systems are especially likely to have long-term idle CPUs.  Therefore,
it makes sense to prevent this scenario from happening.

This commit therefore scans each srcu_data structure occasionally,
with frequency controlled by the srcutree.counter_wrap_check kernel
boot parameter.  This parameter can be set to something like 255
in order to exercise the counter-wrap-prevention code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  9 +++++++++
 kernel/rcu/srcutree.c                           | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 01b5ab92d251..6671f9b60a86 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3810,6 +3810,15 @@
 	spia_pedr=
 	spia_peddr=
 
+	srcutree.counter_wrap_check [KNL]
+			Specifies how frequently to check for
+			grace-period sequence counter wrap for the
+			srcu_data structure's ->srcu_gp_seq_needed field.
+			The greater the number of bits set in this kernel
+			parameter, the less frequently counter wrap will
+			be checked for.  Note that the bottom two bits
+			are ignored.
+
 	srcutree.exp_holdoff [KNL]
 			Specifies how many nanoseconds must elapse
 			since the end of the last SRCU grace period for
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c6e2a4a1628b..cc06dbfc9692 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -45,6 +45,10 @@
 static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
 module_param(exp_holdoff, ulong, 0444);
 
+/* Overflow-check frequency.  N bits roughly says every 2**N grace periods. */
+static ulong counter_wrap_check = (ULONG_MAX >> 2);
+module_param(counter_wrap_check, ulong, 0444);
+
 static void srcu_invoke_callbacks(struct work_struct *work);
 static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
 
@@ -496,10 +500,13 @@ static void srcu_gp_end(struct srcu_struct *sp)
 {
 	unsigned long cbdelay;
 	bool cbs;
+	int cpu;
+	unsigned long flags;
 	unsigned long gpseq;
 	int idx;
 	int idxnext;
 	unsigned long mask;
+	struct srcu_data *sdp;
 	struct srcu_node *snp;
 
 	/* Prevent more than one additional grace period. */
@@ -538,6 +545,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
 			smp_mb(); /* GP end before CB invocation. */
 			srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
 		}
+
+		/* Occasionally prevent srcu_data counter wrap. */
+		if (!(gpseq & counter_wrap_check))
+			for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
+				sdp = per_cpu_ptr(sp->sda, cpu);
+				spin_lock_irqsave(&sdp->lock, flags);
+				if (ULONG_CMP_GE(gpseq,
+						 sdp->srcu_gp_seq_needed + 100))
+					sdp->srcu_gp_seq_needed = gpseq;
+				spin_unlock_irqrestore(&sdp->lock, flags);
+			}
 	}
 
 	/* Callback initiation done, allow grace periods after next. */
-- 
cgit v1.3-14-g43fede


From 5a0465e17a18c467b712a816985b7b8dd8d10c16 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 4 May 2017 11:31:04 -0700
Subject: srcu: Shrink srcu.h by moving docbook and private function

The call_srcu() docbook entry is currently in include/linux/srcu.h,
which causes needless processing for each include point.  This commit
therefore moves this entry to kernel/rcu/srcutree.c, which the compiler
reads only once.  In addition, the srcu_batches_completed() function is
used only within RCU and its torture-test suites.  This commit therefore
also moves this function's declaration from include/linux/srcutiny.h,
include/linux/srcutree.h, and include/linux/srcuclassic.h to
kernel/rcu/rcu.h.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h        | 20 --------------------
 include/linux/srcuclassic.h |  1 -
 include/linux/srcutiny.h    |  5 -----
 include/linux/srcutree.h    |  1 -
 kernel/rcu/rcu.h            |  6 ++++++
 kernel/rcu/srcutree.c       | 17 +++++++++++++++++
 6 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index ea356d800675..5f509018e6b5 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -65,32 +65,12 @@ int init_srcu_struct(struct srcu_struct *sp);
 #elif defined(CONFIG_SRCU)
 #error "Unknown SRCU implementation specified to kernel configuration"
 #else
-
 /* Dummy definition for things like notifiers.  Actual use gets link error. */
 struct srcu_struct { };
-
 #endif
 
-/**
- * call_srcu() - Queue a callback for invocation after an SRCU grace period
- * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
- * @func: function to be invoked after the SRCU grace period
- *
- * The callback function will be invoked some time after a full SRCU
- * grace period elapses, in other words after all pre-existing SRCU
- * read-side critical sections have completed.  However, the callback
- * function might well execute concurrently with other SRCU read-side
- * critical sections that started after call_srcu() was invoked.  SRCU
- * read-side critical sections are delimited by srcu_read_lock() and
- * srcu_read_unlock(), and may be nested.
- *
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
- */
 void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 		void (*func)(struct rcu_head *head));
-
 void cleanup_srcu_struct(struct srcu_struct *sp);
 int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp);
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
index 41cf99930f34..67db4a36ef0d 100644
--- a/include/linux/srcuclassic.h
+++ b/include/linux/srcuclassic.h
@@ -96,6 +96,5 @@ void process_srcu(struct work_struct *work);
 
 void synchronize_srcu_expedited(struct srcu_struct *sp);
 void srcu_barrier(struct srcu_struct *sp);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
 
 #endif
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 85bddce6a7a6..4c53e698c6e4 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -88,9 +88,4 @@ static inline void srcu_barrier(struct srcu_struct *sp)
 	synchronize_srcu(sp);
 }
 
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
-	return 0;
-}
-
 #endif
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index f4adfed17b51..24e949bda12a 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -141,6 +141,5 @@ void process_srcu(struct work_struct *work);
 
 void synchronize_srcu_expedited(struct srcu_struct *sp);
 void srcu_barrier(struct srcu_struct *sp);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
 
 #endif
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5b76a5baff2e..74d9fc205313 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -492,6 +492,11 @@ static inline unsigned long rcu_exp_batches_completed_sched(void)
 	return 0;
 }
 
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+	return 0;
+}
+
 static inline void rcu_force_quiescent_state(void)
 {
 }
@@ -519,6 +524,7 @@ unsigned long rcu_batches_completed_bh(void);
 unsigned long rcu_batches_completed_sched(void);
 unsigned long rcu_exp_batches_completed(void);
 unsigned long rcu_exp_batches_completed_sched(void);
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
 void show_rcu_gp_kthreads(void);
 void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index cc06dbfc9692..66a998f9c5a7 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,6 +854,23 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		srcu_funnel_exp_start(sp, sdp->mynode, s);
 }
 
+/**
+ * call_srcu() - Queue a callback for invocation after an SRCU grace period
+ * @sp: srcu_struct in which to queue the callback
+ * @rhp: structure to be used for queueing the SRCU callback.
+ * @func: function to be invoked after the SRCU grace period
+ *
+ * The callback function will be invoked some time after a full SRCU
+ * grace period elapses, in other words after all pre-existing SRCU
+ * read-side critical sections have completed.  However, the callback
+ * function might well execute concurrently with other SRCU read-side
+ * critical sections that started after call_srcu() was invoked.  SRCU
+ * read-side critical sections are delimited by srcu_read_lock() and
+ * srcu_read_unlock(), and may be nested.
+ *
+ * The callback will be invoked from process context, but must nevertheless
+ * be fast and must not block.
+ */
 void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	       rcu_callback_t func)
 {
-- 
cgit v1.3-14-g43fede


From 2464dd940e23bad227c387a40eec99f7aa02ed96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 4 May 2017 14:29:16 -0700
Subject: srcu: Apply trivial callback lists to shrink Tiny SRCU

The rcu_segcblist structure provides quite a bit of functionality, and
Tiny SRCU needs almost none of it.  So this commit replaces Tiny SRCU's
uses of rcu_segcblist with a simple singly linked list with tail pointer.
This change significantly reduces Tiny SRCU's memory footprint, more
than making up for the growth caused by the creation of rcu_segcblist.c.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcutiny.h |  7 +++--
 init/Kconfig             |  2 +-
 kernel/rcu/rcu.h         |  2 +-
 kernel/rcu/srcutiny.c    | 70 ++++++++++++++++++++++--------------------------
 4 files changed, 37 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 4c53e698c6e4..cfbfc540cafc 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -33,9 +33,8 @@ struct srcu_struct {
 	u8 srcu_gp_waiting;		/* GP waiting for readers? */
 	struct swait_queue_head srcu_wq;
 					/* Last srcu_read_unlock() wakes GP. */
-	unsigned long srcu_gp_seq;	/* GP seq # for callback tagging. */
-	struct rcu_segcblist srcu_cblist;
-					/* Pending SRCU callbacks. */
+	struct rcu_head *srcu_cb_head;	/* Pending callbacks: Head. */
+	struct rcu_head **srcu_cb_tail;	/* Pending callbacks: Tail. */
 	struct work_struct srcu_work;	/* For driving grace periods. */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
@@ -47,7 +46,7 @@ void srcu_drive_gp(struct work_struct *wp);
 #define __SRCU_STRUCT_INIT(name)					\
 {									\
 	.srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq),	\
-	.srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),	\
+	.srcu_cb_tail = &name.srcu_cb_head,				\
 	.srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp),	\
 	__SRCU_DEP_MAP_INIT(name)					\
 }
diff --git a/init/Kconfig b/init/Kconfig
index d928a3724af9..a2cfde19e8b8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -573,7 +573,7 @@ config RCU_STALL_COMMON
 	  making these warnings mandatory for the tree variants.
 
 config RCU_NEED_SEGCBLIST
-	def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
+	def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
 
 config CONTEXT_TRACKING
        bool
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 74d9fc205313..6a1e85bd2eac 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -398,7 +398,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
 	if (test_type != SRCU_FLAVOR)
 		return;
 	*flags = 0;
-	*completed = sp->srcu_gp_seq;
+	*completed = sp->srcu_idx;
 	*gpnum = *completed;
 }
 
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 988543721d5d..1a1c1047d2ed 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
 	sp->srcu_lock_nesting[0] = 0;
 	sp->srcu_lock_nesting[1] = 0;
 	init_swait_queue_head(&sp->srcu_wq);
-	sp->srcu_gp_seq = 0;
-	rcu_segcblist_init(&sp->srcu_cblist);
+	sp->srcu_cb_head = NULL;
+	sp->srcu_cb_tail = &sp->srcu_cb_head;
 	sp->srcu_gp_running = false;
 	sp->srcu_gp_waiting = false;
 	sp->srcu_idx = 0;
@@ -88,10 +88,10 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 {
 	WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]);
 	flush_work(&sp->srcu_work);
-	WARN_ON(rcu_seq_state(sp->srcu_gp_seq));
 	WARN_ON(sp->srcu_gp_running);
 	WARN_ON(sp->srcu_gp_waiting);
-	WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist));
+	WARN_ON(sp->srcu_cb_head);
+	WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail);
 }
 EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
 
@@ -117,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 void srcu_drive_gp(struct work_struct *wp)
 {
 	int idx;
-	struct rcu_cblist ready_cbs;
-	struct srcu_struct *sp;
+	struct rcu_head *lh;
 	struct rcu_head *rhp;
+	struct srcu_struct *sp;
 
 	sp = container_of(wp, struct srcu_struct, srcu_work);
-	if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist))
+	if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head))
 		return; /* Already running or nothing to do. */
 
-	/* Tag recently arrived callbacks and wait for readers. */
+	/* Remove recently arrived callbacks and wait for readers. */
 	WRITE_ONCE(sp->srcu_gp_running, true);
-	rcu_segcblist_accelerate(&sp->srcu_cblist,
-				 rcu_seq_snap(&sp->srcu_gp_seq));
-	rcu_seq_start(&sp->srcu_gp_seq);
+	local_irq_disable();
+	lh = sp->srcu_cb_head;
+	sp->srcu_cb_head = NULL;
+	sp->srcu_cb_tail = &sp->srcu_cb_head;
+	local_irq_enable();
 	idx = sp->srcu_idx;
 	WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
 	WRITE_ONCE(sp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
 	swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
 	WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
-	rcu_seq_end(&sp->srcu_gp_seq);
-
-	/* Update callback list based on GP, and invoke ready callbacks. */
-	rcu_segcblist_advance(&sp->srcu_cblist,
-			      rcu_seq_current(&sp->srcu_gp_seq));
-	if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) {
-		rcu_cblist_init(&ready_cbs);
-		local_irq_disable();
-		rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs);
-		local_irq_enable();
-		rhp = rcu_cblist_dequeue(&ready_cbs);
-		for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
-			local_bh_disable();
-			rhp->func(rhp);
-			local_bh_enable();
-		}
-		local_irq_disable();
-		rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs);
-		local_irq_enable();
+
+	/* Invoke the callbacks we removed above. */
+	while (lh) {
+		rhp = lh;
+		lh = lh->next;
+		local_bh_disable();
+		rhp->func(rhp);
+		local_bh_enable();
 	}
-	WRITE_ONCE(sp->srcu_gp_running, false);
 
 	/*
-	 * If more callbacks, reschedule ourselves.  This can race with
-	 * a call_srcu() at interrupt level, but the ->srcu_gp_running
-	 * checks will straighten that out.
+	 * Enable rescheduling, and if there are more callbacks,
+	 * reschedule ourselves.  This can race with a call_srcu()
+	 * at interrupt level, but the ->srcu_gp_running checks will
+	 * straighten that out.
 	 */
-	if (!rcu_segcblist_empty(&sp->srcu_cblist))
+	WRITE_ONCE(sp->srcu_gp_running, false);
+	if (READ_ONCE(sp->srcu_cb_head))
 		schedule_work(&sp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -171,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp);
  * Enqueue an SRCU callback on the specified srcu_struct structure,
  * initiating grace-period processing if it is not already running.
  */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	       rcu_callback_t func)
 {
 	unsigned long flags;
 
-	head->func = func;
+	rhp->func = func;
+	rhp->next = NULL;
 	local_irq_save(flags);
-	rcu_segcblist_enqueue(&sp->srcu_cblist, head, false);
+	*sp->srcu_cb_tail = rhp;
+	sp->srcu_cb_tail = &rhp->next;
 	local_irq_restore(flags);
 	if (!READ_ONCE(sp->srcu_gp_running))
 		schedule_work(&sp->srcu_work);
-- 
cgit v1.3-14-g43fede


From 681fbec881dea1848e9246d7d1ecb3b97f11026d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 4 May 2017 15:44:38 -0700
Subject: lockdep: Use consistent printing primitives

Commit a5dd63efda3d ("lockdep: Use "WARNING" tag on lockdep splats")
substituted pr_warn() for printk() in places called out by Dmitry Vyukov.
However, this resulted in an ugly mix of pr_warn() and printk().  This
commit therefore changes printk() to pr_warn() or pr_cont(), depending
on the absence or presence of KERN_CONT.  This is done in all functions
that had printk() changed to pr_warn() by the aforementioned commit.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/locking/lockdep.c | 172 +++++++++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 86 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c0e31bfee25c..cceb9534338a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("======================================================\n");
 	pr_warn("WARNING: possible circular locking dependency detected\n");
 	print_kernel_ident();
 	pr_warn("------------------------------------------------------\n");
-	printk("%s/%d is trying to acquire lock:\n",
+	pr_warn("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(check_src);
-	printk("\nbut task is already holding lock:\n");
+	pr_warn("\nbut task is already holding lock:\n");
 	print_lock(check_tgt);
-	printk("\nwhich lock already depends on the new lock.\n\n");
-	printk("\nthe existing dependency chain (in reverse order) is:\n");
+	pr_warn("\nwhich lock already depends on the new lock.\n\n");
+	pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
 
 	print_circular_bug_entry(entry, depth);
 
@@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=====================================================\n");
 	pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n",
 		irqclass, irqclass);
 	print_kernel_ident();
 	pr_warn("-----------------------------------------------------\n");
-	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
 		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
 		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
@@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr,
 		curr->softirqs_enabled);
 	print_lock(next);
 
-	printk("\nand this task is already holding:\n");
+	pr_warn("\nand this task is already holding:\n");
 	print_lock(prev);
-	printk("which would create a new lock dependency:\n");
+	pr_warn("which would create a new lock dependency:\n");
 	print_lock_name(hlock_class(prev));
-	printk(KERN_CONT " ->");
+	pr_cont(" ->");
 	print_lock_name(hlock_class(next));
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 
-	printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+	pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
 		irqclass);
 	print_lock_name(backwards_entry->class);
-	printk("\n... which became %s-irq-safe at:\n", irqclass);
+	pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
 
 	print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
 
-	printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+	pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
 	print_lock_name(forwards_entry->class);
-	printk("\n... which became %s-irq-unsafe at:\n", irqclass);
-	printk("...");
+	pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
+	pr_warn("...");
 
 	print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
 
-	printk("\nother info that might help us debug this:\n\n");
+	pr_warn("\nother info that might help us debug this:\n\n");
 	print_irq_lock_scenario(backwards_entry, forwards_entry,
 				hlock_class(prev), hlock_class(next));
 
 	lockdep_print_held_locks(curr);
 
-	printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
+	pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
 	if (!save_trace(&prev_root->trace))
 		return 0;
 	print_shortest_lock_dependencies(backwards_entry, prev_root);
 
-	printk("\nthe dependencies between the lock to be acquired");
-	printk(" and %s-irq-unsafe lock:\n", irqclass);
+	pr_warn("\nthe dependencies between the lock to be acquired");
+	pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
 	if (!save_trace(&next_root->trace))
 		return 0;
 	print_shortest_lock_dependencies(forwards_entry, next_root);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("============================================\n");
 	pr_warn("WARNING: possible recursive locking detected\n");
 	print_kernel_ident();
 	pr_warn("--------------------------------------------\n");
-	printk("%s/%d is trying to acquire lock:\n",
+	pr_warn("%s/%d is trying to acquire lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(next);
-	printk("\nbut task is already holding lock:\n");
+	pr_warn("\nbut task is already holding lock:\n");
 	print_lock(prev);
 
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("\nother info that might help us debug this:\n");
 	print_deadlock_scenario(next, prev);
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr,
 			struct held_lock *hlock_next,
 			struct lock_chain *chain)
 {
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("============================\n");
 	pr_warn("WARNING: chain_key collision\n");
 	print_kernel_ident();
 	pr_warn("----------------------------\n");
-	printk("%s/%d: ", current->comm, task_pid_nr(current));
-	printk("Hash chain already cached but the contents don't match!\n");
+	pr_warn("%s/%d: ", current->comm, task_pid_nr(current));
+	pr_warn("Hash chain already cached but the contents don't match!\n");
 
-	printk("Held locks:");
+	pr_warn("Held locks:");
 	print_chain_keys_held_locks(curr, hlock_next);
 
-	printk("Locks in cached chain:");
+	pr_warn("Locks in cached chain:");
 	print_chain_keys_chain(chain);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 }
 #endif
@@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("================================\n");
 	pr_warn("WARNING: inconsistent lock state\n");
 	print_kernel_ident();
 	pr_warn("--------------------------------\n");
 
-	printk("inconsistent {%s} -> {%s} usage.\n",
+	pr_warn("inconsistent {%s} -> {%s} usage.\n",
 		usage_str[prev_bit], usage_str[new_bit]);
 
-	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
 		curr->comm, task_pid_nr(curr),
 		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
 		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
@@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 		trace_softirqs_enabled(curr));
 	print_lock(this);
 
-	printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+	pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
 	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
 
 	print_irqtrace_events(curr);
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("\nother info that might help us debug this:\n");
 	print_usage_bug_scenario(this);
 
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr,
 	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("========================================================\n");
 	pr_warn("WARNING: possible irq lock inversion dependency detected\n");
 	print_kernel_ident();
 	pr_warn("--------------------------------------------------------\n");
-	printk("%s/%d just changed the state of lock:\n",
+	pr_warn("%s/%d just changed the state of lock:\n",
 		curr->comm, task_pid_nr(curr));
 	print_lock(this);
 	if (forwards)
-		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
+		pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
 	else
-		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
+		pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
 	print_lock_name(other->class);
-	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+	pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
 
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("\nother info that might help us debug this:\n");
 
 	/* Find a middle lock (if one exists) */
 	depth = get_lock_depth(other);
 	do {
 		if (depth == 0 && (entry != root)) {
-			printk("lockdep:%s bad path found in chain graph\n", __func__);
+			pr_warn("lockdep:%s bad path found in chain graph\n", __func__);
 			break;
 		}
 		middle = entry;
@@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr,
 
 	lockdep_print_held_locks(curr);
 
-	printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
+	pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
 	if (!save_trace(&root->trace))
 		return 0;
 	print_shortest_lock_dependencies(other, root);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("==================================\n");
 	pr_warn("WARNING: Nested lock was not taken\n");
 	print_kernel_ident();
 	pr_warn("----------------------------------\n");
 
-	printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+	pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
 	print_lock(hlock);
 
-	printk("\nbut this task is not holding:\n");
-	printk("%s\n", hlock->nest_lock->name);
+	pr_warn("\nbut this task is not holding:\n");
+	pr_warn("%s\n", hlock->nest_lock->name);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("\nother info that might help us debug this:\n");
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=====================================\n");
 	pr_warn("WARNING: bad unlock balance detected!\n");
 	print_kernel_ident();
 	pr_warn("-------------------------------------\n");
-	printk("%s/%d is trying to release lock (",
+	pr_warn("%s/%d is trying to release lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
-	printk(KERN_CONT ") at:\n");
+	pr_cont(") at:\n");
 	print_ip_sym(ip);
-	printk("but there are no more locks to release!\n");
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("but there are no more locks to release!\n");
+	pr_warn("\nother info that might help us debug this:\n");
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
 	if (debug_locks_silent)
 		return 0;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=================================\n");
 	pr_warn("WARNING: bad contention detected!\n");
 	print_kernel_ident();
 	pr_warn("---------------------------------\n");
-	printk("%s/%d is trying to contend lock (",
+	pr_warn("%s/%d is trying to contend lock (",
 		curr->comm, task_pid_nr(curr));
 	print_lockdep_cache(lock);
-	printk(KERN_CONT ") at:\n");
+	pr_cont(") at:\n");
 	print_ip_sym(ip);
-	printk("but there are no locks held!\n");
-	printk("\nother info that might help us debug this:\n");
+	pr_warn("but there are no locks held!\n");
+	pr_warn("\nother info that might help us debug this:\n");
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 
 	return 0;
@@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
 	if (debug_locks_silent)
 		return;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=========================\n");
 	pr_warn("WARNING: held lock freed!\n");
 	print_kernel_ident();
 	pr_warn("-------------------------\n");
-	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+	pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
 		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
 	print_lock(hlock);
 	lockdep_print_held_locks(curr);
 
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 }
 
@@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void)
 	if (debug_locks_silent)
 		return;
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("====================================\n");
 	pr_warn("WARNING: %s/%d still has locks held!\n",
 	       current->comm, task_pid_nr(current));
 	print_kernel_ident();
 	pr_warn("------------------------------------\n");
 	lockdep_print_held_locks(current);
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 }
 
@@ -4402,10 +4402,10 @@ void debug_show_all_locks(void)
 	int unlock = 1;
 
 	if (unlikely(!debug_locks)) {
-		printk("INFO: lockdep is turned off.\n");
+		pr_warn("INFO: lockdep is turned off.\n");
 		return;
 	}
-	printk("\nShowing all locks held in the system:\n");
+	pr_warn("\nShowing all locks held in the system:\n");
 
 	/*
 	 * Here we try to get the tasklist_lock as hard as possible,
@@ -4416,18 +4416,18 @@ void debug_show_all_locks(void)
 retry:
 	if (!read_trylock(&tasklist_lock)) {
 		if (count == 10)
-			printk("hm, tasklist_lock locked, retrying... ");
+			pr_warn("hm, tasklist_lock locked, retrying... ");
 		if (count) {
 			count--;
-			printk(" #%d", 10-count);
+			pr_cont(" #%d", 10-count);
 			mdelay(200);
 			goto retry;
 		}
-		printk(" ignoring it.\n");
+		pr_cont(" ignoring it.\n");
 		unlock = 0;
 	} else {
 		if (count != 10)
-			printk(KERN_CONT " locked it.\n");
+			pr_cont(" locked it.\n");
 	}
 
 	do_each_thread(g, p) {
@@ -4445,7 +4445,7 @@ retry:
 				unlock = 1;
 	} while_each_thread(g, p);
 
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=============================================\n\n");
 
 	if (unlock)
@@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void)
 	if (unlikely(curr->lockdep_depth)) {
 		if (!debug_locks_off())
 			return;
-		printk("\n");
+		pr_warn("\n");
 		pr_warn("================================================\n");
 		pr_warn("WARNING: lock held when returning to user space!\n");
 		print_kernel_ident();
 		pr_warn("------------------------------------------------\n");
-		printk("%s/%d is leaving the kernel with locks still held!\n",
+		pr_warn("%s/%d is leaving the kernel with locks still held!\n",
 				curr->comm, curr->pid);
 		lockdep_print_held_locks(curr);
 	}
@@ -4495,14 +4495,14 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 		return;
 #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
 	/* Note: the following can be executed concurrently, so be careful. */
-	printk("\n");
+	pr_warn("\n");
 	pr_warn("=============================\n");
 	pr_warn("WARNING: suspicious RCU usage\n");
 	print_kernel_ident();
 	pr_warn("-----------------------------\n");
-	printk("%s:%d %s!\n", file, line, s);
-	printk("\nother info that might help us debug this:\n\n");
-	printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
+	pr_warn("%s:%d %s!\n", file, line, s);
+	pr_warn("\nother info that might help us debug this:\n\n");
+	pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
 	       !rcu_lockdep_current_cpu_online()
 			? "RCU used illegally from offline CPU!\n"
 			: !rcu_is_watching()
@@ -4529,10 +4529,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	 * rcu_read_lock_bh() and so on from extended quiescent states.
 	 */
 	if (!rcu_is_watching())
-		printk("RCU used illegally from extended quiescent state!\n");
+		pr_warn("RCU used illegally from extended quiescent state!\n");
 
 	lockdep_print_held_locks(curr);
-	printk("\nstack backtrace:\n");
+	pr_warn("\nstack backtrace:\n");
 	dump_stack();
 }
 EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-- 
cgit v1.3-14-g43fede


From bf32c76540257f9f5f2cf661dbdd8bb4a4bd8c82 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 9 May 2017 12:05:46 -0700
Subject: rcu: Convert rnp->lock wrappers to macros for SRCU use

Use of smp_mb__after_unlock_lock() would allow SRCU to omit a full
memory barrier during callback execution, so this commit converts
raw_spin_lock_rcu_node() and its sibling wrappers from inline functions
to type-generic macros, allowing them to handle locks in srcu_node
structures as well as rcu_node structures.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.h | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ddfa34d020ba..a7f63f1074b4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -580,27 +580,22 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
  * As ->lock of struct rcu_node is a __private field, therefore one should use
  * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
  */
-static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
-{
-	raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
-	smp_mb__after_unlock_lock();
-}
+#define raw_spin_lock_rcu_node(p)					\
+do {									\
+	raw_spin_lock(&ACCESS_PRIVATE(p, lock));			\
+	smp_mb__after_unlock_lock();					\
+} while (0)
 
-static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
-{
-	raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
-}
+#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
 
-static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
-{
-	raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
-	smp_mb__after_unlock_lock();
-}
+#define raw_spin_lock_irq_rcu_node(p)					\
+do {									\
+	raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
+	smp_mb__after_unlock_lock();					\
+} while (0)
 
-static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
-{
-	raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
-}
+#define raw_spin_unlock_irq_rcu_node(p)					\
+	raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
 
 #define raw_spin_lock_irqsave_rcu_node(rnp, flags)			\
 do {									\
@@ -615,11 +610,11 @@ do {									\
 	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags);	\
 } while (0)
 
-static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
-{
-	bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
-
-	if (locked)
-		smp_mb__after_unlock_lock();
-	return locked;
-}
+#define raw_spin_trylock_rcu_node(p)					\
+({									\
+	bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock));	\
+									\
+	if (___locked)							\
+		smp_mb__after_unlock_lock();				\
+	___locked;							\
+})
-- 
cgit v1.3-14-g43fede


From 83d40bd3bc3ab3d6b5a4a331f7667d627948a099 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 9 May 2017 13:28:51 -0700
Subject: rcu: Move rnp->lock wrappers for SRCU use

This commit moves the now-generic rnp->lock wrapper macros from
kernel/rcu/tree.h to kernel/rcu/rcu.h, thus allowing SRCU to use them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcu/tree.h | 53 -----------------------------------------------------
 2 files changed, 53 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 6a1e85bd2eac..2a75beb883c8 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -303,6 +303,59 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
 	     cpu <= rnp->grphi; \
 	     cpu = cpumask_next((cpu), cpu_possible_mask))
 
+/*
+ * Wrappers for the rcu_node::lock acquire and release.
+ *
+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
+ * different lock values, this in turn means that an UNLOCK of one level
+ * followed by a LOCK of another level does not imply a full memory barrier;
+ * and most importantly transitivity is lost.
+ *
+ * In order to restore full ordering between tree levels, augment the regular
+ * lock acquire functions with smp_mb__after_unlock_lock().
+ *
+ * As ->lock of struct rcu_node is a __private field, therefore one should use
+ * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
+ */
+#define raw_spin_lock_rcu_node(p)					\
+do {									\
+	raw_spin_lock(&ACCESS_PRIVATE(p, lock));			\
+	smp_mb__after_unlock_lock();					\
+} while (0)
+
+#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irq_rcu_node(p)					\
+do {									\
+	raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
+	smp_mb__after_unlock_lock();					\
+} while (0)
+
+#define raw_spin_unlock_irq_rcu_node(p)					\
+	raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
+
+#define raw_spin_lock_irqsave_rcu_node(rnp, flags)			\
+do {									\
+	typecheck(unsigned long, flags);				\
+	raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags);	\
+	smp_mb__after_unlock_lock();					\
+} while (0)
+
+#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags)			\
+do {									\
+	typecheck(unsigned long, flags);				\
+	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags);	\
+} while (0)
+
+#define raw_spin_trylock_rcu_node(p)					\
+({									\
+	bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock));	\
+									\
+	if (___locked)							\
+		smp_mb__after_unlock_lock();				\
+	___locked;							\
+})
+
 #endif /* #if defined(SRCU) || !defined(TINY_RCU) */
 
 #ifdef CONFIG_TINY_RCU
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a7f63f1074b4..baa0bac8da2a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -565,56 +565,3 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
 }
 #endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * Wrappers for the rcu_node::lock acquire and release.
- *
- * Because the rcu_nodes form a tree, the tree traversal locking will observe
- * different lock values, this in turn means that an UNLOCK of one level
- * followed by a LOCK of another level does not imply a full memory barrier;
- * and most importantly transitivity is lost.
- *
- * In order to restore full ordering between tree levels, augment the regular
- * lock acquire functions with smp_mb__after_unlock_lock().
- *
- * As ->lock of struct rcu_node is a __private field, therefore one should use
- * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
- */
-#define raw_spin_lock_rcu_node(p)					\
-do {									\
-	raw_spin_lock(&ACCESS_PRIVATE(p, lock));			\
-	smp_mb__after_unlock_lock();					\
-} while (0)
-
-#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock))
-
-#define raw_spin_lock_irq_rcu_node(p)					\
-do {									\
-	raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
-	smp_mb__after_unlock_lock();					\
-} while (0)
-
-#define raw_spin_unlock_irq_rcu_node(p)					\
-	raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
-
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags)			\
-do {									\
-	typecheck(unsigned long, flags);				\
-	raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags);	\
-	smp_mb__after_unlock_lock();					\
-} while (0)
-
-#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags)			\
-do {									\
-	typecheck(unsigned long, flags);				\
-	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags);	\
-} while (0)
-
-#define raw_spin_trylock_rcu_node(p)					\
-({									\
-	bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock));	\
-									\
-	if (___locked)							\
-		smp_mb__after_unlock_lock();				\
-	___locked;							\
-})
-- 
cgit v1.3-14-g43fede


From a3883df3935e10caa8297719d85fa8eaff7cabbd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 9 May 2017 15:00:14 -0700
Subject: srcu: Use rnp->lock wrappers to replace explicit memory barriers

This commit uses Tree RCU's rnp->lock wrappers to replace a few explicit
memory barriers.  This change also has the advantage of implementing
SRCU's memory-ordering properties in roughly the same way as Tree RCU
implements its own.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcutree.h |  8 ++---
 kernel/rcu/srcutree.c    | 91 +++++++++++++++++++++++-------------------------
 2 files changed, 47 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 24e949bda12a..42973f787e7e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -40,7 +40,7 @@ struct srcu_data {
 	unsigned long srcu_unlock_count[2];	/* Unlocks per CPU. */
 
 	/* Update-side state. */
-	spinlock_t lock ____cacheline_internodealigned_in_smp;
+	raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp;
 	struct rcu_segcblist srcu_cblist;	/* List of callbacks.*/
 	unsigned long srcu_gp_seq_needed;	/* Furthest future GP needed. */
 	unsigned long srcu_gp_seq_needed_exp;	/* Furthest future exp GP. */
@@ -58,7 +58,7 @@ struct srcu_data {
  * Node in SRCU combining tree, similar in function to rcu_data.
  */
 struct srcu_node {
-	spinlock_t lock;
+	raw_spinlock_t __private lock;
 	unsigned long srcu_have_cbs[4];		/* GP seq for children */
 						/*  having CBs, but only */
 						/*  is > ->srcu_gq_seq. */
@@ -78,7 +78,7 @@ struct srcu_struct {
 	struct srcu_node *level[RCU_NUM_LVLS + 1];
 						/* First node at each level. */
 	struct mutex srcu_cb_mutex;		/* Serialize CB preparation. */
-	spinlock_t gp_lock;			/* protect ->srcu_cblist */
+	raw_spinlock_t __private lock;		/* Protect counters */
 	struct mutex srcu_gp_mutex;		/* Serialize GP work. */
 	unsigned int srcu_idx;			/* Current rdr array element. */
 	unsigned long srcu_gp_seq;		/* Grace-period seq #. */
@@ -109,7 +109,7 @@ void process_srcu(struct work_struct *work);
 #define __SRCU_STRUCT_INIT(name)					\
 	{								\
 		.sda = &name##_srcu_data,				\
-		.gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock),		\
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock),		\
 		.srcu_gp_seq_needed = 0 - 1,				\
 		__SRCU_DEP_MAP_INIT(name)				\
 	}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 66a998f9c5a7..d0ca524bf042 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -76,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 
 	/* Each pass through this loop initializes one srcu_node structure. */
 	rcu_for_each_node_breadth_first(sp, snp) {
-		spin_lock_init(&snp->lock);
+		raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock));
 		WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
 			     ARRAY_SIZE(snp->srcu_data_have_cbs));
 		for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
@@ -110,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 	snp_first = sp->level[level];
 	for_each_possible_cpu(cpu) {
 		sdp = per_cpu_ptr(sp->sda, cpu);
-		spin_lock_init(&sdp->lock);
+		raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
 		rcu_segcblist_init(&sdp->srcu_cblist);
 		sdp->srcu_cblist_invoking = false;
 		sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
@@ -169,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
 	/* Don't re-initialize a lock while it is held. */
 	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
 	lockdep_init_map(&sp->dep_map, name, key, 0);
-	spin_lock_init(&sp->gp_lock);
+	raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
 	return init_srcu_struct_fields(sp, false);
 }
 EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
  */
 int init_srcu_struct(struct srcu_struct *sp)
 {
-	spin_lock_init(&sp->gp_lock);
+	raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock));
 	return init_srcu_struct_fields(sp, false);
 }
 EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  * First-use initialization of statically allocated srcu_struct
  * structure.  Wiring up the combining tree is more than can be
  * done with compile-time initialization, so this check is added
- * to each update-side SRCU primitive.  Use ->gp_lock, which -is-
+ * to each update-side SRCU primitive.  Use sp->lock, which -is-
  * compile-time initialized, to resolve races involving multiple
  * CPUs trying to garner first-use privileges.
  */
@@ -209,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp)
 	/* The smp_load_acquire() pairs with the smp_store_release(). */
 	if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
 		return; /* Already initialized. */
-	spin_lock_irqsave(&sp->gp_lock, flags);
+	raw_spin_lock_irqsave_rcu_node(sp, flags);
 	if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
-		spin_unlock_irqrestore(&sp->gp_lock, flags);
+		raw_spin_unlock_irqrestore_rcu_node(sp, flags);
 		return;
 	}
 	init_srcu_struct_fields(sp, true);
-	spin_unlock_irqrestore(&sp->gp_lock, flags);
+	raw_spin_unlock_irqrestore_rcu_node(sp, flags);
 }
 
 /*
@@ -411,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp)
 	struct srcu_data *sdp = this_cpu_ptr(sp->sda);
 	int state;
 
-	RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
-			 "Invoked srcu_gp_start() without ->gp_lock!");
+	lockdep_assert_held(&sp->lock);
 	WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&sp->srcu_gp_seq));
@@ -513,7 +512,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	mutex_lock(&sp->srcu_cb_mutex);
 
 	/* End the current grace period. */
-	spin_lock_irq(&sp->gp_lock);
+	raw_spin_lock_irq_rcu_node(sp);
 	idx = rcu_seq_state(sp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
 	cbdelay = srcu_get_delay(sp);
@@ -522,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	gpseq = rcu_seq_current(&sp->srcu_gp_seq);
 	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq))
 		sp->srcu_gp_seq_needed_exp = gpseq;
-	spin_unlock_irq(&sp->gp_lock);
+	raw_spin_unlock_irq_rcu_node(sp);
 	mutex_unlock(&sp->srcu_gp_mutex);
 	/* A new grace period can start at this point.  But only one. */
 
@@ -530,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
 	idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
 	rcu_for_each_node_breadth_first(sp, snp) {
-		spin_lock_irq(&snp->lock);
+		raw_spin_lock_irq_rcu_node(snp);
 		cbs = false;
 		if (snp >= sp->level[rcu_num_lvls - 1])
 			cbs = snp->srcu_have_cbs[idx] == gpseq;
@@ -540,21 +539,19 @@ static void srcu_gp_end(struct srcu_struct *sp)
 			snp->srcu_gp_seq_needed_exp = gpseq;
 		mask = snp->srcu_data_have_cbs[idx];
 		snp->srcu_data_have_cbs[idx] = 0;
-		spin_unlock_irq(&snp->lock);
-		if (cbs) {
-			smp_mb(); /* GP end before CB invocation. */
+		raw_spin_unlock_irq_rcu_node(snp);
+		if (cbs)
 			srcu_schedule_cbs_snp(sp, snp, mask, cbdelay);
-		}
 
 		/* Occasionally prevent srcu_data counter wrap. */
 		if (!(gpseq & counter_wrap_check))
 			for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
 				sdp = per_cpu_ptr(sp->sda, cpu);
-				spin_lock_irqsave(&sdp->lock, flags);
+				raw_spin_lock_irqsave_rcu_node(sdp, flags);
 				if (ULONG_CMP_GE(gpseq,
 						 sdp->srcu_gp_seq_needed + 100))
 					sdp->srcu_gp_seq_needed = gpseq;
-				spin_unlock_irqrestore(&sdp->lock, flags);
+				raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
 			}
 	}
 
@@ -562,17 +559,17 @@ static void srcu_gp_end(struct srcu_struct *sp)
 	mutex_unlock(&sp->srcu_cb_mutex);
 
 	/* Start a new grace period if needed. */
-	spin_lock_irq(&sp->gp_lock);
+	raw_spin_lock_irq_rcu_node(sp);
 	gpseq = rcu_seq_current(&sp->srcu_gp_seq);
 	if (!rcu_seq_state(gpseq) &&
 	    ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
 		srcu_gp_start(sp);
-		spin_unlock_irq(&sp->gp_lock);
+		raw_spin_unlock_irq_rcu_node(sp);
 		/* Throttle expedited grace periods: Should be rare! */
 		srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff
 				    ? 0 : SRCU_INTERVAL);
 	} else {
-		spin_unlock_irq(&sp->gp_lock);
+		raw_spin_unlock_irq_rcu_node(sp);
 	}
 }
 
@@ -592,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp,
 		if (rcu_seq_done(&sp->srcu_gp_seq, s) ||
 		    ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s))
 			return;
-		spin_lock_irqsave(&snp->lock, flags);
+		raw_spin_lock_irqsave_rcu_node(snp, flags);
 		if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) {
-			spin_unlock_irqrestore(&snp->lock, flags);
+			raw_spin_unlock_irqrestore_rcu_node(snp, flags);
 			return;
 		}
 		WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
-		spin_unlock_irqrestore(&snp->lock, flags);
+		raw_spin_unlock_irqrestore_rcu_node(snp, flags);
 	}
-	spin_lock_irqsave(&sp->gp_lock, flags);
+	raw_spin_lock_irqsave_rcu_node(sp, flags);
 	if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
 		sp->srcu_gp_seq_needed_exp = s;
-	spin_unlock_irqrestore(&sp->gp_lock, flags);
+	raw_spin_unlock_irqrestore_rcu_node(sp, flags);
 }
 
 /*
@@ -625,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 	for (; snp != NULL; snp = snp->srcu_parent) {
 		if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
 			return; /* GP already done and CBs recorded. */
-		spin_lock_irqsave(&snp->lock, flags);
+		raw_spin_lock_irqsave_rcu_node(snp, flags);
 		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
 			snp_seq = snp->srcu_have_cbs[idx];
 			if (snp == sdp->mynode && snp_seq == s)
 				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
-			spin_unlock_irqrestore(&snp->lock, flags);
+			raw_spin_unlock_irqrestore_rcu_node(snp, flags);
 			if (snp == sdp->mynode && snp_seq != s) {
-				smp_mb(); /* CBs after GP! */
 				srcu_schedule_cbs_sdp(sdp, do_norm
 							   ? SRCU_INTERVAL
 							   : 0);
@@ -647,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 			snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
 		if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
 			snp->srcu_gp_seq_needed_exp = s;
-		spin_unlock_irqrestore(&snp->lock, flags);
+		raw_spin_unlock_irqrestore_rcu_node(snp, flags);
 	}
 
 	/* Top of tree, must ensure the grace period will be started. */
-	spin_lock_irqsave(&sp->gp_lock, flags);
+	raw_spin_lock_irqsave_rcu_node(sp, flags);
 	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
 		/*
 		 * Record need for grace period s.  Pair with load
@@ -670,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
 		queue_delayed_work(system_power_efficient_wq, &sp->work,
 				   srcu_get_delay(sp));
 	}
-	spin_unlock_irqrestore(&sp->gp_lock, flags);
+	raw_spin_unlock_irqrestore_rcu_node(sp, flags);
 }
 
 /*
@@ -833,7 +829,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 	rhp->func = func;
 	local_irq_save(flags);
 	sdp = this_cpu_ptr(sp->sda);
-	spin_lock(&sdp->lock);
+	raw_spin_lock_rcu_node(sdp);
 	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&sp->srcu_gp_seq));
@@ -847,7 +843,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 		sdp->srcu_gp_seq_needed_exp = s;
 		needexp = true;
 	}
-	spin_unlock_irqrestore(&sdp->lock, flags);
+	raw_spin_unlock_irqrestore_rcu_node(sdp, flags);
 	if (needgp)
 		srcu_funnel_gp_start(sp, sdp, s, do_norm);
 	else if (needexp)
@@ -1018,7 +1014,7 @@ void srcu_barrier(struct srcu_struct *sp)
 	 */
 	for_each_possible_cpu(cpu) {
 		sdp = per_cpu_ptr(sp->sda, cpu);
-		spin_lock_irq(&sdp->lock);
+		raw_spin_lock_irq_rcu_node(sdp);
 		atomic_inc(&sp->srcu_barrier_cpu_cnt);
 		sdp->srcu_barrier_head.func = srcu_barrier_cb;
 		debug_rcu_head_queue(&sdp->srcu_barrier_head);
@@ -1027,7 +1023,7 @@ void srcu_barrier(struct srcu_struct *sp)
 			debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
 			atomic_dec(&sp->srcu_barrier_cpu_cnt);
 		}
-		spin_unlock_irq(&sdp->lock);
+		raw_spin_unlock_irq_rcu_node(sdp);
 	}
 
 	/* Remove the initial count, at which point reaching zero can happen. */
@@ -1076,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp)
 	 */
 	idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
 	if (idx == SRCU_STATE_IDLE) {
-		spin_lock_irq(&sp->gp_lock);
+		raw_spin_lock_irq_rcu_node(sp);
 		if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
 			WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
-			spin_unlock_irq(&sp->gp_lock);
+			raw_spin_unlock_irq_rcu_node(sp);
 			mutex_unlock(&sp->srcu_gp_mutex);
 			return;
 		}
 		idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
 		if (idx == SRCU_STATE_IDLE)
 			srcu_gp_start(sp);
-		spin_unlock_irq(&sp->gp_lock);
+		raw_spin_unlock_irq_rcu_node(sp);
 		if (idx != SRCU_STATE_IDLE) {
 			mutex_unlock(&sp->srcu_gp_mutex);
 			return; /* Someone else started the grace period. */
@@ -1135,20 +1131,19 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	sdp = container_of(work, struct srcu_data, work.work);
 	sp = sdp->sp;
 	rcu_cblist_init(&ready_cbs);
-	spin_lock_irq(&sdp->lock);
-	smp_mb(); /* Old grace periods before callback invocation! */
+	raw_spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&sp->srcu_gp_seq));
 	if (sdp->srcu_cblist_invoking ||
 	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
-		spin_unlock_irq(&sdp->lock);
+		raw_spin_unlock_irq_rcu_node(sdp);
 		return;  /* Someone else on the job or nothing to do. */
 	}
 
 	/* We are on the job!  Extract and invoke ready callbacks. */
 	sdp->srcu_cblist_invoking = true;
 	rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
-	spin_unlock_irq(&sdp->lock);
+	raw_spin_unlock_irq_rcu_node(sdp);
 	rhp = rcu_cblist_dequeue(&ready_cbs);
 	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
 		debug_rcu_head_unqueue(rhp);
@@ -1161,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	 * Update counts, accelerate new callbacks, and if needed,
 	 * schedule another round of callback invocation.
 	 */
-	spin_lock_irq(&sdp->lock);
+	raw_spin_lock_irq_rcu_node(sdp);
 	rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
 	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
 				       rcu_seq_snap(&sp->srcu_gp_seq));
 	sdp->srcu_cblist_invoking = false;
 	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
-	spin_unlock_irq(&sdp->lock);
+	raw_spin_unlock_irq_rcu_node(sdp);
 	if (more)
 		srcu_schedule_cbs_sdp(sdp, 0);
 }
@@ -1180,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
 {
 	bool pushgp = true;
 
-	spin_lock_irq(&sp->gp_lock);
+	raw_spin_lock_irq_rcu_node(sp);
 	if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
 		if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
 			/* All requests fulfilled, time to go idle. */
@@ -1190,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
 		/* Outstanding request and no GP.  Start one. */
 		srcu_gp_start(sp);
 	}
-	spin_unlock_irq(&sp->gp_lock);
+	raw_spin_unlock_irq_rcu_node(sp);
 
 	if (pushgp)
 		queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
-- 
cgit v1.3-14-g43fede


From 90040c9e3015054db7efa0101afdd446d1167fe8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 May 2017 14:36:55 -0700
Subject: rcu: Remove *_SLOW_* Kconfig options

The RCU_TORTURE_TEST_SLOW_PREINIT, RCU_TORTURE_TEST_SLOW_PREINIT_DELAY,
RCU_TORTURE_TEST_SLOW_INIT, RCU_TORTURE_TEST_SLOW_INIT_DELAY,
RCU_TORTURE_TEST_SLOW_CLEANUP, and RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
Kconfig options are only useful for torture testing, and there are the
rcutree.gp_cleanup_delay, rcutree.gp_init_delay, and
rcutree.gp_preinit_delay kernel boot parameters that rcutorture can use
instead.  The effect of these parameters is to artificially slow down
grace period initialization and cleanup in order to make some types of
race conditions happen more often.

This commit therefore simplifies Tree RCU a bit by removing the Kconfig
options and adding the corresponding kernel parameters to rcutorture's
.boot files instead.  However, this commit also leaves out the kernel
parameters for TREE02, TREE04, and TREE07 in order to have about the
same number of tests slowed as not slowed.  TREE01, TREE03, TREE05,
and TREE06 are slowed, and the rest are not slowed.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt    | 10 +--
 kernel/rcu/tree.c                                  | 26 ++------
 kernel/rcu/tree_plugin.h                           |  6 +-
 lib/Kconfig.debug                                  | 75 ----------------------
 .../selftests/rcutorture/configs/rcu/TREE01        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE01.boot   |  3 +
 .../selftests/rcutorture/configs/rcu/TREE02        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE03        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE03.boot   |  3 +
 .../selftests/rcutorture/configs/rcu/TREE04        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE05        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE05.boot   |  3 +
 .../selftests/rcutorture/configs/rcu/TREE06        |  3 -
 .../selftests/rcutorture/configs/rcu/TREE06.boot   |  3 +
 .../selftests/rcutorture/configs/rcu/TREE07        |  3 -
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |  9 ---
 16 files changed, 24 insertions(+), 135 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6671f9b60a86..f85bfe02f052 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3229,21 +3229,17 @@
 
 	rcutree.gp_cleanup_delay=	[KNL]
 			Set the number of jiffies to delay each step of
-			RCU grace-period cleanup.  This only has effect
-			when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
+			RCU grace-period cleanup.
 
 	rcutree.gp_init_delay=	[KNL]
 			Set the number of jiffies to delay each step of
-			RCU grace-period initialization.  This only has
-			effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-			is set.
+			RCU grace-period initialization.
 
 	rcutree.gp_preinit_delay=	[KNL]
 			Set the number of jiffies to delay each step of
 			RCU grace-period pre-initialization, that is,
 			the propagation of recent CPU-hotplug changes up
-			the rcu_node combining tree.  This only has effect
-			when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
+			the rcu_node combining tree.
 
 	rcutree.rcu_fanout_exact= [KNL]
 			Disable autobalancing of the rcu_node combining
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cac24f5d3fd2..bbbddd85906b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -177,26 +177,12 @@ module_param(kthread_prio, int, 0644);
 
 /* Delay in jiffies for grace-period initialization delays, debug only. */
 
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
-static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
-module_param(gp_preinit_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-static const int gp_preinit_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
-static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
-module_param(gp_init_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-static const int gp_init_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
-
-#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
-static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
-module_param(gp_cleanup_delay, int, 0644);
-#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
-static const int gp_cleanup_delay;
-#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
+static int gp_preinit_delay;
+module_param(gp_preinit_delay, int, 0444);
+static int gp_init_delay;
+module_param(gp_init_delay, int, 0444);
+static int gp_cleanup_delay;
+module_param(gp_cleanup_delay, int, 0444);
 
 /*
  * Number of grace periods between delays, normalized by the duration of
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 116cf8339826..0553d9fed7d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -109,11 +109,11 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tKick kthreads if too-long grace period.\n");
 	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
 		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
-	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT))
+	if (gp_preinit_delay)
 		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
-	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT))
+	if (gp_init_delay)
 		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
-	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP))
+	if (gp_cleanup_delay)
 		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
 	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
 		pr_info("\tRCU debug extended QS entry/exit.\n");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e4587ebe52c7..960c5d2d3c03 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1373,81 +1373,6 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
-config RCU_TORTURE_TEST_SLOW_PREINIT
-	bool "Slow down RCU grace-period pre-initialization to expose races"
-	depends on RCU_TORTURE_TEST
-	help
-	  This option delays grace-period pre-initialization (the
-	  propagation of CPU-hotplug changes up the rcu_node combining
-	  tree) for a few jiffies between initializing each pair of
-	  consecutive rcu_node structures.  This helps to expose races
-	  involving grace-period pre-initialization, in other words, it
-	  makes your kernel less stable.  It can also greatly increase
-	  grace-period latency, especially on systems with large numbers
-	  of CPUs.  This is useful when torture-testing RCU, but in
-	  almost no other circumstance.
-
-	  Say Y here if you want your system to crash and hang more often.
-	  Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
-	int "How much to slow down RCU grace-period pre-initialization"
-	range 0 5
-	default 3
-	depends on RCU_TORTURE_TEST_SLOW_PREINIT
-	help
-	  This option specifies the number of jiffies to wait between
-	  each rcu_node structure pre-initialization step.
-
-config RCU_TORTURE_TEST_SLOW_INIT
-	bool "Slow down RCU grace-period initialization to expose races"
-	depends on RCU_TORTURE_TEST
-	help
-	  This option delays grace-period initialization for a few
-	  jiffies between initializing each pair of consecutive
-	  rcu_node structures.	This helps to expose races involving
-	  grace-period initialization, in other words, it makes your
-	  kernel less stable.  It can also greatly increase grace-period
-	  latency, especially on systems with large numbers of CPUs.
-	  This is useful when torture-testing RCU, but in almost no
-	  other circumstance.
-
-	  Say Y here if you want your system to crash and hang more often.
-	  Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_INIT_DELAY
-	int "How much to slow down RCU grace-period initialization"
-	range 0 5
-	default 3
-	depends on RCU_TORTURE_TEST_SLOW_INIT
-	help
-	  This option specifies the number of jiffies to wait between
-	  each rcu_node structure initialization.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP
-	bool "Slow down RCU grace-period cleanup to expose races"
-	depends on RCU_TORTURE_TEST
-	help
-	  This option delays grace-period cleanup for a few jiffies
-	  between cleaning up each pair of consecutive rcu_node
-	  structures.  This helps to expose races involving grace-period
-	  cleanup, in other words, it makes your kernel less stable.
-	  It can also greatly increase grace-period latency, especially
-	  on systems with large numbers of CPUs.  This is useful when
-	  torture-testing RCU, but in almost no other circumstance.
-
-	  Say Y here if you want your system to crash and hang more often.
-	  Say N if you want a sane system.
-
-config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
-	int "How much to slow down RCU grace-period cleanup"
-	range 0 5
-	default 3
-	depends on RCU_TORTURE_TEST_SLOW_CLEANUP
-	help
-	  This option specifies the number of jiffies to wait between
-	  each rcu_node structure cleanup operation.
-
 config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on RCU_STALL_COMMON
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index cc6c5815236e..92ca49f90ef9 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -17,6 +17,3 @@ CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
index adc3abc82fb8..89705ed79596 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -1 +1,4 @@
 rcutorture.torture_type=rcu_bh maxcpus=8
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
index 1cecab330ba0..35e639e39366 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -19,8 +19,5 @@ CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=n
 CONFIG_RCU_BOOST=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
 CONFIG_DEBUG_OBJECTS=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
index 3b93ee544e70..7a17c503b382 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -17,6 +17,3 @@ CONFIG_RCU_BOOST=y
 CONFIG_RCU_KTHREAD_PRIO=2
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
index 120c0c88d100..9ef3aed126e9 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -1 +1,4 @@
 rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
index 851c01ae2cea..27d22695d64c 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -18,7 +18,4 @@ CONFIG_RCU_FANOUT_LEAF=3
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
 CONFIG_RCU_EQS_DEBUG=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
index d4cdc0d74e16..1257d3227b1e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -19,6 +19,3 @@ CONFIG_PROVE_LOCKING=y
 #CHECK#CONFIG_PROVE_RCU=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
index 15b3e1a86f74..c7fd050dfcd9 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
@@ -1,2 +1,5 @@
 rcutorture.torture_type=sched
 rcupdate.rcu_self_test_sched=1
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
index 9215827649bd..05a4eec3f27b 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -21,6 +21,3 @@ CONFIG_PROVE_LOCKING=y
 CONFIG_DEBUG_OBJECTS=y
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
index dd90f28ed700..ad18b52a2cad 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
@@ -2,3 +2,6 @@ rcupdate.rcu_self_test=1
 rcupdate.rcu_self_test_bh=1
 rcupdate.rcu_self_test_sched=1
 rcutree.rcu_fanout_exact=1
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index 99f04e4c5162..b9ddd3beeb9a 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -17,6 +17,3 @@ CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 364801b1a230..1dfec4657d95 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -28,9 +28,6 @@ CONFIG_RCU_TRACE -- Do half.
 CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
 CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations.
 CONFIG_RCU_EQS_DEBUG -- Do at least one for CONFIG_NO_HZ_FULL and not.
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -- Do for all but a couple TREE scenarios.
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT -- Do for all but a couple TREE scenarios.
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -- Do for all but a couple TREE scenarios.
 
 RCU-bh: Do one with PREEMPT and one with !PREEMPT.
 RCU-sched: Do one with PREEMPT but not BOOST.
@@ -78,12 +75,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE
 
 	Always used in KVM testing.
 
-CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY
-CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
-CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY
-
-	Inspection suffices, ignore.
-
 CONFIG_PREEMPT_RCU
 CONFIG_TREE_RCU
 CONFIG_TINY_RCU
-- 
cgit v1.3-14-g43fede


From f7a10a975036ef9ca957bfe12ab2d4b1a46cccd1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 10 May 2017 15:57:16 -0700
Subject: rcu: Remove the RCU_KTHREAD_PRIO Kconfig option

Anything that can be done with the RCU_KTHREAD_PRIO Kconfig option can
also be done with the rcutree.kthread_prio kernel boot parameter.
This commit therefore removes this Kconfig option.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
---
 init/Kconfig                                       | 31 ----------------------
 kernel/rcu/tree.c                                  |  4 ---
 .../selftests/rcutorture/configs/rcu/TREE03        |  1 -
 .../selftests/rcutorture/configs/rcu/TREE03.boot   |  1 +
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |  1 -
 5 files changed, 1 insertion(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index a2cfde19e8b8..6f257d51f582 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -697,37 +697,6 @@ config RCU_BOOST
 	  Say Y here if you are working with real-time apps or heavy loads
 	  Say N here if you are unsure.
 
-config RCU_KTHREAD_PRIO
-	int "Real-time priority to use for RCU worker threads"
-	range 1 99 if RCU_BOOST
-	range 0 99 if !RCU_BOOST
-	default 1 if RCU_BOOST
-	default 0 if !RCU_BOOST
-	depends on RCU_EXPERT
-	help
-	  This option specifies the SCHED_FIFO priority value that will be
-	  assigned to the rcuc/n and rcub/n threads and is also the value
-	  used for RCU_BOOST (if enabled). If you are working with a
-	  real-time application that has one or more CPU-bound threads
-	  running at a real-time priority level, you should set
-	  RCU_KTHREAD_PRIO to a priority higher than the highest-priority
-	  real-time CPU-bound application thread.  The default RCU_KTHREAD_PRIO
-	  value of 1 is appropriate in the common case, which is real-time
-	  applications that do not have any CPU-bound threads.
-
-	  Some real-time applications might not have a single real-time
-	  thread that saturates a given CPU, but instead might have
-	  multiple real-time threads that, taken together, fully utilize
-	  that CPU.  In this case, you should set RCU_KTHREAD_PRIO to
-	  a priority higher than the lowest-priority thread that is
-	  conspiring to prevent the CPU from running any non-real-time
-	  tasks.  For example, if one thread at priority 10 and another
-	  thread at priority 5 are between themselves fully consuming
-	  the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be
-	  set to priority 6 or higher.
-
-	  Specify the real-time priority, or take the default if unsure.
-
 config RCU_BOOST_DELAY
 	int "Milliseconds to delay boosting after RCU grace-period start"
 	range 0 3000
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bbbddd85906b..187ac3f41526 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -168,11 +168,7 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp,
 static void sync_sched_exp_online_cleanup(int cpu);
 
 /* rcuc/rcub kthread realtime priority */
-#ifdef CONFIG_RCU_KTHREAD_PRIO
-static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
-#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
-#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
 module_param(kthread_prio, int, 0644);
 
 /* Delay in jiffies for grace-period initialization delays, debug only. */
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
index 7a17c503b382..2dc31b16e506 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -14,6 +14,5 @@ CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_RCU_NOCB_CPU=n
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_BOOST=y
-CONFIG_RCU_KTHREAD_PRIO=2
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
index 9ef3aed126e9..5d2cc0bd50a0 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -2,3 +2,4 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30
 rcutree.gp_preinit_delay=3
 rcutree.gp_init_delay=3
 rcutree.gp_cleanup_delay=3
+rcutree.kthread_prio=2
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 1dfec4657d95..b5ea8489969a 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -16,7 +16,6 @@ CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
 CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
 CONFIG_PROVE_RCU_REPEATEDLY -- Do one.
 CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
-CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing.
 CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
 CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
 CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL.
-- 
cgit v1.3-14-g43fede


From fe5ac724d81a3c7803e60c2232718f212f3f38d4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 11 May 2017 11:26:22 -0700
Subject: rcu: Remove nohz_full full-system-idle state machine

The NO_HZ_FULL_SYSIDLE full-system-idle capability was added in 2013
by commit 0edd1b1784cb ("nohz_full: Add full-system-idle state machine"),
but has not been used.  This commit therefore removes it.

If it turns out to be needed later, this commit can always be reverted.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 .../RCU/Design/Requirements/Requirements.html      |   6 +-
 include/linux/rcupdate.h                           |   9 -
 kernel/rcu/tree.c                                  |  41 +-
 kernel/rcu/tree.h                                  |  16 -
 kernel/rcu/tree_plugin.h                           | 429 ---------------------
 kernel/time/Kconfig                                |  50 ---
 .../selftests/rcutorture/configs/rcu/TREE07        |   1 -
 .../testing/selftests/rcutorture/doc/TINY_RCU.txt  |   1 -
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |   7 +-
 9 files changed, 9 insertions(+), 551 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index cb614f2a69c2..8c94fc1d1c84 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2520,11 +2520,7 @@ It is similarly socially unacceptable to interrupt an
 <tt>nohz_full</tt> CPU running in userspace.
 RCU must therefore track <tt>nohz_full</tt> userspace
 execution.
-And in
-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
-kernels, RCU must separately track idle CPUs on the one hand and
-CPUs that are either idle or executing in userspace on the other.
-In both cases, RCU must be able to sample state at two points in
+RCU must therefore be able to sample state at two points in
 time, and be able to determine whether or not some other CPU spent
 any time idle and/or executing in userspace.
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ee40d7eba741..7f24a5e673f5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -854,15 +854,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 #define kfree_rcu(ptr, rcu_head)					\
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
-/* Only for use by adaptive-ticks code. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-bool rcu_sys_is_idle(void);
-void rcu_sysidle_force_exit(void);
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static inline bool rcu_sys_is_idle(void) { return false; }
-static inline void rcu_sysidle_force_exit(void) { }
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
 
 /*
  * Place this after a lock-acquisition primitive to guarantee that
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 187ac3f41526..51d4c3acf32d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -270,10 +270,6 @@ void rcu_bh_qs(void)
 static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
 	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
-	.dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
 
 /*
@@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644);
 
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
-static void force_qs_rnp(struct rcu_state *rsp,
-			 int (*f)(struct rcu_data *rsp, bool *isidle,
-				  unsigned long *maxj),
-			 bool *isidle, unsigned long *maxj);
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp));
 static void force_quiescent_state(struct rcu_state *rsp);
 static int rcu_pending(void);
 
@@ -854,7 +847,6 @@ void rcu_idle_enter(void)
 
 	local_irq_save(flags);
 	rcu_eqs_enter(false);
-	rcu_sysidle_enter(0);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -904,7 +896,6 @@ void rcu_irq_exit(void)
 		trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1);
 		rdtp->dynticks_nesting--;
 	}
-	rcu_sysidle_enter(1);
 }
 
 /*
@@ -986,7 +977,6 @@ void rcu_idle_exit(void)
 
 	local_irq_save(flags);
 	rcu_eqs_exit(false);
-	rcu_sysidle_exit(0);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -1038,7 +1028,6 @@ void rcu_irq_enter(void)
 		trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
 	else
 		rcu_eqs_exit_common(oldval, true);
-	rcu_sysidle_exit(1);
 }
 
 /*
@@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void)
  * credit them with an implicit quiescent state.  Return 1 if this CPU
  * is in dynticks idle mode, which is an extended quiescent state.
  */
-static int dyntick_save_progress_counter(struct rcu_data *rdp,
-					 bool *isidle, unsigned long *maxj)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
 	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
-	rcu_sysidle_check_cpu(rdp, isidle, maxj);
 	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
@@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
  * idle state since the last call to dyntick_save_progress_counter()
  * for this same CPU, or by virtue of having been offline.
  */
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
-				    bool *isidle, unsigned long *maxj)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
 	unsigned long jtsq;
 	bool *rnhqp;
@@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
  */
 static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
 {
-	bool isidle = false;
-	unsigned long maxj;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	WRITE_ONCE(rsp->gp_activity, jiffies);
 	rsp->n_force_qs++;
 	if (first_time) {
 		/* Collect dyntick-idle snapshots. */
-		if (is_sysidle_rcu_state(rsp)) {
-			isidle = true;
-			maxj = jiffies - ULONG_MAX / 4;
-		}
-		force_qs_rnp(rsp, dyntick_save_progress_counter,
-			     &isidle, &maxj);
-		rcu_sysidle_report_gp(rsp, isidle, maxj);
+		force_qs_rnp(rsp, dyntick_save_progress_counter);
 	} else {
 		/* Handle dyntick-idle and offline CPUs. */
-		isidle = true;
-		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
+		force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
 	}
 	/* Clear flag to prevent immediate re-entry. */
 	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user)
  *
  * The caller must have suppressed start of new grace periods.
  */
-static void force_qs_rnp(struct rcu_state *rsp,
-			 int (*f)(struct rcu_data *rsp, bool *isidle,
-				  unsigned long *maxj),
-			 bool *isidle, unsigned long *maxj)
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp))
 {
 	int cpu;
 	unsigned long flags;
@@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		for_each_leaf_node_possible_cpu(rnp, cpu) {
 			unsigned long bit = leaf_node_cpu_bit(rnp, cpu);
 			if ((rnp->qsmask & bit) != 0) {
-				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
+				if (f(per_cpu_ptr(rsp->rda, cpu)))
 					mask |= bit;
 			}
 		}
@@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	    !init_nocb_callback_list(rdp))
 		rcu_segcblist_init(&rdp->cblist);  /* Re-enable callbacks. */
 	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
-	rcu_sysidle_init_percpu_data(rdp->dynticks);
 	rcu_dynticks_eqs_online();
 	raw_spin_unlock_rcu_node(rnp);		/* irqs remain disabled. */
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index baa0bac8da2a..2c112bb11aa8 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -45,14 +45,6 @@ struct rcu_dynticks {
 	bool rcu_need_heavy_qs;     /* GP old, need heavy quiescent state. */
 	unsigned long rcu_qs_ctr;   /* Light universal quiescent state ctr. */
 	bool rcu_urgent_qs;	    /* GP old need light quiescent state. */
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	long long dynticks_idle_nesting;
-				    /* irq/process nesting level from idle. */
-	atomic_t dynticks_idle;	    /* Even value for idle, else odd. */
-				    /*  "Idle" excludes userspace execution. */
-	unsigned long dynticks_idle_jiffies;
-				    /* End of last non-NMI non-idle period. */
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	bool all_lazy;		    /* Are all CPU's CBs lazy? */
 	unsigned long nonlazy_posted;
@@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(int irq);
-static void rcu_sysidle_exit(int irq);
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj);
-static bool is_sysidle_rcu_state(struct rcu_state *rsp);
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj);
 static void rcu_bind_gp_kthread(void);
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
 static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
 static void rcu_dynticks_task_enter(void);
 static void rcu_dynticks_task_exit(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0553d9fed7d7..f524d967f7b6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
 #endif /* #ifdef CONFIG_NO_HZ_FULL */
 }
 
-
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-
-static int full_sysidle_state;		/* Current system-idle state. */
-#define RCU_SYSIDLE_NOT		0	/* Some CPU is not idle. */
-#define RCU_SYSIDLE_SHORT	1	/* All CPUs idle for brief period. */
-#define RCU_SYSIDLE_LONG	2	/* All CPUs idle for long enough. */
-#define RCU_SYSIDLE_FULL	3	/* All CPUs idle, ready for sysidle. */
-#define RCU_SYSIDLE_FULL_NOTED	4	/* Actually entered sysidle state. */
-
-/*
- * Invoked to note exit from irq or task transition to idle.  Note that
- * usermode execution does -not- count as idle here!  After all, we want
- * to detect full-system idle states, not RCU quiescent states and grace
- * periods.  The caller must have disabled interrupts.
- */
-static void rcu_sysidle_enter(int irq)
-{
-	unsigned long j;
-	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/* Adjust nesting, check for fully idle. */
-	if (irq) {
-		rdtp->dynticks_idle_nesting--;
-		WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
-		if (rdtp->dynticks_idle_nesting != 0)
-			return;  /* Still not fully idle. */
-	} else {
-		if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
-		    DYNTICK_TASK_NEST_VALUE) {
-			rdtp->dynticks_idle_nesting = 0;
-		} else {
-			rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
-			WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
-			return;  /* Still not fully idle. */
-		}
-	}
-
-	/* Record start of fully idle period. */
-	j = jiffies;
-	WRITE_ONCE(rdtp->dynticks_idle_jiffies, j);
-	smp_mb__before_atomic();
-	atomic_inc(&rdtp->dynticks_idle);
-	smp_mb__after_atomic();
-	WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
-}
-
-/*
- * Unconditionally force exit from full system-idle state.  This is
- * invoked when a normal CPU exits idle, but must be called separately
- * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
- * is that the timekeeping CPU is permitted to take scheduling-clock
- * interrupts while the system is in system-idle state, and of course
- * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
- * interrupt from any other type of interrupt.
- */
-void rcu_sysidle_force_exit(void)
-{
-	int oldstate = READ_ONCE(full_sysidle_state);
-	int newoldstate;
-
-	/*
-	 * Each pass through the following loop attempts to exit full
-	 * system-idle state.  If contention proves to be a problem,
-	 * a trylock-based contention tree could be used here.
-	 */
-	while (oldstate > RCU_SYSIDLE_SHORT) {
-		newoldstate = cmpxchg(&full_sysidle_state,
-				      oldstate, RCU_SYSIDLE_NOT);
-		if (oldstate == newoldstate &&
-		    oldstate == RCU_SYSIDLE_FULL_NOTED) {
-			rcu_kick_nohz_cpu(tick_do_timer_cpu);
-			return; /* We cleared it, done! */
-		}
-		oldstate = newoldstate;
-	}
-	smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
-}
-
-/*
- * Invoked to note entry to irq or task transition from idle.  Note that
- * usermode execution does -not- count as idle here!  The caller must
- * have disabled interrupts.
- */
-static void rcu_sysidle_exit(int irq)
-{
-	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/* Adjust nesting, check for already non-idle. */
-	if (irq) {
-		rdtp->dynticks_idle_nesting++;
-		WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
-		if (rdtp->dynticks_idle_nesting != 1)
-			return; /* Already non-idle. */
-	} else {
-		/*
-		 * Allow for irq misnesting.  Yes, it really is possible
-		 * to enter an irq handler then never leave it, and maybe
-		 * also vice versa.  Handle both possibilities.
-		 */
-		if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
-			rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
-			WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
-			return; /* Already non-idle. */
-		} else {
-			rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
-		}
-	}
-
-	/* Record end of idle period. */
-	smp_mb__before_atomic();
-	atomic_inc(&rdtp->dynticks_idle);
-	smp_mb__after_atomic();
-	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
-
-	/*
-	 * If we are the timekeeping CPU, we are permitted to be non-idle
-	 * during a system-idle state.  This must be the case, because
-	 * the timekeeping CPU has to take scheduling-clock interrupts
-	 * during the time that the system is transitioning to full
-	 * system-idle state.  This means that the timekeeping CPU must
-	 * invoke rcu_sysidle_force_exit() directly if it does anything
-	 * more than take a scheduling-clock interrupt.
-	 */
-	if (smp_processor_id() == tick_do_timer_cpu)
-		return;
-
-	/* Update system-idle state: We are clearly no longer fully idle! */
-	rcu_sysidle_force_exit();
-}
-
-/*
- * Check to see if the current CPU is idle.  Note that usermode execution
- * does not count as idle.  The caller must have disabled interrupts,
- * and must be running on tick_do_timer_cpu.
- */
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj)
-{
-	int cur;
-	unsigned long j;
-	struct rcu_dynticks *rdtp = rdp->dynticks;
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!");
-
-	/* If there are no nohz_full= CPUs, don't check system-wide idleness. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	/*
-	 * If some other CPU has already reported non-idle, if this is
-	 * not the flavor of RCU that tracks sysidle state, or if this
-	 * is an offline or the timekeeping CPU, nothing to do.
-	 */
-	if (!*isidle || rdp->rsp != rcu_state_p ||
-	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
-		return;
-	/* Verify affinity of current kthread. */
-	WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
-
-	/* Pick up current idle and NMI-nesting counter and check. */
-	cur = atomic_read(&rdtp->dynticks_idle);
-	if (cur & 0x1) {
-		*isidle = false; /* We are not idle! */
-		return;
-	}
-	smp_mb(); /* Read counters before timestamps. */
-
-	/* Pick up timestamps. */
-	j = READ_ONCE(rdtp->dynticks_idle_jiffies);
-	/* If this CPU entered idle more recently, update maxj timestamp. */
-	if (ULONG_CMP_LT(*maxj, j))
-		*maxj = j;
-}
-
-/*
- * Is this the flavor of RCU that is handling full-system idle?
- */
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
-	return rsp == rcu_state_p;
-}
-
-/*
- * Return a delay in jiffies based on the number of CPUs, rcu_node
- * leaf fanout, and jiffies tick rate.  The idea is to allow larger
- * systems more time to transition to full-idle state in order to
- * avoid the cache thrashing that otherwise occur on the state variable.
- * Really small systems (less than a couple of tens of CPUs) should
- * instead use a single global atomically incremented counter, and later
- * versions of this will automatically reconfigure themselves accordingly.
- */
-static unsigned long rcu_sysidle_delay(void)
-{
-	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
-		return 0;
-	return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
-}
-
-/*
- * Advance the full-system-idle state.  This is invoked when all of
- * the non-timekeeping CPUs are idle.
- */
-static void rcu_sysidle(unsigned long j)
-{
-	/* Check the current state. */
-	switch (READ_ONCE(full_sysidle_state)) {
-	case RCU_SYSIDLE_NOT:
-
-		/* First time all are idle, so note a short idle period. */
-		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT);
-		break;
-
-	case RCU_SYSIDLE_SHORT:
-
-		/*
-		 * Idle for a bit, time to advance to next state?
-		 * cmpxchg failure means race with non-idle, let them win.
-		 */
-		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
-			(void)cmpxchg(&full_sysidle_state,
-				      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
-		break;
-
-	case RCU_SYSIDLE_LONG:
-
-		/*
-		 * Do an additional check pass before advancing to full.
-		 * cmpxchg failure means race with non-idle, let them win.
-		 */
-		if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
-			(void)cmpxchg(&full_sysidle_state,
-				      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
-		break;
-
-	default:
-		break;
-	}
-}
-
-/*
- * Found a non-idle non-timekeeping CPU, so kick the system-idle state
- * back to the beginning.
- */
-static void rcu_sysidle_cancel(void)
-{
-	smp_mb();
-	if (full_sysidle_state > RCU_SYSIDLE_SHORT)
-		WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT);
-}
-
-/*
- * Update the sysidle state based on the results of a force-quiescent-state
- * scan of the CPUs' dyntick-idle state.
- */
-static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
-			       unsigned long maxj, bool gpkt)
-{
-	if (rsp != rcu_state_p)
-		return;  /* Wrong flavor, ignore. */
-	if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
-		return;  /* Running state machine from timekeeping CPU. */
-	if (isidle)
-		rcu_sysidle(maxj);    /* More idle! */
-	else
-		rcu_sysidle_cancel(); /* Idle is over. */
-}
-
-/*
- * Wrapper for rcu_sysidle_report() when called from the grace-period
- * kthread's context.
- */
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj)
-{
-	/* If there are no nohz_full= CPUs, no need to track this. */
-	if (!tick_nohz_full_enabled())
-		return;
-
-	rcu_sysidle_report(rsp, isidle, maxj, true);
-}
-
-/* Callback and function for forcing an RCU grace period. */
-struct rcu_sysidle_head {
-	struct rcu_head rh;
-	int inuse;
-};
-
-static void rcu_sysidle_cb(struct rcu_head *rhp)
-{
-	struct rcu_sysidle_head *rshp;
-
-	/*
-	 * The following memory barrier is needed to replace the
-	 * memory barriers that would normally be in the memory
-	 * allocator.
-	 */
-	smp_mb();  /* grace period precedes setting inuse. */
-
-	rshp = container_of(rhp, struct rcu_sysidle_head, rh);
-	WRITE_ONCE(rshp->inuse, 0);
-}
-
-/*
- * Check to see if the system is fully idle, other than the timekeeping CPU.
- * The caller must have disabled interrupts.  This is not intended to be
- * called unless tick_nohz_full_enabled().
- */
-bool rcu_sys_is_idle(void)
-{
-	static struct rcu_sysidle_head rsh;
-	int rss = READ_ONCE(full_sysidle_state);
-
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!");
-
-	if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
-		return false;
-
-	/* Handle small-system case by doing a full scan of CPUs. */
-	if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
-		int oldrss = rss - 1;
-
-		/*
-		 * One pass to advance to each state up to _FULL.
-		 * Give up if any pass fails to advance the state.
-		 */
-		while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
-			int cpu;
-			bool isidle = true;
-			unsigned long maxj = jiffies - ULONG_MAX / 4;
-			struct rcu_data *rdp;
-
-			/* Scan all the CPUs looking for nonidle CPUs. */
-			for_each_possible_cpu(cpu) {
-				rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
-				rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
-				if (!isidle)
-					break;
-			}
-			rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
-			oldrss = rss;
-			rss = READ_ONCE(full_sysidle_state);
-		}
-	}
-
-	/* If this is the first observation of an idle period, record it. */
-	if (rss == RCU_SYSIDLE_FULL) {
-		rss = cmpxchg(&full_sysidle_state,
-			      RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
-		return rss == RCU_SYSIDLE_FULL;
-	}
-
-	smp_mb(); /* ensure rss load happens before later caller actions. */
-
-	/* If already fully idle, tell the caller (in case of races). */
-	if (rss == RCU_SYSIDLE_FULL_NOTED)
-		return true;
-
-	/*
-	 * If we aren't there yet, and a grace period is not in flight,
-	 * initiate a grace period.  Either way, tell the caller that
-	 * we are not there yet.  We use an xchg() rather than an assignment
-	 * to make up for the memory barriers that would otherwise be
-	 * provided by the memory allocator.
-	 */
-	if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
-	    !rcu_gp_in_progress(rcu_state_p) &&
-	    !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
-		call_rcu(&rsh.rh, rcu_sysidle_cb);
-	return false;
-}
-
-/*
- * Initialize dynticks sysidle state for CPUs coming online.
- */
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-	rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
-}
-
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
-static void rcu_sysidle_enter(int irq)
-{
-}
-
-static void rcu_sysidle_exit(int irq)
-{
-}
-
-static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
-				  unsigned long *maxj)
-{
-}
-
-static bool is_sysidle_rcu_state(struct rcu_state *rsp)
-{
-	return false;
-}
-
-static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
-				  unsigned long maxj)
-{
-}
-
-static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-
 /*
  * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
  * grace-period kthread will do force_quiescent_state() processing?
@@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void)
 
 	if (!tick_nohz_full_enabled())
 		return;
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	cpu = tick_do_timer_cpu;
-	if (cpu >= 0 && cpu < nr_cpu_ids)
-		set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 	housekeeping_affine(current);
-#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }
 
 /* Record the current task on dyntick-idle entry. */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 4008d9f95dd7..ac09bc29eb08 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL
 	 Note the boot CPU will still be kept outside the range to
 	 handle the timekeeping duty.
 
-config NO_HZ_FULL_SYSIDLE
-	bool "Detect full-system idle state for full dynticks system"
-	depends on NO_HZ_FULL
-	default n
-	help
-	 At least one CPU must keep the scheduling-clock tick running for
-	 timekeeping purposes whenever there is a non-idle CPU, where
-	 "non-idle" also includes dynticks CPUs as long as they are
-	 running non-idle tasks.  Because the underlying adaptive-tick
-	 support cannot distinguish between all CPUs being idle and
-	 all CPUs each running a single task in dynticks mode, the
-	 underlying support simply ensures that there is always a CPU
-	 handling the scheduling-clock tick, whether or not all CPUs
-	 are idle.  This Kconfig option enables scalable detection of
-	 the all-CPUs-idle state, thus allowing the scheduling-clock
-	 tick to be disabled when all CPUs are idle.  Note that scalable
-	 detection of the all-CPUs-idle state means that larger systems
-	 will be slower to declare the all-CPUs-idle state.
-
-	 Say Y if you would like to help debug all-CPUs-idle detection.
-
-	 Say N if you are unsure.
-
-config NO_HZ_FULL_SYSIDLE_SMALL
-	int "Number of CPUs above which large-system approach is used"
-	depends on NO_HZ_FULL_SYSIDLE
-	range 1 NR_CPUS
-	default 8
-	help
-	 The full-system idle detection mechanism takes a lazy approach
-	 on large systems, as is required to attain decent scalability.
-	 However, on smaller systems, scalability is not anywhere near as
-	 large a concern as is energy efficiency.  The sysidle subsystem
-	 therefore uses a fast but non-scalable algorithm for small
-	 systems and a lazier but scalable algorithm for large systems.
-	 This Kconfig parameter defines the number of CPUs in the largest
-	 system that will be considered to be "small".
-
-	 The default value will be fine in most cases.	Battery-powered
-	 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
-	 numbers of CPUs, and (3) are suffering from battery-lifetime
-	 problems due to long sysidle latencies might wish to experiment
-	 with larger values for this Kconfig parameter.  On the other
-	 hand, they might be even better served by disabling NO_HZ_FULL
-	 entirely, given that NO_HZ_FULL is intended for HPC and
-	 real-time workloads that at present do not tend to be run on
-	 battery-powered systems.
-
-	 Take the default if you are unsure.
-
 config NO_HZ
 	bool "Old Idle dynticks config"
 	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
index b9ddd3beeb9a..0f4759f4232e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=y
 CONFIG_NO_HZ_FULL_ALL=n
-CONFIG_NO_HZ_FULL_SYSIDLE=y
 CONFIG_RCU_FAST_NO_HZ=n
 CONFIG_RCU_TRACE=y
 CONFIG_HOTPLUG_CPU=y
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
index 24396ae8355b..a75b16991a92 100644
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -18,7 +18,6 @@ CONFIG_PROVE_RCU
 
 	In common code tested by TREE_RCU test cases.
 
-CONFIG_NO_HZ_FULL_SYSIDLE
 CONFIG_RCU_NOCB_CPU
 
 	Meaningless for TINY_RCU.
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index b5ea8489969a..519e06d34d0b 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -9,8 +9,7 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
 CONFIG_HOTPLUG_CPU -- Do half.  (Every second.)
 CONFIG_HZ_PERIODIC -- Do one.
 CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
-CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE.
-CONFIG_NO_HZ_FULL_SYSIDLE -- Do one.
+CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
 CONFIG_PREEMPT -- Do half.  (First three and #8.)
 CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
 CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
@@ -48,10 +47,6 @@ CONFIG_64BIT
 
 	Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
 
-CONFIG_NO_HZ_FULL_SYSIDLE_SMALL
-
-	Defer until Frederic uses this.
-
 CONFIG_PREEMPT_COUNT
 CONFIG_PREEMPT_RCU
 
-- 
cgit v1.3-14-g43fede


From 4e4bea7427062ec15df7084f97728e2a44d912e3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 11 May 2017 15:33:23 -0700
Subject: rcu: Remove typecheck() from RCU locking wrapper functions

Because raw_spin_lock_irqsave() and raw_spin_unlock_irqrestore()
both do typecheck() on their flags argument, there is no point in
duplicating this check in raw_spin_lock_irqsave_rcu_node() and
raw_spin_unlock_irqrestore_rcu_node().  This commit therefore saves
a few lines by removing this duplicated check.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 2a75beb883c8..bc55b5716c37 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -334,18 +334,14 @@ do {									\
 #define raw_spin_unlock_irq_rcu_node(p)					\
 	raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock))
 
-#define raw_spin_lock_irqsave_rcu_node(rnp, flags)			\
+#define raw_spin_lock_irqsave_rcu_node(p, flags)			\
 do {									\
-	typecheck(unsigned long, flags);				\
-	raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags);	\
+	raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
 	smp_mb__after_unlock_lock();					\
 } while (0)
 
-#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags)			\
-do {									\
-	typecheck(unsigned long, flags);				\
-	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags);	\
-} while (0)
+#define raw_spin_unlock_irqrestore_rcu_node(p, flags)			\
+	raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
 
 #define raw_spin_trylock_rcu_node(p)					\
 ({									\
-- 
cgit v1.3-14-g43fede


From c4a09ff752e164c020bced6513e2008f992a02e6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 12 May 2017 14:37:19 -0700
Subject: rcu: Remove the now-obsolete PROVE_RCU_REPEATEDLY Kconfig option

The PROVE_RCU_REPEATEDLY Kconfig option was initially added due to
the volume of messages from PROVE_RCU: Doing just one per boot would
have required excessive numbers of boots to locate them all.  However,
PROVE_RCU messages are now relatively rare, so there is no longer any
reason to need more than one such message per boot.  This commit therefore
removes the PROVE_RCU_REPEATEDLY Kconfig option.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c                                   |  4 ----
 kernel/rcu/tree_plugin.h                                   |  4 +---
 lib/Kconfig.debug                                          | 14 --------------
 tools/testing/selftests/rcutorture/configs/rcu/TINY02      |  1 -
 .../testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |  1 -
 5 files changed, 1 insertion(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index cceb9534338a..7d2499bec5fe 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4490,10 +4490,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 {
 	struct task_struct *curr = current;
 
-#ifndef CONFIG_PROVE_RCU_REPEATEDLY
-	if (!debug_locks_off())
-		return;
-#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
 	/* Note: the following can be executed concurrently, so be careful. */
 	pr_warn("\n");
 	pr_warn("=============================\n");
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index f524d967f7b6..7f5919ab24c4 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -79,9 +79,7 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
 	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-	if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY))
-		pr_info("\tRCU lockdep checking is permanently enabled.\n");
-	else if (IS_ENABLED(CONFIG_PROVE_RCU))
+	if (IS_ENABLED(CONFIG_PROVE_RCU))
 		pr_info("\tRCU lockdep checking is enabled.\n");
 	if (RCU_NUM_LVLS >= 4)
 		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 960c5d2d3c03..762deab304fe 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1306,20 +1306,6 @@ menu "RCU Debugging"
 config PROVE_RCU
 	def_bool PROVE_LOCKING
 
-config PROVE_RCU_REPEATEDLY
-	bool "RCU debugging: don't disable PROVE_RCU on first splat"
-	depends on PROVE_RCU
-	default n
-	help
-	 By itself, PROVE_RCU will disable checking upon issuing the
-	 first warning (or "splat").  This feature prevents such
-	 disabling, allowing multiple RCU-lockdep warnings to be printed
-	 on a single reboot.
-
-	 Say Y to allow multiple RCU-lockdep warnings per boot.
-
-	 Say N if you are unsure.
-
 config SPARSE_RCU_POINTER
 	bool "RCU debugging: sparse-based checks for pointer usage"
 	default n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
index 9007cd979df7..1f6bebbf5da8 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -8,7 +8,6 @@ CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=n
 CONFIG_RCU_TRACE=y
 CONFIG_PROVE_LOCKING=y
-CONFIG_PROVE_RCU_REPEATEDLY=y
 #CHECK#CONFIG_PROVE_RCU=y
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_DEBUG_OBJECTS=y
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 519e06d34d0b..b778a28f1386 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -13,7 +13,6 @@ CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
 CONFIG_PREEMPT -- Do half.  (First three and #8.)
 CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
 CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
-CONFIG_PROVE_RCU_REPEATEDLY -- Do one.
 CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
 CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
 CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
-- 
cgit v1.3-14-g43fede


From 7f0cd6333086ae09962791c31f0d4845a3329df9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 14 May 2017 17:06:30 -0700
Subject: srcu: Fix rcutorture-statistics typo

The function srcutorture_get_gp_data() duplicated the check for
sp->batch_check0.head instead of also checking sp->batch_check1.head.
The only effect of this typo would be for rcutorture statistics to
understate the fraction of time that an SRCU grace period was in flight,
and only for Classic SRCU.  This commit fixes this typo.

Reported-by: David Binderman <dcb314@hotmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bc55b5716c37..27f871c88e0a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -469,7 +469,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
 	*flags = 0;
 	*completed = sp->completed;
 	*gpnum = *completed;
-	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head)
+	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head)
 		(*gpnum)++;
 }
 
-- 
cgit v1.3-14-g43fede


From bd8cc5a062f41e334596edbe823e2fa0adddd1b7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 May 2017 14:57:01 -0700
Subject: srcu: Remove Classic SRCU

Classic SRCU was only ever intended to be a fallback in case of issues
with Tree/Tiny SRCU, and the latter two are doing quite well in testing.
This commit therefore removes Classic SRCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h                               |   2 -
 include/linux/srcuclassic.h                        | 100 ---
 init/Kconfig                                       |  21 +-
 kernel/rcu/Makefile                                |   1 -
 kernel/rcu/rcu.h                                   |  16 -
 kernel/rcu/rcutorture.c                            |  17 -
 kernel/rcu/srcu.c                                  | 668 ---------------------
 .../selftests/rcutorture/configs/rcu/CFLIST        |   1 -
 .../selftests/rcutorture/configs/rcu/SRCU-C        |  11 -
 .../rcutorture/configs/rcuperf/SRCUCLASSIC         |  16 -
 10 files changed, 2 insertions(+), 851 deletions(-)
 delete mode 100644 include/linux/srcuclassic.h
 delete mode 100644 kernel/rcu/srcu.c
 delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-C
 delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC

(limited to 'kernel')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 5f509018e6b5..39af9bc0f653 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -60,8 +60,6 @@ int init_srcu_struct(struct srcu_struct *sp);
 #include <linux/srcutiny.h>
 #elif defined(CONFIG_TREE_SRCU)
 #include <linux/srcutree.h>
-#elif defined(CONFIG_CLASSIC_SRCU)
-#include <linux/srcuclassic.h>
 #elif defined(CONFIG_SRCU)
 #error "Unknown SRCU implementation specified to kernel configuration"
 #else
diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h
deleted file mode 100644
index 67db4a36ef0d..000000000000
--- a/include/linux/srcuclassic.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion,
- *	classic v4.11 variant.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2017
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- */
-
-#ifndef _LINUX_SRCU_CLASSIC_H
-#define _LINUX_SRCU_CLASSIC_H
-
-struct srcu_array {
-	unsigned long lock_count[2];
-	unsigned long unlock_count[2];
-};
-
-struct rcu_batch {
-	struct rcu_head *head, **tail;
-};
-
-#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
-
-struct srcu_struct {
-	unsigned long completed;
-	struct srcu_array __percpu *per_cpu_ref;
-	spinlock_t queue_lock; /* protect ->batch_queue, ->running */
-	bool running;
-	/* callbacks just queued */
-	struct rcu_batch batch_queue;
-	/* callbacks try to do the first check_zero */
-	struct rcu_batch batch_check0;
-	/* callbacks done with the first check_zero and the flip */
-	struct rcu_batch batch_check1;
-	struct rcu_batch batch_done;
-	struct delayed_work work;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-};
-
-void process_srcu(struct work_struct *work);
-
-#define __SRCU_STRUCT_INIT(name)					\
-	{								\
-		.completed = -300,					\
-		.per_cpu_ref = &name##_srcu_array,			\
-		.queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),	\
-		.running = false,					\
-		.batch_queue = RCU_BATCH_INIT(name.batch_queue),	\
-		.batch_check0 = RCU_BATCH_INIT(name.batch_check0),	\
-		.batch_check1 = RCU_BATCH_INIT(name.batch_check1),	\
-		.batch_done = RCU_BATCH_INIT(name.batch_done),		\
-		.work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\
-		__SRCU_DEP_MAP_INIT(name)				\
-	}
-
-/*
- * Define and initialize a srcu struct at build time.
- * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it.
- *
- * Note that although DEFINE_STATIC_SRCU() hides the name from other
- * files, the per-CPU variable rules nevertheless require that the
- * chosen name be globally unique.  These rules also prohibit use of
- * DEFINE_STATIC_SRCU() within a function.  If these rules are too
- * restrictive, declare the srcu_struct manually.  For example, in
- * each file:
- *
- *	static struct srcu_struct my_srcu;
- *
- * Then, before the first use of each my_srcu, manually initialize it:
- *
- *	init_srcu_struct(&my_srcu);
- *
- * See include/linux/percpu-defs.h for the rules on per-CPU variables.
- */
-#define __DEFINE_SRCU(name, is_static)					\
-	static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\
-	is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name)
-#define DEFINE_SRCU(name)		__DEFINE_SRCU(name, /* not static */)
-#define DEFINE_STATIC_SRCU(name)	__DEFINE_SRCU(name, static)
-
-void synchronize_srcu_expedited(struct srcu_struct *sp);
-void srcu_barrier(struct srcu_struct *sp);
-
-#endif
diff --git a/init/Kconfig b/init/Kconfig
index 6f257d51f582..2aa14ff40e88 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -526,32 +526,15 @@ config SRCU
 	  permits arbitrary sleeping or blocking within RCU read-side critical
 	  sections.
 
-config CLASSIC_SRCU
-	bool "Use v4.11 classic SRCU implementation"
-	default n
-	depends on RCU_EXPERT && SRCU
-	help
-	  This option selects the traditional well-tested classic SRCU
-	  implementation from v4.11, as might be desired for enterprise
-	  Linux distributions.  Without this option, the shiny new
-	  Tiny SRCU and Tree SRCU implementations are used instead.
-	  At some point, it is hoped that Tiny SRCU and Tree SRCU
-	  will accumulate enough test time and confidence to allow
-	  Classic SRCU to be dropped entirely.
-
-	  Say Y if you need a rock-solid SRCU.
-
-	  Say N if you would like help test Tree SRCU.
-
 config TINY_SRCU
 	bool
-	default y if SRCU && TINY_RCU && !CLASSIC_SRCU
+	default y if SRCU && TINY_RCU
 	help
 	  This option selects the single-CPU non-preemptible version of SRCU.
 
 config TREE_SRCU
 	bool
-	default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
+	default y if SRCU && !TINY_RCU
 	help
 	  This option selects the full-fledged version of SRCU.
 
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 23803c7d5180..3945337c8ce4 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,7 +3,6 @@
 KCOV_INSTRUMENT := n
 
 obj-y += update.o sync.o
-obj-$(CONFIG_CLASSIC_SRCU) += srcu.o
 obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 27f871c88e0a..d06c42deee0b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -457,22 +457,6 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
 			     struct srcu_struct *sp, int *flags,
 			     unsigned long *gpnum, unsigned long *completed);
 
-#elif defined(CONFIG_CLASSIC_SRCU)
-
-static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
-					   struct srcu_struct *sp, int *flags,
-					   unsigned long *gpnum,
-					   unsigned long *completed)
-{
-	if (test_type != SRCU_FLAVOR)
-		return;
-	*flags = 0;
-	*completed = sp->completed;
-	*gpnum = *completed;
-	if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head)
-		(*gpnum)++;
-}
-
 #endif
 
 #ifdef CONFIG_TINY_RCU
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 03cdf79e73d4..b8f7f8ce8575 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -564,31 +564,19 @@ static void srcu_torture_stats(void)
 	int __maybe_unused cpu;
 	int idx;
 
-#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU)
 #ifdef CONFIG_TREE_SRCU
 	idx = srcu_ctlp->srcu_idx & 0x1;
-#else /* #ifdef CONFIG_TREE_SRCU */
-	idx = srcu_ctlp->completed & 0x1;
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
 	pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
 		 torture_type, TORTURE_FLAG, idx);
 	for_each_possible_cpu(cpu) {
 		unsigned long l0, l1;
 		unsigned long u0, u1;
 		long c0, c1;
-#ifdef CONFIG_TREE_SRCU
 		struct srcu_data *counts;
 
 		counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
 		u0 = counts->srcu_unlock_count[!idx];
 		u1 = counts->srcu_unlock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
-		struct srcu_array *counts;
-
-		counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu);
-		u0 = counts->unlock_count[!idx];
-		u1 = counts->unlock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
 
 		/*
 		 * Make sure that a lock is always counted if the corresponding
@@ -596,13 +584,8 @@ static void srcu_torture_stats(void)
 		 */
 		smp_rmb();
 
-#ifdef CONFIG_TREE_SRCU
 		l0 = counts->srcu_lock_count[!idx];
 		l1 = counts->srcu_lock_count[idx];
-#else /* #ifdef CONFIG_TREE_SRCU */
-		l0 = counts->lock_count[!idx];
-		l1 = counts->lock_count[idx];
-#endif /* #else #ifdef CONFIG_TREE_SRCU */
 
 		c0 = l0 - u0;
 		c1 = l1 - u1;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
deleted file mode 100644
index 4e3f558409a0..000000000000
--- a/kernel/rcu/srcu.c
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Sleepable Read-Copy Update mechanism for mutual exclusion.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (C) IBM Corporation, 2006
- * Copyright (C) Fujitsu, 2012
- *
- * Author: Paul McKenney <paulmck@us.ibm.com>
- *	   Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- *		Documentation/RCU/ *.txt
- *
- */
-
-#include <linux/export.h>
-#include <linux/mutex.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/delay.h>
-#include <linux/srcu.h>
-
-#include "rcu.h"
-
-/*
- * Initialize an rcu_batch structure to empty.
- */
-static inline void rcu_batch_init(struct rcu_batch *b)
-{
-	b->head = NULL;
-	b->tail = &b->head;
-}
-
-/*
- * Enqueue a callback onto the tail of the specified rcu_batch structure.
- */
-static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head)
-{
-	*b->tail = head;
-	b->tail = &head->next;
-}
-
-/*
- * Is the specified rcu_batch structure empty?
- */
-static inline bool rcu_batch_empty(struct rcu_batch *b)
-{
-	return b->tail == &b->head;
-}
-
-/*
- * Remove the callback at the head of the specified rcu_batch structure
- * and return a pointer to it, or return NULL if the structure is empty.
- */
-static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b)
-{
-	struct rcu_head *head;
-
-	if (rcu_batch_empty(b))
-		return NULL;
-
-	head = b->head;
-	b->head = head->next;
-	if (b->tail == &head->next)
-		rcu_batch_init(b);
-
-	return head;
-}
-
-/*
- * Move all callbacks from the rcu_batch structure specified by "from" to
- * the structure specified by "to".
- */
-static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
-{
-	if (!rcu_batch_empty(from)) {
-		*to->tail = from->head;
-		to->tail = from->tail;
-		rcu_batch_init(from);
-	}
-}
-
-static int init_srcu_struct_fields(struct srcu_struct *sp)
-{
-	sp->completed = 0;
-	spin_lock_init(&sp->queue_lock);
-	sp->running = false;
-	rcu_batch_init(&sp->batch_queue);
-	rcu_batch_init(&sp->batch_check0);
-	rcu_batch_init(&sp->batch_check1);
-	rcu_batch_init(&sp->batch_done);
-	INIT_DELAYED_WORK(&sp->work, process_srcu);
-	sp->per_cpu_ref = alloc_percpu(struct srcu_array);
-	return sp->per_cpu_ref ? 0 : -ENOMEM;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-int __init_srcu_struct(struct srcu_struct *sp, const char *name,
-		       struct lock_class_key *key)
-{
-	/* Don't re-initialize a lock while it is held. */
-	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
-	lockdep_init_map(&sp->dep_map, name, key, 0);
-	return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(__init_srcu_struct);
-
-#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/**
- * init_srcu_struct - initialize a sleep-RCU structure
- * @sp: structure to initialize.
- *
- * Must invoke this on a given srcu_struct before passing that srcu_struct
- * to any other function.  Each srcu_struct represents a separate domain
- * of SRCU protection.
- */
-int init_srcu_struct(struct srcu_struct *sp)
-{
-	return init_srcu_struct_fields(sp);
-}
-EXPORT_SYMBOL_GPL(init_srcu_struct);
-
-#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-/*
- * Returns approximate total of the readers' ->lock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
-{
-	int cpu;
-	unsigned long sum = 0;
-
-	for_each_possible_cpu(cpu) {
-		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
-		sum += READ_ONCE(cpuc->lock_count[idx]);
-	}
-	return sum;
-}
-
-/*
- * Returns approximate total of the readers' ->unlock_count[] values for the
- * rank of per-CPU counters specified by idx.
- */
-static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
-{
-	int cpu;
-	unsigned long sum = 0;
-
-	for_each_possible_cpu(cpu) {
-		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
-		sum += READ_ONCE(cpuc->unlock_count[idx]);
-	}
-	return sum;
-}
-
-/*
- * Return true if the number of pre-existing readers is determined to
- * be zero.
- */
-static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
-{
-	unsigned long unlocks;
-
-	unlocks = srcu_readers_unlock_idx(sp, idx);
-
-	/*
-	 * Make sure that a lock is always counted if the corresponding unlock
-	 * is counted. Needs to be a smp_mb() as the read side may contain a
-	 * read from a variable that is written to before the synchronize_srcu()
-	 * in the write side. In this case smp_mb()s A and B act like the store
-	 * buffering pattern.
-	 *
-	 * This smp_mb() also pairs with smp_mb() C to prevent accesses after the
-	 * synchronize_srcu() from being executed before the grace period ends.
-	 */
-	smp_mb(); /* A */
-
-	/*
-	 * If the locks are the same as the unlocks, then there must have
-	 * been no readers on this index at some time in between. This does not
-	 * mean that there are no more readers, as one could have read the
-	 * current index but not have incremented the lock counter yet.
-	 *
-	 * Possible bug: There is no guarantee that there haven't been ULONG_MAX
-	 * increments of ->lock_count[] since the unlocks were counted, meaning
-	 * that this could return true even if there are still active readers.
-	 * Since there are no memory barriers around srcu_flip(), the CPU is not
-	 * required to increment ->completed before running
-	 * srcu_readers_unlock_idx(), which means that there could be an
-	 * arbitrarily large number of critical sections that execute after
-	 * srcu_readers_unlock_idx() but use the old value of ->completed.
-	 */
-	return srcu_readers_lock_idx(sp, idx) == unlocks;
-}
-
-/**
- * srcu_readers_active - returns true if there are readers. and false
- *                       otherwise
- * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
- *
- * Note that this is not an atomic primitive, and can therefore suffer
- * severe errors when invoked on an active srcu_struct.  That said, it
- * can be useful as an error check at cleanup time.
- */
-static bool srcu_readers_active(struct srcu_struct *sp)
-{
-	int cpu;
-	unsigned long sum = 0;
-
-	for_each_possible_cpu(cpu) {
-		struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu);
-
-		sum += READ_ONCE(cpuc->lock_count[0]);
-		sum += READ_ONCE(cpuc->lock_count[1]);
-		sum -= READ_ONCE(cpuc->unlock_count[0]);
-		sum -= READ_ONCE(cpuc->unlock_count[1]);
-	}
-	return sum;
-}
-
-/**
- * cleanup_srcu_struct - deconstruct a sleep-RCU structure
- * @sp: structure to clean up.
- *
- * Must invoke this only after you are finished using a given srcu_struct
- * that was initialized via init_srcu_struct().  This code does some
- * probabalistic checking, spotting late uses of srcu_read_lock(),
- * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
- * If any such late uses are detected, the per-CPU memory associated with
- * the srcu_struct is simply leaked and WARN_ON() is invoked.  If the
- * caller frees the srcu_struct itself, a use-after-free crash will likely
- * ensue, but at least there will be a warning printed.
- */
-void cleanup_srcu_struct(struct srcu_struct *sp)
-{
-	if (WARN_ON(srcu_readers_active(sp)))
-		return; /* Leakage unless caller handles error. */
-	free_percpu(sp->per_cpu_ref);
-	sp->per_cpu_ref = NULL;
-}
-EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
-
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
- */
-int __srcu_read_lock(struct srcu_struct *sp)
-{
-	int idx;
-
-	idx = READ_ONCE(sp->completed) & 0x1;
-	this_cpu_inc(sp->per_cpu_ref->lock_count[idx]);
-	smp_mb(); /* B */  /* Avoid leaking the critical section. */
-	return idx;
-}
-EXPORT_SYMBOL_GPL(__srcu_read_lock);
-
-/*
- * Removes the count for the old reader from the appropriate per-CPU
- * element of the srcu_struct.  Note that this may well be a different
- * CPU than that which was incremented by the corresponding srcu_read_lock().
- */
-void __srcu_read_unlock(struct srcu_struct *sp, int idx)
-{
-	smp_mb(); /* C */  /* Avoid leaking the critical section. */
-	this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]);
-}
-EXPORT_SYMBOL_GPL(__srcu_read_unlock);
-
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited().  We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections.  If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods.  This approach
- * has done well in testing, so there is no need for a config parameter.
- */
-#define SRCU_RETRY_CHECK_DELAY		5
-#define SYNCHRONIZE_SRCU_TRYCOUNT	2
-#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT	12
-
-/*
- * @@@ Wait until all pre-existing readers complete.  Such readers
- * will have used the index specified by "idx".
- * the caller should ensures the ->completed is not changed while checking
- * and idx = (->completed & 1) ^ 1
- */
-static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
-{
-	for (;;) {
-		if (srcu_readers_active_idx_check(sp, idx))
-			return true;
-		if (--trycount <= 0)
-			return false;
-		udelay(SRCU_RETRY_CHECK_DELAY);
-	}
-}
-
-/*
- * Increment the ->completed counter so that future SRCU readers will
- * use the other rank of the ->(un)lock_count[] arrays.  This allows
- * us to wait for pre-existing readers in a starvation-free manner.
- */
-static void srcu_flip(struct srcu_struct *sp)
-{
-	WRITE_ONCE(sp->completed, sp->completed + 1);
-
-	/*
-	 * Ensure that if the updater misses an __srcu_read_unlock()
-	 * increment, that task's next __srcu_read_lock() will see the
-	 * above counter update.  Note that both this memory barrier
-	 * and the one in srcu_readers_active_idx_check() provide the
-	 * guarantee for __srcu_read_lock().
-	 */
-	smp_mb(); /* D */  /* Pairs with C. */
-}
-
-/*
- * Enqueue an SRCU callback on the specified srcu_struct structure,
- * initiating grace-period processing if it is not already running.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing SRCU read-side critical section.  On systems with
- * more than one CPU, this means that when "func()" is invoked, each CPU
- * is guaranteed to have executed a full memory barrier since the end of
- * its last corresponding SRCU read-side critical section whose beginning
- * preceded the call to call_rcu().  It also means that each CPU executing
- * an SRCU read-side critical section that continues beyond the start of
- * "func()" must have executed a memory barrier after the call_rcu()
- * but before the beginning of that SRCU read-side critical section.
- * Note that these guarantees include CPUs that are offline, idle, or
- * executing in user mode, as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting SRCU callback function "func()", then both CPU A and CPU
- * B are guaranteed to execute a full memory barrier during the time
- * interval between the call to call_rcu() and the invocation of "func()".
- * This guarantee applies even if CPU A and CPU B are the same CPU (but
- * again only if the system has more than one CPU).
- *
- * Of course, these guarantees apply only for invocations of call_srcu(),
- * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
- * srcu_struct structure.
- */
-void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
-	       rcu_callback_t func)
-{
-	unsigned long flags;
-
-	head->next = NULL;
-	head->func = func;
-	spin_lock_irqsave(&sp->queue_lock, flags);
-	smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
-	rcu_batch_queue(&sp->batch_queue, head);
-	if (!sp->running) {
-		sp->running = true;
-		queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
-	}
-	spin_unlock_irqrestore(&sp->queue_lock, flags);
-}
-EXPORT_SYMBOL_GPL(call_srcu);
-
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
-static void srcu_reschedule(struct srcu_struct *sp);
-
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
- */
-static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
-{
-	struct rcu_synchronize rcu;
-	struct rcu_head *head = &rcu.head;
-	bool done = false;
-
-	RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
-			 lock_is_held(&rcu_bh_lock_map) ||
-			 lock_is_held(&rcu_lock_map) ||
-			 lock_is_held(&rcu_sched_lock_map),
-			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
-
-	might_sleep();
-	init_completion(&rcu.completion);
-
-	head->next = NULL;
-	head->func = wakeme_after_rcu;
-	spin_lock_irq(&sp->queue_lock);
-	smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */
-	if (!sp->running) {
-		/* steal the processing owner */
-		sp->running = true;
-		rcu_batch_queue(&sp->batch_check0, head);
-		spin_unlock_irq(&sp->queue_lock);
-
-		srcu_advance_batches(sp, trycount);
-		if (!rcu_batch_empty(&sp->batch_done)) {
-			BUG_ON(sp->batch_done.head != head);
-			rcu_batch_dequeue(&sp->batch_done);
-			done = true;
-		}
-		/* give the processing owner to work_struct */
-		srcu_reschedule(sp);
-	} else {
-		rcu_batch_queue(&sp->batch_queue, head);
-		spin_unlock_irq(&sp->queue_lock);
-	}
-
-	if (!done) {
-		wait_for_completion(&rcu.completion);
-		smp_mb(); /* Caller's later accesses after GP. */
-	}
-
-}
-
-/**
- * synchronize_srcu - wait for prior SRCU read-side critical-section completion
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for the count to drain to zero of both indexes. To avoid the
- * possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->completed & 1) ^ 1) to drain to zero at first,
- * and then flip the completed and wait for the count of the other index.
- *
- * Can block; must be called from process context.
- *
- * Note that it is illegal to call synchronize_srcu() from the corresponding
- * SRCU read-side critical section; doing so will result in deadlock.
- * However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section,
- * as long as the resulting graph of srcu_structs is acyclic.
- *
- * There are memory-ordering constraints implied by synchronize_srcu().
- * On systems with more than one CPU, when synchronize_srcu() returns,
- * each CPU is guaranteed to have executed a full memory barrier since
- * the end of its last corresponding SRCU-sched read-side critical section
- * whose beginning preceded the call to synchronize_srcu().  In addition,
- * each CPU having an SRCU read-side critical section that extends beyond
- * the return from synchronize_srcu() is guaranteed to have executed a
- * full memory barrier after the beginning of synchronize_srcu() and before
- * the beginning of that SRCU read-side critical section.  Note that these
- * guarantees include CPUs that are offline, idle, or executing in user mode,
- * as well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked synchronize_srcu(), which returned
- * to its caller on CPU B, then both CPU A and CPU B are guaranteed
- * to have executed a full memory barrier during the execution of
- * synchronize_srcu().  This guarantee applies even if CPU A and CPU B
- * are the same CPU, but again only if the system has more than one CPU.
- *
- * Of course, these memory-ordering guarantees apply only when
- * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
- * passed the same srcu_struct structure.
- */
-void synchronize_srcu(struct srcu_struct *sp)
-{
-	__synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
-			   ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
-			   : SYNCHRONIZE_SRCU_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
-
-/**
- * synchronize_srcu_expedited - Brute-force SRCU grace period
- * @sp: srcu_struct with which to synchronize.
- *
- * Wait for an SRCU grace period to elapse, but be more aggressive about
- * spinning rather than blocking when waiting.
- *
- * Note that synchronize_srcu_expedited() has the same deadlock and
- * memory-ordering properties as does synchronize_srcu().
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
-	__synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
- * @sp: srcu_struct on which to wait for in-flight callbacks.
- */
-void srcu_barrier(struct srcu_struct *sp)
-{
-	synchronize_srcu(sp);
-}
-EXPORT_SYMBOL_GPL(srcu_barrier);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
-unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
-	return sp->completed;
-}
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
-
-#define SRCU_CALLBACK_BATCH	10
-#define SRCU_INTERVAL		1
-
-/*
- * Move any new SRCU callbacks to the first stage of the SRCU grace
- * period pipeline.
- */
-static void srcu_collect_new(struct srcu_struct *sp)
-{
-	if (!rcu_batch_empty(&sp->batch_queue)) {
-		spin_lock_irq(&sp->queue_lock);
-		rcu_batch_move(&sp->batch_check0, &sp->batch_queue);
-		spin_unlock_irq(&sp->queue_lock);
-	}
-}
-
-/*
- * Core SRCU state machine.  Advance callbacks from ->batch_check0 to
- * ->batch_check1 and then to ->batch_done as readers drain.
- */
-static void srcu_advance_batches(struct srcu_struct *sp, int trycount)
-{
-	int idx = 1 ^ (sp->completed & 1);
-
-	/*
-	 * Because readers might be delayed for an extended period after
-	 * fetching ->completed for their index, at any point in time there
-	 * might well be readers using both idx=0 and idx=1.  We therefore
-	 * need to wait for readers to clear from both index values before
-	 * invoking a callback.
-	 */
-
-	if (rcu_batch_empty(&sp->batch_check0) &&
-	    rcu_batch_empty(&sp->batch_check1))
-		return; /* no callbacks need to be advanced */
-
-	if (!try_check_zero(sp, idx, trycount))
-		return; /* failed to advance, will try after SRCU_INTERVAL */
-
-	/*
-	 * The callbacks in ->batch_check1 have already done with their
-	 * first zero check and flip back when they were enqueued on
-	 * ->batch_check0 in a previous invocation of srcu_advance_batches().
-	 * (Presumably try_check_zero() returned false during that
-	 * invocation, leaving the callbacks stranded on ->batch_check1.)
-	 * They are therefore ready to invoke, so move them to ->batch_done.
-	 */
-	rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-
-	if (rcu_batch_empty(&sp->batch_check0))
-		return; /* no callbacks need to be advanced */
-	srcu_flip(sp);
-
-	/*
-	 * The callbacks in ->batch_check0 just finished their
-	 * first check zero and flip, so move them to ->batch_check1
-	 * for future checking on the other idx.
-	 */
-	rcu_batch_move(&sp->batch_check1, &sp->batch_check0);
-
-	/*
-	 * SRCU read-side critical sections are normally short, so check
-	 * at least twice in quick succession after a flip.
-	 */
-	trycount = trycount < 2 ? 2 : trycount;
-	if (!try_check_zero(sp, idx^1, trycount))
-		return; /* failed to advance, will try after SRCU_INTERVAL */
-
-	/*
-	 * The callbacks in ->batch_check1 have now waited for all
-	 * pre-existing readers using both idx values.  They are therefore
-	 * ready to invoke, so move them to ->batch_done.
-	 */
-	rcu_batch_move(&sp->batch_done, &sp->batch_check1);
-}
-
-/*
- * Invoke a limited number of SRCU callbacks that have passed through
- * their grace period.  If there are more to do, SRCU will reschedule
- * the workqueue.  Note that needed memory barriers have been executed
- * in this task's context by srcu_readers_active_idx_check().
- */
-static void srcu_invoke_callbacks(struct srcu_struct *sp)
-{
-	int i;
-	struct rcu_head *head;
-
-	for (i = 0; i < SRCU_CALLBACK_BATCH; i++) {
-		head = rcu_batch_dequeue(&sp->batch_done);
-		if (!head)
-			break;
-		local_bh_disable();
-		head->func(head);
-		local_bh_enable();
-	}
-}
-
-/*
- * Finished one round of SRCU grace period.  Start another if there are
- * more SRCU callbacks queued, otherwise put SRCU into not-running state.
- */
-static void srcu_reschedule(struct srcu_struct *sp)
-{
-	bool pending = true;
-
-	if (rcu_batch_empty(&sp->batch_done) &&
-	    rcu_batch_empty(&sp->batch_check1) &&
-	    rcu_batch_empty(&sp->batch_check0) &&
-	    rcu_batch_empty(&sp->batch_queue)) {
-		spin_lock_irq(&sp->queue_lock);
-		if (rcu_batch_empty(&sp->batch_done) &&
-		    rcu_batch_empty(&sp->batch_check1) &&
-		    rcu_batch_empty(&sp->batch_check0) &&
-		    rcu_batch_empty(&sp->batch_queue)) {
-			sp->running = false;
-			pending = false;
-		}
-		spin_unlock_irq(&sp->queue_lock);
-	}
-
-	if (pending)
-		queue_delayed_work(system_power_efficient_wq,
-				   &sp->work, SRCU_INTERVAL);
-}
-
-/*
- * This is the work-queue function that handles SRCU grace periods.
- */
-void process_srcu(struct work_struct *work)
-{
-	struct srcu_struct *sp;
-
-	sp = container_of(work, struct srcu_struct, work.work);
-
-	srcu_collect_new(sp);
-	srcu_advance_batches(sp, 1);
-	srcu_invoke_callbacks(sp);
-	srcu_reschedule(sp);
-}
-EXPORT_SYMBOL_GPL(process_srcu);
-
-static int __init srcu_bootup_announce(void)
-{
-	pr_info("Classic SRCU implementation.\n");
-	return 0;
-}
-early_initcall(srcu_bootup_announce);
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
index 0c1da784b8cb..6a0b9f69faad 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
@@ -7,7 +7,6 @@ TREE06
 TREE07
 TREE08
 TREE09
-SRCU-C
 SRCU-N
 SRCU-P
 SRCU-t
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C
deleted file mode 100644
index d4e19c087c21..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C
+++ /dev/null
@@ -1,11 +0,0 @@
-CONFIG_RCU_TRACE=n
-CONFIG_SMP=y
-CONFIG_NR_CPUS=3
-CONFIG_HOTPLUG_CPU=y
-CONFIG_RCU_EXPERT=y
-CONFIG_CLASSIC_SRCU=y
-CONFIG_PREEMPT_NONE=n
-CONFIG_PREEMPT_VOLUNTARY=n
-CONFIG_PREEMPT=y
-CONFIG_DEBUG_LOCK_ALLOC=y
-CONFIG_PROVE_LOCKING=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC b/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC
deleted file mode 100644
index a1395af60ef4..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC
+++ /dev/null
@@ -1,16 +0,0 @@
-CONFIG_SMP=y
-CONFIG_HZ_PERIODIC=n
-CONFIG_NO_HZ_IDLE=y
-CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_HOTPLUG_CPU=n
-CONFIG_SUSPEND=n
-CONFIG_HIBERNATION=n
-CONFIG_RCU_NOCB_CPU=n
-CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_PROVE_LOCKING=n
-CONFIG_RCU_BOOST=n
-CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
-CONFIG_RCU_EXPERT=y
-CONFIG_RCU_TRACE=y
-CONFIG_CLASSIC_SRCU=y
-- 
cgit v1.3-14-g43fede


From ae91aa0adb14dc33114d566feca2f7cb7a96b8b7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 May 2017 15:30:32 -0700
Subject: rcu: Remove debugfs tracing

RCU's debugfs tracing used to be the only reasonable low-level debug
information available, but ftrace and event tracing have since surpassed
the RCU debugfs level of usefulness.  This commit therefore removes
RCU's debugfs tracing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/00-INDEX                         |   2 -
 .../RCU/Design/Requirements/Requirements.html      |   2 +-
 Documentation/RCU/trace.txt                        | 535 ---------------------
 init/Kconfig                                       |   8 -
 kernel/rcu/Makefile                                |   1 -
 kernel/rcu/tiny_plugin.h                           |  45 --
 kernel/rcu/tree.h                                  |  27 --
 kernel/rcu/tree_plugin.h                           |  31 +-
 kernel/rcu/tree_trace.c                            | 494 -------------------
 lib/Kconfig.debug                                  |   5 +-
 .../selftests/rcutorture/configs/rcu/TREE02-T      |  21 -
 .../selftests/rcutorture/configs/rcu/TREE08-T      |  21 -
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |   4 -
 13 files changed, 4 insertions(+), 1192 deletions(-)
 delete mode 100644 Documentation/RCU/trace.txt
 delete mode 100644 kernel/rcu/tree_trace.c
 delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
 delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE08-T

(limited to 'kernel')

diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 1672573b037a..f46980c060aa 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -28,8 +28,6 @@ stallwarn.txt
 	- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
-trace.txt
-	- CONFIG_RCU_TRACE debugfs files and formats
 UP.txt
 	- RCU on Uniprocessor Systems
 whatisRCU.txt
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 0e6550a8c926..95b30fa25d56 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2034,7 +2034,7 @@ guard against mishaps and misuse:
 	some other synchronization mechanism, for example, reference
 	counting.
 <li>	In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
-	information is provided via both debugfs and event tracing.
+	information is provided via event tracing.
 <li>	Open-coded use of <tt>rcu_assign_pointer()</tt> and
 	<tt>rcu_dereference()</tt> to create typical linked
 	data structures can be surprisingly error-prone.
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
deleted file mode 100644
index 6549012033f9..000000000000
--- a/Documentation/RCU/trace.txt
+++ /dev/null
@@ -1,535 +0,0 @@
-CONFIG_RCU_TRACE debugfs Files and Formats
-
-
-The rcutree and rcutiny implementations of RCU provide debugfs trace
-output that summarizes counters and state.  This information is useful for
-debugging RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats, first
-for rcutree and next for rcutiny.
-
-
-CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats
-
-These implementations of RCU provide several debugfs directories under the
-top-level directory "rcu":
-
-rcu/rcu_bh
-rcu/rcu_preempt
-rcu/rcu_sched
-
-Each directory contains files for the corresponding flavor of RCU.
-Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU.
-For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
-so that activity for both appears in rcu/rcu_sched.
-
-In addition, the following file appears in the top-level directory:
-rcu/rcutorture.  This file displays rcutorture test progress.  The output
-of "cat rcu/rcutorture" looks as follows:
-
-rcutorture test sequence: 0 (test in progress)
-rcutorture update version number: 615
-
-The first line shows the number of rcutorture tests that have completed
-since boot.  If a test is currently running, the "(test in progress)"
-string will appear as shown above.  The second line shows the number of
-update cycles that the current test has started, or zero if there is
-no test in progress.
-
-
-Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
-also rcu/rcu_preempt) the following files will be present:
-
-rcudata:
-	Displays fields in struct rcu_data.
-rcuexp:
-	Displays statistics for expedited grace periods.
-rcugp:
-	Displays grace-period counters.
-rcuhier:
-	Displays the struct rcu_node hierarchy.
-rcu_pending:
-	Displays counts of the reasons rcu_pending() decided that RCU had
-	work to do.
-rcuboost:
-	Displays RCU boosting statistics.  Only present if
-	CONFIG_RCU_BOOST=y.
-
-The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
-
-  0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
-  1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
-  2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
-  3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
-  4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
-  5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
-  6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
-  7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
-
-This file has one line per CPU, or eight for this 8-CPU system.
-The fields are as follows:
-
-o	The number at the beginning of each line is the CPU number.
-	CPUs numbers followed by an exclamation mark are offline,
-	but have been online at least once since boot.	There will be
-	no output for CPUs that have never been online, which can be
-	a good thing in the surprisingly common case where NR_CPUS is
-	substantially larger than the number of actual CPUs.
-
-o	"c" is the count of grace periods that this CPU believes have
-	completed.  Offlined CPUs and CPUs in dynticks idle mode may lag
-	quite a ways behind, for example, CPU 4 under "rcu_sched" above,
-	which has been offline through 16 RCU grace periods.  It is not
-	unusual to see offline CPUs lagging by thousands of grace periods.
-	Note that although the grace-period number is an unsigned long,
-	it is printed out as a signed long to allow more human-friendly
-	representation near boot time.
-
-o	"g" is the count of grace periods that this CPU believes have
-	started.  Again, offlined CPUs and CPUs in dynticks idle mode
-	may lag behind.  If the "c" and "g" values are equal, this CPU
-	has already reported a quiescent state for the last RCU grace
-	period that it is aware of, otherwise, the CPU believes that it
-	owes RCU a quiescent state.
-
-o	"pq" indicates that this CPU has passed through a quiescent state
-	for the current grace period.  It is possible for "pq" to be
-	"1" and "c" different than "g", which indicates that although
-	the CPU has passed through a quiescent state, either (1) this
-	CPU has not yet reported that fact, (2) some other CPU has not
-	yet reported for this grace period, or (3) both.
-
-o	"qp" indicates that RCU still expects a quiescent state from
-	this CPU.  Offlined CPUs and CPUs in dyntick idle mode might
-	well have qp=1, which is OK: RCU is still ignoring them.
-
-o	"dt" is the current value of the dyntick counter that is incremented
-	when entering or leaving idle, either due to a context switch or
-	due to an interrupt.  This number is even if the CPU is in idle
-	from RCU's viewpoint and odd otherwise.  The number after the
-	first "/" is the interrupt nesting depth when in idle state,
-	or a large number added to the interrupt-nesting depth when
-	running a non-idle task.  Some architectures do not accurately
-	count interrupt nesting when running in non-idle kernel context,
-	which can result in interesting anomalies such as negative
-	interrupt-nesting levels.  The number after the second "/"
-	is the NMI nesting depth.
-
-o	"df" is the number of times that some other CPU has forced a
-	quiescent state on behalf of this CPU due to this CPU being in
-	idle state.
-
-o	"of" is the number of times that some other CPU has forced a
-	quiescent state on behalf of this CPU due to this CPU being
-	offline.  In a perfect world, this might never happen, but it
-	turns out that offlining and onlining a CPU can take several grace
-	periods, and so there is likely to be an extended period of time
-	when RCU believes that the CPU is online when it really is not.
-	Please note that erring in the other direction (RCU believing a
-	CPU is offline when it is really alive and kicking) is a fatal
-	error, so it makes sense to err conservatively.
-
-o	"ql" is the number of RCU callbacks currently residing on
-	this CPU.  The first number is the number of "lazy" callbacks
-	that are known to RCU to only be freeing memory, and the number
-	after the "/" is the total number of callbacks, lazy or not.
-	These counters count callbacks regardless of what phase of
-	grace-period processing that they are in (new, waiting for
-	grace period to start, waiting for grace period to end, ready
-	to invoke).
-
-o	"qs" gives an indication of the state of the callback queue
-	with four characters:
-
-	"N"	Indicates that there are callbacks queued that are not
-		ready to be handled by the next grace period, and thus
-		will be handled by the grace period following the next
-		one.
-
-	"R"	Indicates that there are callbacks queued that are
-		ready to be handled by the next grace period.
-
-	"W"	Indicates that there are callbacks queued that are
-		waiting on the current grace period.
-
-	"D"	Indicates that there are callbacks queued that have
-		already been handled by a prior grace period, and are
-		thus waiting to be invoked.  Note that callbacks in
-		the process of being invoked are not counted here.
-		Callbacks in the process of being invoked are those
-		that have been removed from the rcu_data structures
-		queues by rcu_do_batch(), but which have not yet been
-		invoked.
-
-	If there are no callbacks in a given one of the above states,
-	the corresponding character is replaced by ".".
-
-o	"b" is the batch limit for this CPU.  If more than this number
-	of RCU callbacks is ready to invoke, then the remainder will
-	be deferred.
-
-o	"ci" is the number of RCU callbacks that have been invoked for
-	this CPU.  Note that ci+nci+ql is the number of callbacks that have
-	been registered in absence of CPU-hotplug activity.
-
-o	"nci" is the number of RCU callbacks that have been offloaded from
-	this CPU.  This will always be zero unless the kernel was built
-	with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
-	parameter was specified.
-
-o	"co" is the number of RCU callbacks that have been orphaned due to
-	this CPU going offline.  These orphaned callbacks have been moved
-	to an arbitrarily chosen online CPU.
-
-o	"ca" is the number of RCU callbacks that have been adopted by this
-	CPU due to other CPUs going offline.  Note that ci+co-ca+ql is
-	the number of RCU callbacks registered on this CPU.
-
-
-Kernels compiled with CONFIG_RCU_BOOST=y display the following from
-/debug/rcu/rcu_preempt/rcudata:
-
-  0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
-  1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
-  2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
-  3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
-  4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
-  5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
-  6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
-  7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
-
-This is similar to the output discussed above, but contains the following
-additional fields:
-
-o	"kt" is the per-CPU kernel-thread state.  The digit preceding
-	the first slash is zero if there is no work pending and 1
-	otherwise.  The character between the first pair of slashes is
-	as follows:
-
-	"S"	The kernel thread is stopped, in other words, all
-		CPUs corresponding to this rcu_node structure are
-		offline.
-
-	"R"	The kernel thread is running.
-
-	"W"	The kernel thread is waiting because there is no work
-		for it to do.
-
-	"O"	The kernel thread is waiting because it has been
-		forced off of its designated CPU or because its
-		->cpus_allowed mask permits it to run on other than
-		its designated CPU.
-
-	"Y"	The kernel thread is yielding to avoid hogging CPU.
-
-	"?"	Unknown value, indicates a bug.
-
-	The number after the final slash is the CPU that the kthread
-	is actually running on.
-
-	This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-o	"ktl" is the low-order 16 bits (in hexadecimal) of the count of
-	the number of times that this CPU's per-CPU kthread has gone
-	through its loop servicing invoke_rcu_cpu_kthread() requests.
-
-	This field is displayed only for CONFIG_RCU_BOOST kernels.
-
-
-The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
-
-s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872
-
-These fields are as follows:
-
-o	"s" is the sequence number, with an odd number indicating that
-	an expedited grace period is in progress.
-
-o	"wd1", "wd2", and "wd3" are the number of times that an attempt
-	to start an expedited grace period found that someone else had
-	completed an expedited grace period that satisfies the attempted
-	request.  "Our work is done."
-
-o	"enq" is the number of quiescent states still outstanding.
-
-o	"sc" is the number of times that the attempt to start a
-	new expedited grace period succeeded.
-
-
-The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
-
-completed=31249  gpnum=31250  age=1  max=18
-
-These fields are taken from the rcu_state structure, and are as follows:
-
-o	"completed" is the number of grace periods that have completed.
-	It is comparable to the "c" field from rcu/rcudata in that a
-	CPU whose "c" field matches the value of "completed" is aware
-	that the corresponding RCU grace period has completed.
-
-o	"gpnum" is the number of grace periods that have started.  It is
-	similarly comparable to the "g" field from rcu/rcudata in that
-	a CPU whose "g" field matches the value of "gpnum" is aware that
-	the corresponding RCU grace period has started.
-
-	If these two fields are equal, then there is no grace period
-	in progress, in other words, RCU is idle.  On the other hand,
-	if the two fields differ (as they are above), then an RCU grace
-	period is in progress.
-
-o	"age" is the number of jiffies that the current grace period
-	has extended for, or zero if there is no grace period currently
-	in effect.
-
-o	"max" is the age in jiffies of the longest-duration grace period
-	thus far.
-
-The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
-
-c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
-3/3 ..>. 0:7 ^0
-e/e ..>. 0:3 ^0    d/d ..>. 4:7 ^1
-
-The fields are as follows:
-
-o	"c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
-
-o	"g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
-
-o	"s" is the current state of the force_quiescent_state()
-	state machine.
-
-o	"jfq" is the number of jiffies remaining for this grace period
-	before force_quiescent_state() is invoked to help push things
-	along.	Note that CPUs in idle mode throughout the grace period
-	will not report on their own, but rather must be check by some
-	other CPU via force_quiescent_state().
-
-o	"j" is the low-order four hex digits of the jiffies counter.
-	Yes, Paul did run into a number of problems that turned out to
-	be due to the jiffies counter no longer counting.  Why do you ask?
-
-o	"nfqs" is the number of calls to force_quiescent_state() since
-	boot.
-
-o	"nfqsng" is the number of useless calls to force_quiescent_state(),
-	where there wasn't actually a grace period active.  This can
-	no longer happen due to grace-period processing being pushed
-	into a kthread.  The number in parentheses is the difference
-	between "nfqs" and "nfqsng", or the number of times that
-	force_quiescent_state() actually did some real work.
-
-o	"fqlh" is the number of calls to force_quiescent_state() that
-	exited immediately (without even being counted in nfqs above)
-	due to contention on ->fqslock.
-
-o	Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
-	structure.  Each line represents one level of the hierarchy,
-	from root to leaves.  It is best to think of the rcu_data
-	structures as forming yet another level after the leaves.
-	Note that there might be either one, two, three, or even four
-	levels of rcu_node structures, depending on the relationship
-	between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
-	adjusted using the rcu_fanout_leaf kernel boot parameter), and
-	CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
-	possible CPUs for the booting hardware).
-
-	o	The numbers separated by the "/" are the qsmask followed
-		by the qsmaskinit.  The qsmask will have one bit
-		set for each entity in the next lower level that has
-		not yet checked in for the current grace period ("e"
-		indicating CPUs 5, 6, and 7 in the example above).
-		The qsmaskinit will have one bit for each entity that is
-		currently expected to check in during each grace period.
-		The value of qsmaskinit is assigned to that of qsmask
-		at the beginning of each grace period.
-
-	o	The characters separated by the ">" indicate the state
-		of the blocked-tasks lists.  A "G" preceding the ">"
-		indicates that at least one task blocked in an RCU
-		read-side critical section blocks the current grace
-		period, while a "E" preceding the ">" indicates that
-		at least one task blocked in an RCU read-side critical
-		section blocks the current expedited grace period.
-		A "T" character following the ">" indicates that at
-		least one task is blocked within an RCU read-side
-		critical section, regardless of whether any current
-		grace period (expedited or normal) is inconvenienced.
-		A "." character appears if the corresponding condition
-		does not hold, so that "..>." indicates that no tasks
-		are blocked.  In contrast, "GE>T" indicates maximal
-		inconvenience from blocked tasks.  CONFIG_TREE_RCU
-		builds of the kernel will always show "..>.".
-
-	o	The numbers separated by the ":" are the range of CPUs
-		served by this struct rcu_node.  This can be helpful
-		in working out how the hierarchy is wired together.
-
-		For example, the example rcu_node structure shown above
-		has "0:7", indicating that it covers CPUs 0 through 7.
-
-	o	The number after the "^" indicates the bit in the
-		next higher level rcu_node structure that this rcu_node
-		structure corresponds to.  For example, the "d/d ..>. 4:7
-		^1" has a "1" in this position, indicating that it
-		corresponds to the "1" bit in the "3" shown in the
-		"3/3 ..>. 0:7 ^0" entry on the next level up.
-
-
-The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
-
-  0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
-  1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
-  2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
-  3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
-  4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
-  5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
-  6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
-  7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
-
-The fields are as follows:
-
-o	The leading number is the CPU number, with "!" indicating
-	an offline CPU.
-
-o	"np" is the number of times that __rcu_pending() has been invoked
-	for the corresponding flavor of RCU.
-
-o	"qsp" is the number of times that the RCU was waiting for a
-	quiescent state from this CPU.
-
-o	"rpq" is the number of times that the CPU had passed through
-	a quiescent state, but not yet reported it to RCU.
-
-o	"cbr" is the number of times that this CPU had RCU callbacks
-	that had passed through a grace period, and were thus ready
-	to be invoked.
-
-o	"cng" is the number of times that this CPU needed another
-	grace period while RCU was idle.
-
-o	"gpc" is the number of times that an old grace period had
-	completed, but this CPU was not yet aware of it.
-
-o	"gps" is the number of times that a new grace period had started,
-	but this CPU was not yet aware of it.
-
-o	"ndw" is the number of times that a wakeup of an rcuo
-	callback-offload kthread had to be deferred in order to avoid
-	deadlock.
-
-o	"nn" is the number of times that this CPU needed nothing.
-
-
-The output of "cat rcu/rcuboost" looks as follows:
-
-0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
-    balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
-4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
-    balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
-
-This information is output only for rcu_preempt.  Each two-line entry
-corresponds to a leaf rcu_node structure.  The fields are as follows:
-
-o	"n:m" is the CPU-number range for the corresponding two-line
-	entry.  In the sample output above, the first entry covers
-	CPUs zero through three and the second entry covers CPUs four
-	through seven.
-
-o	"tasks=TNEB" gives the state of the various segments of the
-	rnp->blocked_tasks list:
-
-	"T"	This indicates that there are some tasks that blocked
-		while running on one of the corresponding CPUs while
-		in an RCU read-side critical section.
-
-	"N"	This indicates that some of the blocked tasks are preventing
-		the current normal (non-expedited) grace period from
-		completing.
-
-	"E"	This indicates that some of the blocked tasks are preventing
-		the current expedited grace period from completing.
-
-	"B"	This indicates that some of the blocked tasks are in
-		need of RCU priority boosting.
-
-	Each character is replaced with "." if the corresponding
-	condition does not hold.
-
-o	"kt" is the state of the RCU priority-boosting kernel
-	thread associated with the corresponding rcu_node structure.
-	The state can be one of the following:
-
-	"S"	The kernel thread is stopped, in other words, all
-		CPUs corresponding to this rcu_node structure are
-		offline.
-
-	"R"	The kernel thread is running.
-
-	"W"	The kernel thread is waiting because there is no work
-		for it to do.
-
-	"Y"	The kernel thread is yielding to avoid hogging CPU.
-
-	"?"	Unknown value, indicates a bug.
-
-o	"ntb" is the number of tasks boosted.
-
-o	"neb" is the number of tasks boosted in order to complete an
-	expedited grace period.
-
-o	"nnb" is the number of tasks boosted in order to complete a
-	normal (non-expedited) grace period.  When boosting a task
-	that was blocking both an expedited and a normal grace period,
-	it is counted against the expedited total above.
-
-o	"j" is the low-order 16 bits of the jiffies counter in
-	hexadecimal.
-
-o	"bt" is the low-order 16 bits of the value that the jiffies
-	counter will have when we next start boosting, assuming that
-	the current grace period does not end beforehand.  This is
-	also in hexadecimal.
-
-o	"balk: nt" counts the number of times we didn't boost (in
-	other words, we balked) even though it was time to boost because
-	there were no blocked tasks to boost.  This situation occurs
-	when there is one blocked task on one rcu_node structure and
-	none on some other rcu_node structure.
-
-o	"egt" counts the number of times we balked because although
-	there were blocked tasks, none of them were blocking the
-	current grace period, whether expedited or otherwise.
-
-o	"bt" counts the number of times we balked because boosting
-	had already been initiated for the current grace period.
-
-o	"nb" counts the number of times we balked because there
-	was at least one task blocking the current non-expedited grace
-	period that never had blocked.  If it is already running, it
-	just won't help to boost its priority!
-
-o	"ny" counts the number of times we balked because it was
-	not yet time to start boosting.
-
-o	"nos" counts the number of times we balked for other
-	reasons, e.g., the grace period ended first.
-
-
-CONFIG_TINY_RCU debugfs Files and Formats
-
-These implementations of RCU provides a single debugfs file under the
-top-level directory RCU, namely rcu/rcudata, which displays fields in
-rcu_bh_ctrlblk and rcu_sched_ctrlblk.
-
-The output of "cat rcu/rcudata" is as follows:
-
-rcu_sched: qlen: 0
-rcu_bh: qlen: 0
-
-This is split into rcu_sched and rcu_bh sections.  The field is as
-follows:
-
-o	"qlen" is the number of RCU callbacks currently waiting either
-	for an RCU grace period or waiting to be invoked.  This is the
-	only field present for rcu_sched and rcu_bh, due to the
-	short-circuiting of grace period in those two cases.
diff --git a/init/Kconfig b/init/Kconfig
index 2aa14ff40e88..3025383ab443 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -659,14 +659,6 @@ config RCU_FAST_NO_HZ
 
 	  Say N if you are unsure.
 
-config TREE_RCU_TRACE
-	def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU )
-	select DEBUG_FS
-	help
-	  This option provides tracing for the TREE_RCU and
-	  PREEMPT_RCU implementations, permitting Makefile to
-	  trivially select kernel/rcutree_trace.c.
-
 config RCU_BOOST
 	bool "Enable RCU priority boosting"
 	depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 3945337c8ce4..13c0fc852767 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -9,6 +9,5 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_PREEMPT_RCU) += tree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 371034e77f87..c642f23f1582 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -24,8 +24,6 @@
 
 #include <linux/kthread.h>
 #include <linux/init.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -87,49 +85,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
 	local_irq_restore(flags);
 }
 
-/*
- * Dump statistics for TINY_RCU, such as they are.
- */
-static int show_tiny_stats(struct seq_file *m, void *unused)
-{
-	seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
-	seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
-	return 0;
-}
-
-static int show_tiny_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_tiny_stats, NULL);
-}
-
-static const struct file_operations show_tiny_stats_fops = {
-	.owner = THIS_MODULE,
-	.open = show_tiny_stats_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutiny_trace_init(void)
-{
-	struct dentry *retval;
-
-	rcudir = debugfs_create_dir("rcu", NULL);
-	if (!rcudir)
-		goto free_out;
-	retval = debugfs_create_file("rcudata", 0444, rcudir,
-				     NULL, &show_tiny_stats_fops);
-	if (!retval)
-		goto free_out;
-	return 0;
-free_out:
-	debugfs_remove_recursive(rcudir);
-	return 1;
-}
-device_initcall(rcutiny_trace_init);
-
 static void check_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	unsigned long j;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2c112bb11aa8..9af0f31d6847 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -152,19 +152,6 @@ struct rcu_node {
 				/* Number of tasks boosted for expedited GP. */
 	unsigned long n_normal_boosts;
 				/* Number of tasks boosted for normal GP. */
-	unsigned long n_balk_blkd_tasks;
-				/* Refused to boost: no blocked tasks. */
-	unsigned long n_balk_exp_gp_tasks;
-				/* Refused to boost: nothing blocking GP. */
-	unsigned long n_balk_boost_tasks;
-				/* Refused to boost: already boosting. */
-	unsigned long n_balk_notblocked;
-				/* Refused to boost: RCU RS CS still running. */
-	unsigned long n_balk_notyet;
-				/* Refused to boost: not yet time. */
-	unsigned long n_balk_nos;
-				/* Refused to boost: not sure why, though. */
-				/*  This can happen due to race conditions. */
 #ifdef CONFIG_RCU_NOCB_CPU
 	struct swait_queue_head nocb_gp_wq[2];
 				/* Place for rcu_nocb_kthread() to wait GP. */
@@ -535,17 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { }
 #endif /* #else #ifdef CONFIG_SRCU */
 
 #endif /* #ifndef RCU_TREE_NONCORE */
-
-#ifdef CONFIG_RCU_TRACE
-/* Read out queue lengths for tracing. */
-static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
-{
-#ifdef CONFIG_RCU_NOCB_CPU
-	*ql = atomic_long_read(&rdp->nocb_q_count);
-	*qll = atomic_long_read(&rdp->nocb_q_count_lazy);
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-	*ql = 0;
-	*qll = 0;
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-}
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 7f5919ab24c4..43f2f8026b4a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */
 static void __init rcu_bootup_announce_oddness(void)
 {
 	if (IS_ENABLED(CONFIG_RCU_TRACE))
-		pr_info("\tRCU debugfs-based tracing is enabled.\n");
+		pr_info("\tRCU event tracing is enabled.\n");
 	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
 	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
 		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
@@ -899,33 +899,6 @@ void exit_rcu(void)
 
 #include "../locking/rtmutex_common.h"
 
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-	if (!rcu_preempt_has_tasks(rnp))
-		rnp->n_balk_blkd_tasks++;
-	else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
-		rnp->n_balk_exp_gp_tasks++;
-	else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
-		rnp->n_balk_boost_tasks++;
-	else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
-		rnp->n_balk_notblocked++;
-	else if (rnp->gp_tasks != NULL &&
-		 ULONG_CMP_LT(jiffies, rnp->boost_time))
-		rnp->n_balk_notyet++;
-	else
-		rnp->n_balk_nos++;
-}
-
-#else /* #ifdef CONFIG_RCU_TRACE */
-
-static void rcu_initiate_boost_trace(struct rcu_node *rnp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
 static void rcu_wake_cond(struct task_struct *t, int status)
 {
 	/*
@@ -1058,7 +1031,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 
 	lockdep_assert_held(&rnp->lock);
 	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
-		rnp->n_balk_exp_gp_tasks++;
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 		return;
 	}
@@ -1074,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
 		if (t)
 			rcu_wake_cond(t, rnp->boost_kthread_status);
 	} else {
-		rcu_initiate_boost_trace(rnp);
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	}
 }
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
deleted file mode 100644
index 6cea17a1ea30..000000000000
--- a/kernel/rcu/tree_trace.c
+++ /dev/null
@@ -1,494 +0,0 @@
-/*
- * Read-Copy Update tracing for hierarchical implementation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright IBM Corporation, 2008
- * Author: Paul E. McKenney
- *
- * Papers:  http://www.rdrop.com/users/paulmck/RCU
- *
- * For detailed explanation of Read-Copy Update mechanism see -
- *		Documentation/RCU
- *
- */
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/rcupdate.h>
-#include <linux/interrupt.h>
-#include <linux/sched.h>
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/completion.h>
-#include <linux/percpu.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/prefetch.h>
-
-#define RCU_TREE_NONCORE
-#include "tree.h"
-#include "rcu.h"
-
-static int r_open(struct inode *inode, struct file *file,
-					const struct seq_operations *op)
-{
-	int ret = seq_open(file, op);
-	if (!ret) {
-		struct seq_file *m = (struct seq_file *)file->private_data;
-		m->private = inode->i_private;
-	}
-	return ret;
-}
-
-static void *r_start(struct seq_file *m, loff_t *pos)
-{
-	struct rcu_state *rsp = (struct rcu_state *)m->private;
-	*pos = cpumask_next(*pos - 1, cpu_possible_mask);
-	if ((*pos) < nr_cpu_ids)
-		return per_cpu_ptr(rsp->rda, *pos);
-	return NULL;
-}
-
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	(*pos)++;
-	return r_start(m, pos);
-}
-
-static void r_stop(struct seq_file *m, void *v)
-{
-}
-
-static int show_rcubarrier(struct seq_file *m, void *v)
-{
-	struct rcu_state *rsp = (struct rcu_state *)m->private;
-	seq_printf(m, "bcc: %d bseq: %lu\n",
-		   atomic_read(&rsp->barrier_cpu_count),
-		   rsp->barrier_sequence);
-	return 0;
-}
-
-static int rcubarrier_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcubarrier, inode->i_private);
-}
-
-static const struct file_operations rcubarrier_fops = {
-	.owner = THIS_MODULE,
-	.open = rcubarrier_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static char convert_kthread_status(unsigned int kthread_status)
-{
-	if (kthread_status > RCU_KTHREAD_MAX)
-		return '?';
-	return "SRWOY"[kthread_status];
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
-{
-	long ql, qll;
-
-	if (!rdp->beenonline)
-		return;
-	seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
-		   rdp->cpu,
-		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
-		   ulong2long(rdp->completed), ulong2long(rdp->gpnum),
-		   rdp->cpu_no_qs.b.norm,
-		   rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
-		   rdp->core_needs_qs);
-	seq_printf(m, " dt=%d/%llx/%d df=%lu",
-		   rcu_dynticks_snap(rdp->dynticks),
-		   rdp->dynticks->dynticks_nesting,
-		   rdp->dynticks->dynticks_nmi_nesting,
-		   rdp->dynticks_fqs);
-	seq_printf(m, " of=%lu", rdp->offline_fqs);
-	rcu_nocb_q_lengths(rdp, &ql, &qll);
-	qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
-	ql += rcu_segcblist_n_cbs(&rdp->cblist);
-	seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
-		   qll, ql,
-		   ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
-		   ".R"[!rcu_segcblist_segempty(&rdp->cblist,
-						RCU_NEXT_READY_TAIL)],
-		   ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
-		   ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
-#ifdef CONFIG_RCU_BOOST
-	seq_printf(m, " kt=%d/%c ktl=%x",
-		   per_cpu(rcu_cpu_has_work, rdp->cpu),
-		   convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
-					  rdp->cpu)),
-		   per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-	seq_printf(m, " b=%ld", rdp->blimit);
-	seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
-		   rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
-		   rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
-}
-
-static int show_rcudata(struct seq_file *m, void *v)
-{
-	print_one_rcu_data(m, (struct rcu_data *)v);
-	return 0;
-}
-
-static const struct seq_operations rcudate_op = {
-	.start = r_start,
-	.next  = r_next,
-	.stop  = r_stop,
-	.show  = show_rcudata,
-};
-
-static int rcudata_open(struct inode *inode, struct file *file)
-{
-	return r_open(inode, file, &rcudate_op);
-}
-
-static const struct file_operations rcudata_fops = {
-	.owner = THIS_MODULE,
-	.open = rcudata_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = seq_release,
-};
-
-static int show_rcuexp(struct seq_file *m, void *v)
-{
-	int cpu;
-	struct rcu_state *rsp = (struct rcu_state *)m->private;
-	struct rcu_data *rdp;
-	unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
-
-	for_each_possible_cpu(cpu) {
-		rdp = per_cpu_ptr(rsp->rda, cpu);
-		s0 += atomic_long_read(&rdp->exp_workdone0);
-		s1 += atomic_long_read(&rdp->exp_workdone1);
-		s2 += atomic_long_read(&rdp->exp_workdone2);
-		s3 += atomic_long_read(&rdp->exp_workdone3);
-	}
-	seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n",
-		   rsp->expedited_sequence, s0, s1, s2, s3,
-		   atomic_read(&rsp->expedited_need_qs),
-		   rsp->expedited_sequence / 2);
-	return 0;
-}
-
-static int rcuexp_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcuexp, inode->i_private);
-}
-
-static const struct file_operations rcuexp_fops = {
-	.owner = THIS_MODULE,
-	.open = rcuexp_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = single_release,
-};
-
-#ifdef CONFIG_RCU_BOOST
-
-static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
-{
-	seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
-		   rnp->grplo, rnp->grphi,
-		   "T."[list_empty(&rnp->blkd_tasks)],
-		   "N."[!rnp->gp_tasks],
-		   "E."[!rnp->exp_tasks],
-		   "B."[!rnp->boost_tasks],
-		   convert_kthread_status(rnp->boost_kthread_status),
-		   rnp->n_tasks_boosted, rnp->n_exp_boosts,
-		   rnp->n_normal_boosts);
-	seq_printf(m, "j=%04x bt=%04x\n",
-		   (int)(jiffies & 0xffff),
-		   (int)(rnp->boost_time & 0xffff));
-	seq_printf(m, "    balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
-		   rnp->n_balk_blkd_tasks,
-		   rnp->n_balk_exp_gp_tasks,
-		   rnp->n_balk_boost_tasks,
-		   rnp->n_balk_notblocked,
-		   rnp->n_balk_notyet,
-		   rnp->n_balk_nos);
-}
-
-static int show_rcu_node_boost(struct seq_file *m, void *unused)
-{
-	struct rcu_node *rnp;
-
-	rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
-		print_one_rcu_node_boost(m, rnp);
-	return 0;
-}
-
-static int rcu_node_boost_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcu_node_boost, NULL);
-}
-
-static const struct file_operations rcu_node_boost_fops = {
-	.owner = THIS_MODULE,
-	.open = rcu_node_boost_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = single_release,
-};
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
-{
-	unsigned long gpnum;
-	int level = 0;
-	struct rcu_node *rnp;
-
-	gpnum = rsp->gpnum;
-	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
-		   ulong2long(rsp->completed), ulong2long(gpnum),
-		   rsp->gp_state,
-		   (long)(rsp->jiffies_force_qs - jiffies),
-		   (int)(jiffies & 0xffff));
-	seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
-		   rsp->n_force_qs, rsp->n_force_qs_ngp,
-		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   READ_ONCE(rsp->n_force_qs_lh),
-		   rsp->orphan_done.len_lazy,
-		   rsp->orphan_done.len);
-	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
-		if (rnp->level != level) {
-			seq_puts(m, "\n");
-			level = rnp->level;
-		}
-		seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d    ",
-			   rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
-			   ".G"[rnp->gp_tasks != NULL],
-			   ".E"[rnp->exp_tasks != NULL],
-			   ".T"[!list_empty(&rnp->blkd_tasks)],
-			   rnp->grplo, rnp->grphi, rnp->grpnum);
-	}
-	seq_puts(m, "\n");
-}
-
-static int show_rcuhier(struct seq_file *m, void *v)
-{
-	struct rcu_state *rsp = (struct rcu_state *)m->private;
-	print_one_rcu_state(m, rsp);
-	return 0;
-}
-
-static int rcuhier_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcuhier, inode->i_private);
-}
-
-static const struct file_operations rcuhier_fops = {
-	.owner = THIS_MODULE,
-	.open = rcuhier_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = single_release,
-};
-
-static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
-{
-	unsigned long flags;
-	unsigned long completed;
-	unsigned long gpnum;
-	unsigned long gpage;
-	unsigned long gpmax;
-	struct rcu_node *rnp = &rsp->node[0];
-
-	raw_spin_lock_irqsave_rcu_node(rnp, flags);
-	completed = READ_ONCE(rsp->completed);
-	gpnum = READ_ONCE(rsp->gpnum);
-	if (completed == gpnum)
-		gpage = 0;
-	else
-		gpage = jiffies - rsp->gp_start;
-	gpmax = rsp->gp_max;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	seq_printf(m, "completed=%ld  gpnum=%ld  age=%ld  max=%ld\n",
-		   ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
-}
-
-static int show_rcugp(struct seq_file *m, void *v)
-{
-	struct rcu_state *rsp = (struct rcu_state *)m->private;
-	show_one_rcugp(m, rsp);
-	return 0;
-}
-
-static int rcugp_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcugp, inode->i_private);
-}
-
-static const struct file_operations rcugp_fops = {
-	.owner = THIS_MODULE,
-	.open = rcugp_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = single_release,
-};
-
-static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
-{
-	if (!rdp->beenonline)
-		return;
-	seq_printf(m, "%3d%cnp=%ld ",
-		   rdp->cpu,
-		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
-		   rdp->n_rcu_pending);
-	seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
-		   rdp->n_rp_core_needs_qs,
-		   rdp->n_rp_report_qs,
-		   rdp->n_rp_cb_ready,
-		   rdp->n_rp_cpu_needs_gp);
-	seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
-		   rdp->n_rp_gp_completed,
-		   rdp->n_rp_gp_started,
-		   rdp->n_rp_nocb_defer_wakeup,
-		   rdp->n_rp_need_nothing);
-}
-
-static int show_rcu_pending(struct seq_file *m, void *v)
-{
-	print_one_rcu_pending(m, (struct rcu_data *)v);
-	return 0;
-}
-
-static const struct seq_operations rcu_pending_op = {
-	.start = r_start,
-	.next  = r_next,
-	.stop  = r_stop,
-	.show  = show_rcu_pending,
-};
-
-static int rcu_pending_open(struct inode *inode, struct file *file)
-{
-	return r_open(inode, file, &rcu_pending_op);
-}
-
-static const struct file_operations rcu_pending_fops = {
-	.owner = THIS_MODULE,
-	.open = rcu_pending_open,
-	.read = seq_read,
-	.llseek = no_llseek,
-	.release = seq_release,
-};
-
-static int show_rcutorture(struct seq_file *m, void *unused)
-{
-	seq_printf(m, "rcutorture test sequence: %lu %s\n",
-		   rcutorture_testseq >> 1,
-		   (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
-	seq_printf(m, "rcutorture update version number: %lu\n",
-		   rcutorture_vernum);
-	return 0;
-}
-
-static int rcutorture_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_rcutorture, NULL);
-}
-
-static const struct file_operations rcutorture_fops = {
-	.owner = THIS_MODULE,
-	.open = rcutorture_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
-
-static struct dentry *rcudir;
-
-static int __init rcutree_trace_init(void)
-{
-	struct rcu_state *rsp;
-	struct dentry *retval;
-	struct dentry *rspdir;
-
-	rcudir = debugfs_create_dir("rcu", NULL);
-	if (!rcudir)
-		goto free_out;
-
-	for_each_rcu_flavor(rsp) {
-		rspdir = debugfs_create_dir(rsp->name, rcudir);
-		if (!rspdir)
-			goto free_out;
-
-		retval = debugfs_create_file("rcudata", 0444,
-				rspdir, rsp, &rcudata_fops);
-		if (!retval)
-			goto free_out;
-
-		retval = debugfs_create_file("rcuexp", 0444,
-				rspdir, rsp, &rcuexp_fops);
-		if (!retval)
-			goto free_out;
-
-		retval = debugfs_create_file("rcu_pending", 0444,
-				rspdir, rsp, &rcu_pending_fops);
-		if (!retval)
-			goto free_out;
-
-		retval = debugfs_create_file("rcubarrier", 0444,
-				rspdir, rsp, &rcubarrier_fops);
-		if (!retval)
-			goto free_out;
-
-#ifdef CONFIG_RCU_BOOST
-		if (rsp == &rcu_preempt_state) {
-			retval = debugfs_create_file("rcuboost", 0444,
-				rspdir, NULL, &rcu_node_boost_fops);
-			if (!retval)
-				goto free_out;
-		}
-#endif
-
-		retval = debugfs_create_file("rcugp", 0444,
-				rspdir, rsp, &rcugp_fops);
-		if (!retval)
-			goto free_out;
-
-		retval = debugfs_create_file("rcuhier", 0444,
-				rspdir, rsp, &rcuhier_fops);
-		if (!retval)
-			goto free_out;
-	}
-
-	retval = debugfs_create_file("rcutorture", 0444, rcudir,
-						NULL, &rcutorture_fops);
-	if (!retval)
-		goto free_out;
-	return 0;
-free_out:
-	debugfs_remove_recursive(rcudir);
-	return 1;
-}
-device_initcall(rcutree_trace_init);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 498d5dd63bf4..8c10b5a97b9e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1361,9 +1361,8 @@ config RCU_TRACE
 	default y if TREE_RCU
 	select TRACE_CLOCK
 	help
-	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.  It also enables
-	  additional tracepoints for ftrace-style event tracing.
+	  This option enables additional tracepoints for ftrace-style
+	  event tracing.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
deleted file mode 100644
index 917d2517b5b5..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T
+++ /dev/null
@@ -1,21 +0,0 @@
-CONFIG_SMP=y
-CONFIG_NR_CPUS=8
-CONFIG_PREEMPT_NONE=n
-CONFIG_PREEMPT_VOLUNTARY=n
-CONFIG_PREEMPT=y
-#CHECK#CONFIG_PREEMPT_RCU=y
-CONFIG_HZ_PERIODIC=n
-CONFIG_NO_HZ_IDLE=y
-CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=y
-CONFIG_HOTPLUG_CPU=n
-CONFIG_SUSPEND=n
-CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_LEAF=3
-CONFIG_RCU_NOCB_CPU=n
-CONFIG_DEBUG_LOCK_ALLOC=y
-CONFIG_PROVE_LOCKING=n
-CONFIG_RCU_BOOST=n
-CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
deleted file mode 100644
index 2ad13f0d29cc..000000000000
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T
+++ /dev/null
@@ -1,21 +0,0 @@
-CONFIG_SMP=y
-CONFIG_NR_CPUS=16
-CONFIG_PREEMPT_NONE=n
-CONFIG_PREEMPT_VOLUNTARY=n
-CONFIG_PREEMPT=y
-#CHECK#CONFIG_PREEMPT_RCU=y
-CONFIG_HZ_PERIODIC=n
-CONFIG_NO_HZ_IDLE=y
-CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_FAST_NO_HZ=n
-CONFIG_RCU_TRACE=y
-CONFIG_HOTPLUG_CPU=n
-CONFIG_SUSPEND=n
-CONFIG_HIBERNATION=n
-CONFIG_RCU_FANOUT=3
-CONFIG_RCU_FANOUT_LEAF=2
-CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ALL=y
-CONFIG_DEBUG_LOCK_ALLOC=n
-CONFIG_RCU_BOOST=n
-CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index c5c29fb7438c..928fadaecc25 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -79,9 +79,5 @@ CONFIG_TASKS_RCU
 
 	Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable.
 
-CONFIG_RCU_TRACE
-
-	Implied by CONFIG_RCU_TRACE for Tree RCU.
-
 
 boot parameters ignored: TBD
-- 
cgit v1.3-14-g43fede


From 44c65ff2e3b0b48250a970183ab53b0602c25764 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 May 2017 16:26:34 -0700
Subject: rcu: Eliminate NOCBs CPU-state Kconfig options

The CONFIG_RCU_NOCB_CPU_ALL, CONFIG_RCU_NOCB_CPU_NONE, and
CONFIG_RCU_NOCB_CPU_ZERO Kconfig options are used only in testing and
are redundant with the rcu_nocbs= boot parameter.  This commit therefore
removes these three Kconfig options and adjusts the rcutorture scripts
to use the boot parameter instead.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-per-CPU-kthreads.txt          | 31 ++++++-------
 Documentation/timers/NO_HZ.txt                     | 29 ++----------
 init/Kconfig                                       | 53 ----------------------
 kernel/rcu/rcu.h                                   |  4 +-
 kernel/rcu/tree_plugin.h                           | 27 ++---------
 .../selftests/rcutorture/configs/rcu/TREE01        |  1 -
 .../selftests/rcutorture/configs/rcu/TREE01.boot   |  1 +
 .../selftests/rcutorture/configs/rcu/TREE05        |  1 -
 .../selftests/rcutorture/configs/rcu/TREE08        |  1 -
 .../selftests/rcutorture/configs/rcu/TREE08.boot   |  1 +
 .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt  |  8 ++--
 11 files changed, 26 insertions(+), 131 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index df31e30b6a02..2cb7dc5c0e0d 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -109,13 +109,12 @@ SCHED_SOFTIRQ: Do all of the following:
 	on that CPU.  If a thread that expects to run on the de-jittered
 	CPU awakens, the scheduler will send an IPI that can result in
 	a subsequent SCHED_SOFTIRQ.
-2.	Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
-	CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
-	to be de-jittered is marked as an adaptive-ticks CPU using the
-	"nohz_full=" boot parameter.  This reduces the number of
-	scheduler-clock interrupts that the de-jittered CPU receives,
-	minimizing its chances of being selected to do the load balancing
-	work that runs in SCHED_SOFTIRQ context.
+2.	Build with CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+	de-jittered is marked as an adaptive-ticks CPU using the
+	"nohz_full=" boot parameter.  This reduces the number of
+	scheduler-clock interrupts that the de-jittered CPU receives,
+	minimizing its chances of being selected to do the load
+	balancing work that runs in SCHED_SOFTIRQ context.
 3.	To the extent possible, keep the CPU out of the kernel when it
 	is non-idle, for example, by avoiding system calls and by
 	forcing both kernel threads and interrupts to execute elsewhere.
@@ -135,11 +134,10 @@ HRTIMER_SOFTIRQ:  Do all of the following:
 RCU_SOFTIRQ:  Do at least one of the following:
 1.	Offload callbacks and keep the CPU in either dyntick-idle or
 	adaptive-ticks state by doing all of the following:
-	a.	Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
-		CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU
-		to be de-jittered is marked as an adaptive-ticks CPU using
-		the "nohz_full=" boot parameter.  Bind the rcuo kthreads
-		to housekeeping CPUs, which can tolerate OS jitter.
+	a.	Build with CONFIG_NO_HZ_FULL=y and ensure that the CPU
+		to be de-jittered is marked as an adaptive-ticks CPU using
+		the "nohz_full=" boot parameter.  Bind the rcuo kthreads
+		to housekeeping CPUs, which can tolerate OS jitter.
 	b.	To the extent possible, keep the CPU out of the kernel
 		when it is non-idle, for example, by avoiding system
 		calls and by forcing both kernel threads and interrupts
@@ -236,11 +234,10 @@ To reduce its OS jitter, do at least one of the following:
 	is feasible only if your workload never requires RCU priority
 	boosting, for example, if you ensure frequent idle time on all
 	CPUs that might execute within the kernel.
-3.	Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
-	which offloads all RCU callbacks to kthreads that can be moved
-	off of CPUs susceptible to OS jitter.  This approach prevents the
-	rcuc/%u kthreads from having any work to do, so that they are
-	never awakened.
+3.	Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+	boot parameter offloading RCU callbacks from all CPUs susceptible
+	to OS jitter.  This approach prevents the rcuc/%u kthreads from
+	having any work to do, so that they are never awakened.
 4.	Ensure that the CPU never enters the kernel, and, in particular,
 	avoid initiating any CPU hotplug operations on this CPU.  This is
 	another way of preventing any callbacks from being queued on the
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index 6eaf576294f3..2dcaf9adb7a7 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -194,32 +194,9 @@ that the RCU callbacks are processed in a timely fashion.
 
 Another approach is to offload RCU callback processing to "rcuo" kthreads
 using the CONFIG_RCU_NOCB_CPU=y Kconfig option.  The specific CPUs to
-offload may be selected via several methods:
-
-1.	One of three mutually exclusive Kconfig options specify a
-	build-time default for the CPUs to offload:
-
-	a.	The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in
-		no CPUs being offloaded.
-
-	b.	The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes
-		CPU 0 to be offloaded.
-
-	c.	The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all
-		CPUs to be offloaded.  Note that the callbacks will be
-		offloaded to "rcuo" kthreads, and that those kthreads
-		will in fact run on some CPU.  However, this approach
-		gives fine-grained control on exactly which CPUs the
-		callbacks run on, along with their scheduling priority
-		(including the default of SCHED_OTHER), and it further
-		allows this control to be varied dynamically at runtime.
-
-2.	The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated
-	list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1,
-	3, 4, and 5.  The specified CPUs will be offloaded in addition to
-	any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or
-	CONFIG_RCU_NOCB_CPU_ALL=y.  This means that the "rcu_nocbs=" boot
-	parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y.
+offload may be selected using the "rcu_nocbs=" kernel boot parameter,
+which takes a comma-separated list of CPUs and CPU ranges, for example,
+"1,3-5" selects CPUs 1, 3, 4, and 5.
 
 The offloaded CPUs will never queue RCU callbacks, and therefore RCU
 never prevents offloaded CPUs from entering either dyntick-idle mode
diff --git a/init/Kconfig b/init/Kconfig
index 3025383ab443..dc431c6109f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -709,59 +709,6 @@ config RCU_NOCB_CPU
 	  Say Y here if you want to help to debug reduced OS jitter.
 	  Say N here if you are unsure.
 
-choice
-	prompt "Build-forced no-CBs CPUs"
-	default RCU_NOCB_CPU_NONE
-	depends on RCU_NOCB_CPU
-	help
-	  This option allows no-CBs CPUs (whose RCU callbacks are invoked
-	  from kthreads rather than from softirq context) to be specified
-	  at build time.  Additional no-CBs CPUs may be specified by
-	  the rcu_nocbs= boot parameter.
-
-config RCU_NOCB_CPU_NONE
-	bool "No build_forced no-CBs CPUs"
-	help
-	  This option does not force any of the CPUs to be no-CBs CPUs.
-	  Only CPUs designated by the rcu_nocbs= boot parameter will be
-	  no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU
-	  kthreads whose names begin with "rcuo".  All other CPUs will
-	  invoke their own RCU callbacks in softirq context.
-
-	  Select this option if you want to choose no-CBs CPUs at
-	  boot time, for example, to allow testing of different no-CBs
-	  configurations without having to rebuild the kernel each time.
-
-config RCU_NOCB_CPU_ZERO
-	bool "CPU 0 is a build_forced no-CBs CPU"
-	help
-	  This option forces CPU 0 to be a no-CBs CPU, so that its RCU
-	  callbacks are invoked by a per-CPU kthread whose name begins
-	  with "rcuo".	Additional CPUs may be designated as no-CBs
-	  CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs.
-	  All other CPUs will invoke their own RCU callbacks in softirq
-	  context.
-
-	  Select this if CPU 0 needs to be a no-CBs CPU for real-time
-	  or energy-efficiency reasons, but the real reason it exists
-	  is to ensure that randconfig testing covers mixed systems.
-
-config RCU_NOCB_CPU_ALL
-	bool "All CPUs are build_forced no-CBs CPUs"
-	help
-	  This option forces all CPUs to be no-CBs CPUs.  The rcu_nocbs=
-	  boot parameter will be ignored.  All CPUs' RCU callbacks will
-	  be executed in the context of per-CPU rcuo kthreads created for
-	  this purpose.  Assuming that the kthreads whose names start with
-	  "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter
-	  on the remaining CPUs, but might decrease memory locality during
-	  RCU-callback invocation, thus potentially degrading throughput.
-
-	  Select this if all CPUs need to be no-CBs CPUs for real-time
-	  or energy-efficiency reasons.
-
-endchoice
-
 endmenu # "RCU Subsystem"
 
 config BUILD_BIN2C
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index d06c42deee0b..808b8c85f626 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -564,9 +564,7 @@ void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
-#if defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline bool rcu_is_nocb_cpu(int cpu) { return true; }
-#elif defined(CONFIG_RCU_NOCB_CPU)
+#ifdef CONFIG_RCU_NOCB_CPU
 bool rcu_is_nocb_cpu(int cpu);
 #else
 static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 43f2f8026b4a..908b309d60d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1296,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu)
 int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
 	*nextevt = KTIME_MAX;
-	return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)
-	       ? 0 : rcu_cpu_has_callbacks(NULL);
+	return rcu_cpu_has_callbacks(NULL);
 }
 
 /*
@@ -1409,10 +1408,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 	unsigned long dj;
 
 	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
-	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) {
-		*nextevt = KTIME_MAX;
-		return 0;
-	}
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1462,8 +1457,7 @@ static void rcu_prepare_for_idle(void)
 	int tne;
 
 	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
-	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
-	    rcu_is_nocb_cpu(smp_processor_id()))
+	if (rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 
 	/* Handle nohz enablement switches conservatively. */
@@ -1518,8 +1512,7 @@ static void rcu_prepare_for_idle(void)
 static void rcu_cleanup_after_idle(void)
 {
 	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
-	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
-	    rcu_is_nocb_cpu(smp_processor_id()))
+	if (rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 	if (rcu_try_advance_all_cbs())
 		invoke_rcu_core();
@@ -1786,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
-#ifndef CONFIG_RCU_NOCB_CPU_ALL
 /* Is the specified CPU a no-CBs CPU? */
 bool rcu_is_nocb_cpu(int cpu)
 {
@@ -1794,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu)
 		return cpumask_test_cpu(cpu, rcu_nocb_mask);
 	return false;
 }
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 
 /*
  * Kick the leader kthread for this NOCB group.
@@ -2253,10 +2244,6 @@ void __init rcu_init_nohz(void)
 	bool need_rcu_nocb_mask = true;
 	struct rcu_state *rsp;
 
-#ifdef CONFIG_RCU_NOCB_CPU_NONE
-	need_rcu_nocb_mask = false;
-#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
-
 #if defined(CONFIG_NO_HZ_FULL)
 	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
 		need_rcu_nocb_mask = true;
@@ -2272,14 +2259,6 @@ void __init rcu_init_nohz(void)
 	if (!have_rcu_nocb_mask)
 		return;
 
-#ifdef CONFIG_RCU_NOCB_CPU_ZERO
-	pr_info("\tOffload RCU callbacks from CPU 0\n");
-	cpumask_set_cpu(0, rcu_nocb_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
-#ifdef CONFIG_RCU_NOCB_CPU_ALL
-	pr_info("\tOffload RCU callbacks from all CPUs\n");
-	cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
 #if defined(CONFIG_NO_HZ_FULL)
 	if (tick_nohz_full_running)
 		cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
index 92ca49f90ef9..b5b53973c01e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -12,7 +12,6 @@ CONFIG_HOTPLUG_CPU=y
 CONFIG_MAXSMP=y
 CONFIG_CPUMASK_OFFSTACK=y
 CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ZERO=y
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
index 89705ed79596..1d14e1383016 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -2,3 +2,4 @@ rcutorture.torture_type=rcu_bh maxcpus=8
 rcutree.gp_preinit_delay=3
 rcutree.gp_init_delay=3
 rcutree.gp_cleanup_delay=3
+rcu_nocbs=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
index 1257d3227b1e..2dde0d9964e3 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -13,7 +13,6 @@ CONFIG_HOTPLUG_CPU=y
 CONFIG_RCU_FANOUT=6
 CONFIG_RCU_FANOUT_LEAF=6
 CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_NONE=y
 CONFIG_DEBUG_LOCK_ALLOC=y
 CONFIG_PROVE_LOCKING=y
 #CHECK#CONFIG_PROVE_RCU=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
index 099cc63c6a3b..fb1c763c10c5 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -15,7 +15,6 @@ CONFIG_HIBERNATION=n
 CONFIG_RCU_FANOUT=3
 CONFIG_RCU_FANOUT_LEAF=2
 CONFIG_RCU_NOCB_CPU=y
-CONFIG_RCU_NOCB_CPU_ALL=y
 CONFIG_DEBUG_LOCK_ALLOC=n
 CONFIG_PROVE_LOCKING=n
 CONFIG_RCU_BOOST=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
index fb066dc82769..1bd8efc4141e 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
@@ -2,3 +2,4 @@ rcutorture.torture_type=sched
 rcupdate.rcu_self_test=1
 rcupdate.rcu_self_test_sched=1
 rcutree.rcu_fanout_exact=1
+rcu_nocbs=0-7
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
index 928fadaecc25..9ad3f89c8dc7 100644
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -16,11 +16,9 @@ CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
 CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
 CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
 CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
-CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL.
-CONFIG_RCU_NOCB_CPU -- Do three, see below.
-CONFIG_RCU_NOCB_CPU_ALL -- Do one.
-CONFIG_RCU_NOCB_CPU_NONE -- Do one.
-CONFIG_RCU_NOCB_CPU_ZERO -- Do one.
+CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs.
+CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with
+	rcu_nocbs=0, and one with all rcu_nocbs CPUs.
 CONFIG_RCU_TRACE -- Do half.
 CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
 CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations.
-- 
cgit v1.3-14-g43fede


From 0af92d46098a092aa5817dfeb6d24a8d85b66205 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 May 2017 08:43:40 -0700
Subject: rcu: Move RCU non-debug Kconfig options to kernel/rcu

RCU's Kconfig options are scattered, and there are enough of them
that it would be good for them to be more centralized.  This commit
therefore extracts RCU's Kconfig options from init/Kconfig into a new
kernel/rcu/Kconfig file.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 init/Kconfig       | 239 +---------------------------------------------------
 kernel/rcu/Kconfig | 242 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 243 insertions(+), 238 deletions(-)
 create mode 100644 kernel/rcu/Kconfig

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index dc431c6109f2..bc4c180c66a5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,244 +472,7 @@ config TASK_IO_ACCOUNTING
 
 endmenu # "CPU/Task time and stats accounting"
 
-menu "RCU Subsystem"
-
-config TREE_RCU
-	bool
-	default y if !PREEMPT && SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP system with hundreds or
-	  thousands of CPUs.  It also scales down nicely to
-	  smaller systems.
-
-config PREEMPT_RCU
-	bool
-	default y if PREEMPT
-	help
-	  This option selects the RCU implementation that is
-	  designed for very large SMP systems with hundreds or
-	  thousands of CPUs, but for which real-time response
-	  is also required.  It also scales down nicely to
-	  smaller systems.
-
-	  Select this option if you are unsure.
-
-config TINY_RCU
-	bool
-	default y if !PREEMPT && !SMP
-	help
-	  This option selects the RCU implementation that is
-	  designed for UP systems from which real-time response
-	  is not required.  This option greatly reduces the
-	  memory footprint of RCU.
-
-config RCU_EXPERT
-	bool "Make expert-level adjustments to RCU configuration"
-	default n
-	help
-	  This option needs to be enabled if you wish to make
-	  expert-level adjustments to RCU configuration.  By default,
-	  no such adjustments can be made, which has the often-beneficial
-	  side-effect of preventing "make oldconfig" from asking you all
-	  sorts of detailed questions about how you would like numerous
-	  obscure RCU options to be set up.
-
-	  Say Y if you need to make expert-level adjustments to RCU.
-
-	  Say N if you are unsure.
-
-config SRCU
-	bool
-	help
-	  This option selects the sleepable version of RCU. This version
-	  permits arbitrary sleeping or blocking within RCU read-side critical
-	  sections.
-
-config TINY_SRCU
-	bool
-	default y if SRCU && TINY_RCU
-	help
-	  This option selects the single-CPU non-preemptible version of SRCU.
-
-config TREE_SRCU
-	bool
-	default y if SRCU && !TINY_RCU
-	help
-	  This option selects the full-fledged version of SRCU.
-
-config TASKS_RCU
-	bool
-	default n
-	select SRCU
-	help
-	  This option enables a task-based RCU implementation that uses
-	  only voluntary context switch (not preemption!), idle, and
-	  user-mode execution as quiescent states.
-
-config RCU_STALL_COMMON
-	def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
-	help
-	  This option enables RCU CPU stall code that is common between
-	  the TINY and TREE variants of RCU.  The purpose is to allow
-	  the tiny variants to disable RCU CPU stall warnings, while
-	  making these warnings mandatory for the tree variants.
-
-config RCU_NEED_SEGCBLIST
-	def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
-
-config CONTEXT_TRACKING
-       bool
-
-config CONTEXT_TRACKING_FORCE
-	bool "Force context tracking"
-	depends on CONTEXT_TRACKING
-	default y if !NO_HZ_FULL
-	help
-	  The major pre-requirement for full dynticks to work is to
-	  support the context tracking subsystem. But there are also
-	  other dependencies to provide in order to make the full
-	  dynticks working.
-
-	  This option stands for testing when an arch implements the
-	  context tracking backend but doesn't yet fullfill all the
-	  requirements to make the full dynticks feature working.
-	  Without the full dynticks, there is no way to test the support
-	  for context tracking and the subsystems that rely on it: RCU
-	  userspace extended quiescent state and tickless cputime
-	  accounting. This option copes with the absence of the full
-	  dynticks subsystem by forcing the context tracking on all
-	  CPUs in the system.
-
-	  Say Y only if you're working on the development of an
-	  architecture backend for the context tracking.
-
-	  Say N otherwise, this option brings an overhead that you
-	  don't want in production.
-
-
-config RCU_FANOUT
-	int "Tree-based hierarchical RCU fanout value"
-	range 2 64 if 64BIT
-	range 2 32 if !64BIT
-	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
-	default 64 if 64BIT
-	default 32 if !64BIT
-	help
-	  This option controls the fanout of hierarchical implementations
-	  of RCU, allowing RCU to work efficiently on machines with
-	  large numbers of CPUs.  This value must be at least the fourth
-	  root of NR_CPUS, which allows NR_CPUS to be insanely large.
-	  The default value of RCU_FANOUT should be used for production
-	  systems, but if you are stress-testing the RCU implementation
-	  itself, small RCU_FANOUT values allow you to test large-system
-	  code paths on small(er) systems.
-
-	  Select a specific number if testing RCU itself.
-	  Take the default if unsure.
-
-config RCU_FANOUT_LEAF
-	int "Tree-based hierarchical RCU leaf-level fanout value"
-	range 2 64 if 64BIT
-	range 2 32 if !64BIT
-	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
-	default 16
-	help
-	  This option controls the leaf-level fanout of hierarchical
-	  implementations of RCU, and allows trading off cache misses
-	  against lock contention.  Systems that synchronize their
-	  scheduling-clock interrupts for energy-efficiency reasons will
-	  want the default because the smaller leaf-level fanout keeps
-	  lock contention levels acceptably low.  Very large systems
-	  (hundreds or thousands of CPUs) will instead want to set this
-	  value to the maximum value possible in order to reduce the
-	  number of cache misses incurred during RCU's grace-period
-	  initialization.  These systems tend to run CPU-bound, and thus
-	  are not helped by synchronized interrupts, and thus tend to
-	  skew them, which reduces lock contention enough that large
-	  leaf-level fanouts work well.  That said, setting leaf-level
-	  fanout to a large number will likely cause problematic
-	  lock contention on the leaf-level rcu_node structures unless
-	  you boot with the skew_tick kernel parameter.
-
-	  Select a specific number if testing RCU itself.
-
-	  Select the maximum permissible value for large systems, but
-	  please understand that you may also need to set the skew_tick
-	  kernel boot parameter to avoid contention on the rcu_node
-	  structure's locks.
-
-	  Take the default if unsure.
-
-config RCU_FAST_NO_HZ
-	bool "Accelerate last non-dyntick-idle CPU's grace periods"
-	depends on NO_HZ_COMMON && SMP && RCU_EXPERT
-	default n
-	help
-	  This option permits CPUs to enter dynticks-idle state even if
-	  they have RCU callbacks queued, and prevents RCU from waking
-	  these CPUs up more than roughly once every four jiffies (by
-	  default, you can adjust this using the rcutree.rcu_idle_gp_delay
-	  parameter), thus improving energy efficiency.  On the other
-	  hand, this option increases the duration of RCU grace periods,
-	  for example, slowing down synchronize_rcu().
-
-	  Say Y if energy efficiency is critically important, and you
-	  	don't care about increased grace-period durations.
-
-	  Say N if you are unsure.
-
-config RCU_BOOST
-	bool "Enable RCU priority boosting"
-	depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
-	default n
-	help
-	  This option boosts the priority of preempted RCU readers that
-	  block the current preemptible RCU grace period for too long.
-	  This option also prevents heavy loads from blocking RCU
-	  callback invocation for all flavors of RCU.
-
-	  Say Y here if you are working with real-time apps or heavy loads
-	  Say N here if you are unsure.
-
-config RCU_BOOST_DELAY
-	int "Milliseconds to delay boosting after RCU grace-period start"
-	range 0 3000
-	depends on RCU_BOOST
-	default 500
-	help
-	  This option specifies the time to wait after the beginning of
-	  a given grace period before priority-boosting preempted RCU
-	  readers blocking that grace period.  Note that any RCU reader
-	  blocking an expedited RCU grace period is boosted immediately.
-
-	  Accept the default if unsure.
-
-config RCU_NOCB_CPU
-	bool "Offload RCU callback processing from boot-selected CPUs"
-	depends on TREE_RCU || PREEMPT_RCU
-	depends on RCU_EXPERT || NO_HZ_FULL
-	default n
-	help
-	  Use this option to reduce OS jitter for aggressive HPC or
-	  real-time workloads.	It can also be used to offload RCU
-	  callback invocation to energy-efficient CPUs in battery-powered
-	  asymmetric multiprocessors.
-
-	  This option offloads callback invocation from the set of
-	  CPUs specified at boot time by the rcu_nocbs parameter.
-	  For each such CPU, a kthread ("rcuox/N") will be created to
-	  invoke callbacks, where the "N" is the CPU being offloaded,
-	  and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
-	  "s" for RCU-sched.  Nothing prevents this kthread from running
-	  on the specified CPUs, but (1) the kthreads may be preempted
-	  between each callback, and (2) affinity or cgroups can be used
-	  to force the kthreads to run on whatever set of CPUs is desired.
-
-	  Say Y here if you want to help to debug reduced OS jitter.
-	  Say N here if you are unsure.
-
-endmenu # "RCU Subsystem"
+source "kernel/rcu/Kconfig"
 
 config BUILD_BIN2C
 	bool
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
new file mode 100644
index 000000000000..8edff43e8e94
--- /dev/null
+++ b/kernel/rcu/Kconfig
@@ -0,0 +1,242 @@
+#
+# RCU-related configuration options
+#
+
+menu "RCU Subsystem"
+
+config TREE_RCU
+	bool
+	default y if !PREEMPT && SMP
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP system with hundreds or
+	  thousands of CPUs.  It also scales down nicely to
+	  smaller systems.
+
+config PREEMPT_RCU
+	bool
+	default y if PREEMPT
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP systems with hundreds or
+	  thousands of CPUs, but for which real-time response
+	  is also required.  It also scales down nicely to
+	  smaller systems.
+
+	  Select this option if you are unsure.
+
+config TINY_RCU
+	bool
+	default y if !PREEMPT && !SMP
+	help
+	  This option selects the RCU implementation that is
+	  designed for UP systems from which real-time response
+	  is not required.  This option greatly reduces the
+	  memory footprint of RCU.
+
+config RCU_EXPERT
+	bool "Make expert-level adjustments to RCU configuration"
+	default n
+	help
+	  This option needs to be enabled if you wish to make
+	  expert-level adjustments to RCU configuration.  By default,
+	  no such adjustments can be made, which has the often-beneficial
+	  side-effect of preventing "make oldconfig" from asking you all
+	  sorts of detailed questions about how you would like numerous
+	  obscure RCU options to be set up.
+
+	  Say Y if you need to make expert-level adjustments to RCU.
+
+	  Say N if you are unsure.
+
+config SRCU
+	bool
+	help
+	  This option selects the sleepable version of RCU. This version
+	  permits arbitrary sleeping or blocking within RCU read-side critical
+	  sections.
+
+config TINY_SRCU
+	bool
+	default y if SRCU && TINY_RCU
+	help
+	  This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+	bool
+	default y if SRCU && !TINY_RCU
+	help
+	  This option selects the full-fledged version of SRCU.
+
+config TASKS_RCU
+	bool
+	default n
+	select SRCU
+	help
+	  This option enables a task-based RCU implementation that uses
+	  only voluntary context switch (not preemption!), idle, and
+	  user-mode execution as quiescent states.
+
+config RCU_STALL_COMMON
+	def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
+	help
+	  This option enables RCU CPU stall code that is common between
+	  the TINY and TREE variants of RCU.  The purpose is to allow
+	  the tiny variants to disable RCU CPU stall warnings, while
+	  making these warnings mandatory for the tree variants.
+
+config RCU_NEED_SEGCBLIST
+	def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU )
+
+config CONTEXT_TRACKING
+       bool
+
+config CONTEXT_TRACKING_FORCE
+	bool "Force context tracking"
+	depends on CONTEXT_TRACKING
+	default y if !NO_HZ_FULL
+	help
+	  The major pre-requirement for full dynticks to work is to
+	  support the context tracking subsystem. But there are also
+	  other dependencies to provide in order to make the full
+	  dynticks working.
+
+	  This option stands for testing when an arch implements the
+	  context tracking backend but doesn't yet fullfill all the
+	  requirements to make the full dynticks feature working.
+	  Without the full dynticks, there is no way to test the support
+	  for context tracking and the subsystems that rely on it: RCU
+	  userspace extended quiescent state and tickless cputime
+	  accounting. This option copes with the absence of the full
+	  dynticks subsystem by forcing the context tracking on all
+	  CPUs in the system.
+
+	  Say Y only if you're working on the development of an
+	  architecture backend for the context tracking.
+
+	  Say N otherwise, this option brings an overhead that you
+	  don't want in production.
+
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the fourth
+	  root of NR_CPUS, which allows NR_CPUS to be insanely large.
+	  The default value of RCU_FANOUT should be used for production
+	  systems, but if you are stress-testing the RCU implementation
+	  itself, small RCU_FANOUT values allow you to test large-system
+	  code paths on small(er) systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_LEAF
+	int "Tree-based hierarchical RCU leaf-level fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
+	default 16
+	help
+	  This option controls the leaf-level fanout of hierarchical
+	  implementations of RCU, and allows trading off cache misses
+	  against lock contention.  Systems that synchronize their
+	  scheduling-clock interrupts for energy-efficiency reasons will
+	  want the default because the smaller leaf-level fanout keeps
+	  lock contention levels acceptably low.  Very large systems
+	  (hundreds or thousands of CPUs) will instead want to set this
+	  value to the maximum value possible in order to reduce the
+	  number of cache misses incurred during RCU's grace-period
+	  initialization.  These systems tend to run CPU-bound, and thus
+	  are not helped by synchronized interrupts, and thus tend to
+	  skew them, which reduces lock contention enough that large
+	  leaf-level fanouts work well.  That said, setting leaf-level
+	  fanout to a large number will likely cause problematic
+	  lock contention on the leaf-level rcu_node structures unless
+	  you boot with the skew_tick kernel parameter.
+
+	  Select a specific number if testing RCU itself.
+
+	  Select the maximum permissible value for large systems, but
+	  please understand that you may also need to set the skew_tick
+	  kernel boot parameter to avoid contention on the rcu_node
+	  structure's locks.
+
+	  Take the default if unsure.
+
+config RCU_FAST_NO_HZ
+	bool "Accelerate last non-dyntick-idle CPU's grace periods"
+	depends on NO_HZ_COMMON && SMP && RCU_EXPERT
+	default n
+	help
+	  This option permits CPUs to enter dynticks-idle state even if
+	  they have RCU callbacks queued, and prevents RCU from waking
+	  these CPUs up more than roughly once every four jiffies (by
+	  default, you can adjust this using the rcutree.rcu_idle_gp_delay
+	  parameter), thus improving energy efficiency.  On the other
+	  hand, this option increases the duration of RCU grace periods,
+	  for example, slowing down synchronize_rcu().
+
+	  Say Y if energy efficiency is critically important, and you
+	  	don't care about increased grace-period durations.
+
+	  Say N if you are unsure.
+
+config RCU_BOOST
+	bool "Enable RCU priority boosting"
+	depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+	default n
+	help
+	  This option boosts the priority of preempted RCU readers that
+	  block the current preemptible RCU grace period for too long.
+	  This option also prevents heavy loads from blocking RCU
+	  callback invocation for all flavors of RCU.
+
+	  Say Y here if you are working with real-time apps or heavy loads
+	  Say N here if you are unsure.
+
+config RCU_BOOST_DELAY
+	int "Milliseconds to delay boosting after RCU grace-period start"
+	range 0 3000
+	depends on RCU_BOOST
+	default 500
+	help
+	  This option specifies the time to wait after the beginning of
+	  a given grace period before priority-boosting preempted RCU
+	  readers blocking that grace period.  Note that any RCU reader
+	  blocking an expedited RCU grace period is boosted immediately.
+
+	  Accept the default if unsure.
+
+config RCU_NOCB_CPU
+	bool "Offload RCU callback processing from boot-selected CPUs"
+	depends on TREE_RCU || PREEMPT_RCU
+	depends on RCU_EXPERT || NO_HZ_FULL
+	default n
+	help
+	  Use this option to reduce OS jitter for aggressive HPC or
+	  real-time workloads.	It can also be used to offload RCU
+	  callback invocation to energy-efficient CPUs in battery-powered
+	  asymmetric multiprocessors.
+
+	  This option offloads callback invocation from the set of
+	  CPUs specified at boot time by the rcu_nocbs parameter.
+	  For each such CPU, a kthread ("rcuox/N") will be created to
+	  invoke callbacks, where the "N" is the CPU being offloaded,
+	  and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
+	  "s" for RCU-sched.  Nothing prevents this kthread from running
+	  on the specified CPUs, but (1) the kthreads may be preempted
+	  between each callback, and (2) affinity or cgroups can be used
+	  to force the kthreads to run on whatever set of CPUs is desired.
+
+	  Say Y here if you want to help to debug reduced OS jitter.
+	  Say N here if you are unsure.
+
+endmenu # "RCU Subsystem"
-- 
cgit v1.3-14-g43fede


From 43a0a2a7d725f2ed2547cd656749eb66c093f2c2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 May 2017 09:19:44 -0700
Subject: rcu: Move RCU debug Kconfig options to kernel/rcu

RCU's debugging Kconfig options are in the unintuitive location
lib/Kconfig.debug, and there are enough of them that it would be good for
them to be more centralized.  This commit therefore extracts RCU's Kconfig
options from lib/Kconfig.debug into a new kernel/rcu/Kconfig.debug file.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/Kconfig.debug | 82 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug        | 79 +---------------------------------------------
 2 files changed, 83 insertions(+), 78 deletions(-)
 create mode 100644 kernel/rcu/Kconfig.debug

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
new file mode 100644
index 000000000000..0ec7d1d33a14
--- /dev/null
+++ b/kernel/rcu/Kconfig.debug
@@ -0,0 +1,82 @@
+#
+# RCU-related debugging configuration options
+#
+
+menu "RCU Debugging"
+
+config PROVE_RCU
+	def_bool PROVE_LOCKING
+
+config TORTURE_TEST
+	tristate
+	default n
+
+config RCU_PERF_TEST
+	tristate "performance tests for RCU"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	default n
+	help
+	  This option provides a kernel module that runs performance
+	  tests on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU performance tests to be built into
+	  the kernel.
+	  Say M if you want the RCU performance tests to build as a module.
+	  Say N if you are unsure.
+
+config RCU_TORTURE_TEST
+	tristate "torture tests for RCU"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	default n
+	help
+	  This option provides a kernel module that runs torture tests
+	  on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU torture tests to be built into
+	  the kernel.
+	  Say M if you want the RCU torture tests to build as a module.
+	  Say N if you are unsure.
+
+config RCU_CPU_STALL_TIMEOUT
+	int "RCU CPU stall timeout in seconds"
+	depends on RCU_STALL_COMMON
+	range 3 300
+	default 21
+	help
+	  If a given RCU grace period extends more than the specified
+	  number of seconds, a CPU stall warning is printed.  If the
+	  RCU grace period persists, additional CPU stall warnings are
+	  printed at more widely spaced intervals.
+
+config RCU_TRACE
+	bool "Enable tracing for RCU"
+	depends on DEBUG_KERNEL
+	default y if TREE_RCU
+	select TRACE_CLOCK
+	help
+	  This option enables additional tracepoints for ftrace-style
+	  event tracing.
+
+	  Say Y here if you want to enable RCU tracing
+	  Say N if you are unsure.
+
+config RCU_EQS_DEBUG
+	bool "Provide debugging asserts for adding NO_HZ support to an arch"
+	depends on DEBUG_KERNEL
+	help
+	  This option provides consistency checks in RCU's handling of
+	  NO_HZ.  These checks have proven quite helpful in detecting
+	  bugs in arch-specific NO_HZ code.
+
+	  Say N here if you need ultimate kernel/user switch latencies
+	  Say Y if you are unsure
+
+endmenu # "RCU Debugging"
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8c10b5a97b9e..a7a751a75cfd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1301,84 +1301,7 @@ config DEBUG_CREDENTIALS
 
 	  If unsure, say N.
 
-menu "RCU Debugging"
-
-config PROVE_RCU
-	def_bool PROVE_LOCKING
-
-config TORTURE_TEST
-	tristate
-	default n
-
-config RCU_PERF_TEST
-	tristate "performance tests for RCU"
-	depends on DEBUG_KERNEL
-	select TORTURE_TEST
-	select SRCU
-	select TASKS_RCU
-	default n
-	help
-	  This option provides a kernel module that runs performance
-	  tests on the RCU infrastructure.  The kernel module may be built
-	  after the fact on the running kernel to be tested, if desired.
-
-	  Say Y here if you want RCU performance tests to be built into
-	  the kernel.
-	  Say M if you want the RCU performance tests to build as a module.
-	  Say N if you are unsure.
-
-config RCU_TORTURE_TEST
-	tristate "torture tests for RCU"
-	depends on DEBUG_KERNEL
-	select TORTURE_TEST
-	select SRCU
-	select TASKS_RCU
-	default n
-	help
-	  This option provides a kernel module that runs torture tests
-	  on the RCU infrastructure.  The kernel module may be built
-	  after the fact on the running kernel to be tested, if desired.
-
-	  Say Y here if you want RCU torture tests to be built into
-	  the kernel.
-	  Say M if you want the RCU torture tests to build as a module.
-	  Say N if you are unsure.
-
-config RCU_CPU_STALL_TIMEOUT
-	int "RCU CPU stall timeout in seconds"
-	depends on RCU_STALL_COMMON
-	range 3 300
-	default 21
-	help
-	  If a given RCU grace period extends more than the specified
-	  number of seconds, a CPU stall warning is printed.  If the
-	  RCU grace period persists, additional CPU stall warnings are
-	  printed at more widely spaced intervals.
-
-config RCU_TRACE
-	bool "Enable tracing for RCU"
-	depends on DEBUG_KERNEL
-	default y if TREE_RCU
-	select TRACE_CLOCK
-	help
-	  This option enables additional tracepoints for ftrace-style
-	  event tracing.
-
-	  Say Y here if you want to enable RCU tracing
-	  Say N if you are unsure.
-
-config RCU_EQS_DEBUG
-	bool "Provide debugging asserts for adding NO_HZ support to an arch"
-	depends on DEBUG_KERNEL
-	help
-	  This option provides consistency checks in RCU's handling of
-	  NO_HZ.  These checks have proven quite helpful in detecting
-	  bugs in arch-specific NO_HZ code.
-
-	  Say N here if you need ultimate kernel/user switch latencies
-	  Say Y if you are unsure
-
-endmenu # "RCU Debugging"
+source "kernel/rcu/Kconfig.debug"
 
 config DEBUG_WQ_FORCE_RR_CPU
 	bool "Force round-robin CPU selection for unbound work items"
-- 
cgit v1.3-14-g43fede


From c23484f0e7bc89e1facb04103ce24efeebee76b9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 May 2017 10:17:03 -0700
Subject: rcu: Remove event tracing from Tiny RCU

This commit saves a few lines by getting rid of Tiny RCU's event tracing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tiny.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 2306cab2195d..595cb1bf944f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,7 +35,6 @@
 #include <linux/time.h>
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
-#include <linux/trace_events.h>
 
 #include "rcu.h"
 
@@ -139,7 +138,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		local_irq_restore(flags);
 		return;
 	}
-	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);)
 	list = rcp->rcucblist;
 	rcp->rcucblist = *rcp->donetail;
 	*rcp->donetail = NULL;
@@ -161,10 +159,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 		RCU_TRACE(cb_count++;)
 	}
 	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
-	RCU_TRACE(trace_rcu_batch_end(rcp->name,
-				      cb_count, 0, need_resched(),
-				      is_idle_task(current),
-				      false));
 }
 
 static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
-- 
cgit v1.3-14-g43fede


From 6d48152eafde1f0d0a4a9e0584fa7d9ff4fbfdac Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 May 2017 10:54:29 -0700
Subject: rcu: Remove RCU CPU stall warnings from Tiny RCU

Tiny RCU's job is to be tiny, so this commit removes its RCU CPU
stall warning code.  After this, there is no longer any need for
rcu_sched_ctrlblk and rcu_bh_ctrlblk to be in tiny_plugin.h, so this
commit also moves them to tiny.c.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/Kconfig                                 |  2 +-
 kernel/rcu/tiny.c                                  | 35 +++++-----
 kernel/rcu/tiny_plugin.h                           | 78 ----------------------
 .../selftests/rcutorture/configs/rcu/TINY02        |  1 -
 4 files changed, 19 insertions(+), 97 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 8edff43e8e94..be90c945063f 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -78,7 +78,7 @@ config TASKS_RCU
 	  user-mode execution as quiescent states.
 
 config RCU_STALL_COMMON
-	def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
+	def_bool ( TREE_RCU || PREEMPT_RCU )
 	help
 	  This option enables RCU CPU stall code that is common between
 	  the TINY and TREE variants of RCU.  The purpose is to allow
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 595cb1bf944f..f8488965250f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -38,11 +38,23 @@
 
 #include "rcu.h"
 
-/* Forward declarations for tiny_plugin.h. */
-struct rcu_ctrlblk;
-static void __call_rcu(struct rcu_head *head,
-		       rcu_callback_t func,
-		       struct rcu_ctrlblk *rcp);
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
+	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
+	struct rcu_head **curtail;	/* ->next pointer of last CB. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+	.donetail	= &rcu_sched_ctrlblk.rcucblist,
+	.curtail	= &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.donetail	= &rcu_bh_ctrlblk.rcucblist,
+	.curtail	= &rcu_bh_ctrlblk.rcucblist,
+};
 
 #include "tiny_plugin.h"
 
@@ -65,7 +77,6 @@ EXPORT_SYMBOL(rcu_barrier_sched);
  */
 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 {
-	RCU_TRACE(reset_cpu_stall_ticks(rcp);)
 	if (rcp->donetail != rcp->curtail) {
 		rcp->donetail = rcp->curtail;
 		return 1;
@@ -111,7 +122,6 @@ void rcu_bh_qs(void)
  */
 void rcu_check_callbacks(int user)
 {
-	RCU_TRACE(check_cpu_stalls();)
 	if (user)
 		rcu_sched_qs();
 	else if (!in_softirq())
@@ -126,10 +136,8 @@ void rcu_check_callbacks(int user)
  */
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 {
-	const char *rn = NULL;
 	struct rcu_head *next, *list;
 	unsigned long flags;
-	RCU_TRACE(int cb_count = 0;)
 
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
@@ -147,18 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	local_irq_restore(flags);
 
 	/* Invoke the callbacks on the local list. */
-	RCU_TRACE(rn = rcp->name;)
 	while (list) {
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
 		local_bh_disable();
-		__rcu_reclaim(rn, list);
+		__rcu_reclaim("", list);
 		local_bh_enable();
 		list = next;
-		RCU_TRACE(cb_count++;)
 	}
-	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);)
 }
 
 static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
@@ -202,7 +207,6 @@ static void __call_rcu(struct rcu_head *head,
 	local_irq_save(flags);
 	*rcp->curtail = head;
 	rcp->curtail = &head->next;
-	RCU_TRACE(rcp->qlen++;)
 	local_irq_restore(flags);
 
 	if (unlikely(is_idle_task(current))) {
@@ -235,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
 void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-	RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);)
-	RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);)
-
 	rcu_early_boot_tests();
 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index c642f23f1582..f0a01b2a3062 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -22,34 +22,6 @@
  * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
  */
 
-#include <linux/kthread.h>
-#include <linux/init.h>
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	struct rcu_head *rcucblist;	/* List of pending callbacks (CBs). */
-	struct rcu_head **donetail;	/* ->next pointer of last "done" CB. */
-	struct rcu_head **curtail;	/* ->next pointer of last CB. */
-	RCU_TRACE(long qlen);		/* Number of pending CBs. */
-	RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
-	RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
-	RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
-	RCU_TRACE(const char *name);	/* Name of RCU type. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
-	.donetail	= &rcu_sched_ctrlblk.rcucblist,
-	.curtail	= &rcu_sched_ctrlblk.rcucblist,
-	RCU_TRACE(.name = "rcu_sched")
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.donetail	= &rcu_bh_ctrlblk.rcucblist,
-	.curtail	= &rcu_bh_ctrlblk.rcucblist,
-	RCU_TRACE(.name = "rcu_bh")
-};
-
 #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
 #include <linux/kernel_stat.h>
 
@@ -73,53 +45,3 @@ void __init rcu_scheduler_starting(void)
 }
 
 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
-
-#ifdef CONFIG_RCU_TRACE
-
-static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	rcp->qlen -= n;
-	local_irq_restore(flags);
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	unsigned long j;
-	unsigned long js;
-
-	if (rcu_cpu_stall_suppress)
-		return;
-	rcp->ticks_this_gp++;
-	j = jiffies;
-	js = READ_ONCE(rcp->jiffies_stall);
-	if (rcp->rcucblist && ULONG_CMP_GE(j, js)) {
-		pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
-		       rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE,
-		       jiffies - rcp->gp_start, rcp->qlen);
-		dump_stack();
-		WRITE_ONCE(rcp->jiffies_stall,
-			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
-	} else if (ULONG_CMP_GE(j, js)) {
-		WRITE_ONCE(rcp->jiffies_stall,
-			   jiffies + rcu_jiffies_till_stall_check());
-	}
-}
-
-static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
-{
-	rcp->ticks_this_gp = 0;
-	rcp->gp_start = jiffies;
-	WRITE_ONCE(rcp->jiffies_stall,
-		   jiffies + rcu_jiffies_till_stall_check());
-}
-
-static void check_cpu_stalls(void)
-{
-	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
-	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
index 1f6bebbf5da8..d8674264318d 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -6,7 +6,6 @@ CONFIG_PREEMPT=n
 CONFIG_HZ_PERIODIC=y
 CONFIG_NO_HZ_IDLE=n
 CONFIG_NO_HZ_FULL=n
-CONFIG_RCU_TRACE=y
 CONFIG_PROVE_LOCKING=y
 #CHECK#CONFIG_PROVE_RCU=y
 CONFIG_DEBUG_LOCK_ALLOC=y
-- 
cgit v1.3-14-g43fede


From 67e707bd68269aac70904943f07a979eeb163b13 Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Thu, 8 Jun 2017 18:09:09 +0530
Subject: config: android-recommended: enable fstack-protector-strong

If the compiler has stack protector support, set
CONFIG_CC_STACKPROTECTOR_STRONG.

Reviewed-at: https://android-review.googlesource.com/#/c/238388/

Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
[AmitP: cherry-picked this change from Android common kernel]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-recommended.config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 28ee064b6744..a86faa41bfd2 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -11,6 +11,7 @@ CONFIG_BLK_DEV_DM=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_CC_STACKPROTECTOR_STRONG=y
 CONFIG_COMPACTION=y
 CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_DM_CRYPT=y
-- 
cgit v1.3-14-g43fede


From 0c9238c7a1cfd834d8bb96a2b1fabe0b1a5961df Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Jun 2017 18:09:10 +0530
Subject: config: android-recommended: enable CONFIG_ARM64_SW_TTBR0_PAN

Enable PAN emulation using TTBR0_EL1 switching.

Reviewed-at: https://android-review.googlesource.com/#/c/325997/

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
[AmitP: cherry-picked this change from Android common kernel
        and updated the commit message]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-recommended.config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index a86faa41bfd2..a02c447769f7 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,6 +6,7 @@
 # CONFIG_NF_CONNTRACK_SIP is not set
 # CONFIG_PM_WAKELOCKS_GC is not set
 # CONFIG_VT is not set
+CONFIG_ARM64_SW_TTBR0_PAN=y
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 CONFIG_BLK_DEV_DM=y
 CONFIG_BLK_DEV_LOOP=y
-- 
cgit v1.3-14-g43fede


From c1ebc2febdb85a73a4f91a9b9eaab6387619eaa6 Mon Sep 17 00:00:00 2001
From: Max Shi <meixuanshi@google.com>
Date: Thu, 8 Jun 2017 18:09:11 +0530
Subject: config: android-base: disable CONFIG_USELIB and CONFIG_FHANDLE

Turn off the two kernel configs to disable related system ABI.

Reviewed-at: https://android-review.googlesource.com/#/c/264976/

Signed-off-by: Max Shi <meixuanshi@google.com>
[AmitP: cherry-picked this change from Android common kernel]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-base.config | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 26a06e09a5bd..efe5ff86767e 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,10 +1,12 @@
 #  KEEP ALPHABETICALLY SORTED
 # CONFIG_DEVKMEM is not set
 # CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
 # CONFIG_MODULES is not set
 # CONFIG_OABI_COMPAT is not set
 # CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
 CONFIG_ANDROID=y
 CONFIG_ANDROID_BINDER_IPC=y
 CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-- 
cgit v1.3-14-g43fede


From fb0b1538983c1cf7d2a2242b332a34a953753624 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Jun 2017 18:09:12 +0530
Subject: config: android-recommended: enable CONFIG_CPU_SW_DOMAIN_PAN

Enable CPU domain PAN to ensure that normal kernel accesses are
unable to access userspace addresses.

Reviewed-at: https://android-review.googlesource.com/#/c/334035/

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
[AmitP: cherry-picked this change from Android common kernel, updated
        the commit message and re-placed the CONFIG_STRICT_KERNEL_RWX
        config in sorted order]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-recommended.config | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index a02c447769f7..946fb92418f7 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -14,7 +14,7 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_CC_STACKPROTECTOR_STRONG=y
 CONFIG_COMPACTION=y
-CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_CPU_SW_DOMAIN_PAN=y
 CONFIG_DM_CRYPT=y
 CONFIG_DM_UEVENT=y
 CONFIG_DM_VERITY=y
@@ -107,6 +107,7 @@ CONFIG_SCHEDSTATS=y
 CONFIG_SMARTJOYPLUS_FF=y
 CONFIG_SND=y
 CONFIG_SOUND=y
+CONFIG_STRICT_KERNEL_RWX=y
 CONFIG_SUSPEND_TIME=y
 CONFIG_TABLET_USB_ACECAD=y
 CONFIG_TABLET_USB_AIPTEK=y
-- 
cgit v1.3-14-g43fede


From 5b89db2fa545b473dc352689ac3afe407367ea34 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@google.com>
Date: Thu, 8 Jun 2017 18:09:13 +0530
Subject: config: android-base: add CONFIG_IKCONFIG option

This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a
requirement for the O release.

Reviewed-at: https://android-review.googlesource.com/#/c/364553/

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
[AmitP: cherry-picked this change from Android common kernel]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-base.config | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index efe5ff86767e..e12cfec25758 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -25,6 +25,8 @@ CONFIG_EMBEDDED=y
 CONFIG_FB=y
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
 CONFIG_INET6_AH=y
 CONFIG_INET6_ESP=y
 CONFIG_INET6_IPCOMP=y
-- 
cgit v1.3-14-g43fede


From 2096e1706336d83cd66ca744e4d904af4d63e25c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@google.com>
Date: Thu, 8 Jun 2017 18:09:14 +0530
Subject: config: android-base: add CONFIG_MODULES option

This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS
which are required by the O release.

Reviewed-at: https://android-review.googlesource.com/#/c/364554/

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
[AmitP: cherry-picked this change from Android common kernel]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-base.config | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index e12cfec25758..62cb392fc34b 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -3,7 +3,6 @@
 # CONFIG_DEVMEM is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
-# CONFIG_MODULES is not set
 # CONFIG_OABI_COMPAT is not set
 # CONFIG_SYSVIPC is not set
 # CONFIG_USELIB is not set
@@ -64,6 +63,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
 CONFIG_IP_NF_TARGET_NETMAP=y
 CONFIG_IP_NF_TARGET_REDIRECT=y
 CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
 CONFIG_NET=y
 CONFIG_NETDEVICES=y
 CONFIG_NETFILTER=y
-- 
cgit v1.3-14-g43fede


From 2edfe6be206adc4c1055e053322d27267f8952bc Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Thu, 8 Jun 2017 18:09:15 +0530
Subject: config: android-base: add CGROUP_BPF

Add CONFIG_CGROUP_BPF as a default configuration in android base config
since it will be used to replace XT_QTAGUID in the future.

Reviewed-at: https://android-review.googlesource.com/#/c/400374/

Signed-off-by: Chenbo Feng <fengc@google.com>
[AmitP: cherry-picked this change from Android common kernel]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-base.config | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 62cb392fc34b..cdde5af6b332 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -14,6 +14,7 @@ CONFIG_ASHMEM=y
 CONFIG_AUDIT=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_CGROUPS=y
+CONFIG_CGROUP_BPF=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_DEBUG=y
 CONFIG_CGROUP_FREEZER=y
-- 
cgit v1.3-14-g43fede


From 9e69dd0179c346dfb5d08b8d46d5f5c9c81ab1b7 Mon Sep 17 00:00:00 2001
From: Roberto Pereira <rpere@google.com>
Date: Thu, 8 Jun 2017 18:09:16 +0530
Subject: config: android-base: disable CONFIG_NFSD and CONFIG_NFS_FS

Disable Network file system support.

Reviewed-at: https://android-review.googlesource.com/#/c/409559/

Signed-off-by: Roberto Pereira <rpere@google.com>
[AmitP: cherry-picked this change from Android common kernel
        and updated commit message]
Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/configs/android-base.config | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index cdde5af6b332..d70829033bb7 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -3,6 +3,8 @@
 # CONFIG_DEVMEM is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_INET_LRO is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
 # CONFIG_OABI_COMPAT is not set
 # CONFIG_SYSVIPC is not set
 # CONFIG_USELIB is not set
-- 
cgit v1.3-14-g43fede


From 4e4cbee93d56137ebff722be022cae5f70ef84fb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 3 Jun 2017 09:38:06 +0200
Subject: block: switch bios to blk_status_t

Replace bi_error with a new bi_status to allow for a clear conversion.
Note that device mapper overloaded bi_error with a private value, which
we'll have to keep around at least for now and thus propagate to a
proper blk_status_t value.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c               |  8 ++--
 block/bio.c                         |  8 ++--
 block/blk-core.c                    | 20 ++++++----
 block/blk-integrity.c               |  4 +-
 block/bounce.c                      |  4 +-
 block/t10-pi.c                      | 30 +++++++--------
 drivers/block/aoe/aoecmd.c          | 10 ++---
 drivers/block/aoe/aoedev.c          |  2 +-
 drivers/block/drbd/drbd_actlog.c    |  2 +-
 drivers/block/drbd/drbd_bitmap.c    |  6 +--
 drivers/block/drbd/drbd_int.h       |  2 +-
 drivers/block/drbd/drbd_receiver.c  |  6 +--
 drivers/block/drbd/drbd_req.c       |  6 +--
 drivers/block/drbd/drbd_worker.c    | 16 ++++----
 drivers/block/floppy.c              |  4 +-
 drivers/block/pktcdvd.c             | 18 ++++-----
 drivers/block/ps3vram.c             | 14 +++----
 drivers/block/rsxx/dev.c            | 14 +++----
 drivers/block/rsxx/dma.c            | 13 +++----
 drivers/block/rsxx/rsxx_priv.h      |  2 +-
 drivers/block/umem.c                |  2 +-
 drivers/block/xen-blkback/blkback.c | 19 ++++------
 drivers/block/xen-blkfront.c        |  2 +-
 drivers/lightnvm/pblk-core.c        |  4 +-
 drivers/lightnvm/pblk-read.c        |  4 +-
 drivers/lightnvm/pblk-write.c       |  2 +-
 drivers/lightnvm/rrpc.c             |  8 ++--
 drivers/md/bcache/bcache.h          |  7 ++--
 drivers/md/bcache/btree.c           |  6 +--
 drivers/md/bcache/io.c              |  6 +--
 drivers/md/bcache/journal.c         |  2 +-
 drivers/md/bcache/movinggc.c        | 10 ++---
 drivers/md/bcache/request.c         | 28 +++++++-------
 drivers/md/bcache/request.h         |  2 +-
 drivers/md/bcache/super.c           |  6 +--
 drivers/md/bcache/writeback.c       |  4 +-
 drivers/md/dm-bio-prison-v1.c       |  4 +-
 drivers/md/dm-bio-prison-v1.h       |  2 +-
 drivers/md/dm-bufio.c               | 28 +++++++-------
 drivers/md/dm-cache-target.c        | 34 +++++++++--------
 drivers/md/dm-crypt.c               | 34 ++++++++---------
 drivers/md/dm-flakey.c              |  5 ++-
 drivers/md/dm-integrity.c           | 18 ++++-----
 drivers/md/dm-io.c                  | 10 ++---
 drivers/md/dm-log-writes.c          |  7 ++--
 drivers/md/dm-mpath.c               | 15 ++++----
 drivers/md/dm-raid1.c               | 13 ++++---
 drivers/md/dm-rq.c                  |  2 +-
 drivers/md/dm-snap.c                |  5 ++-
 drivers/md/dm-stripe.c              |  5 ++-
 drivers/md/dm-thin.c                | 65 ++++++++++++++++----------------
 drivers/md/dm-verity-target.c       | 10 ++---
 drivers/md/dm.c                     | 40 ++++++++++----------
 drivers/md/md.c                     |  8 ++--
 drivers/md/multipath.c              | 10 ++---
 drivers/md/raid1.c                  | 36 +++++++++---------
 drivers/md/raid10.c                 | 36 +++++++++---------
 drivers/md/raid5-cache.c            |  4 +-
 drivers/md/raid5-ppl.c              |  2 +-
 drivers/md/raid5.c                  | 22 +++++------
 drivers/nvdimm/blk.c                |  4 +-
 drivers/nvdimm/btt.c                |  4 +-
 drivers/nvdimm/pmem.c               | 28 +++++++-------
 drivers/nvme/target/io-cmd.c        |  4 +-
 drivers/target/target_core_iblock.c | 10 ++---
 fs/block_dev.c                      | 18 +++++----
 fs/btrfs/btrfs_inode.h              |  3 +-
 fs/btrfs/check-integrity.c          |  4 +-
 fs/btrfs/compression.c              | 44 +++++++++++-----------
 fs/btrfs/compression.h              |  4 +-
 fs/btrfs/ctree.h                    |  6 +--
 fs/btrfs/disk-io.c                  | 75 ++++++++++++++++++-------------------
 fs/btrfs/disk-io.h                  | 12 +++---
 fs/btrfs/extent_io.c                | 23 +++++++-----
 fs/btrfs/extent_io.h                |  6 +--
 fs/btrfs/file-item.c                | 14 +++----
 fs/btrfs/inode.c                    | 73 ++++++++++++++++++------------------
 fs/btrfs/raid56.c                   | 16 ++++----
 fs/btrfs/scrub.c                    | 26 ++++++-------
 fs/btrfs/volumes.c                  | 11 +++---
 fs/buffer.c                         |  2 +-
 fs/crypto/bio.c                     |  2 +-
 fs/direct-io.c                      |  8 ++--
 fs/ext4/page-io.c                   | 13 ++++---
 fs/ext4/readpage.c                  |  4 +-
 fs/f2fs/data.c                      | 10 ++---
 fs/f2fs/segment.c                   |  2 +-
 fs/gfs2/lops.c                      |  8 ++--
 fs/gfs2/meta_io.c                   |  2 +-
 fs/gfs2/ops_fstype.c                |  4 +-
 fs/iomap.c                          |  4 +-
 fs/jfs/jfs_logmgr.c                 |  2 +-
 fs/jfs/jfs_metapage.c               |  4 +-
 fs/mpage.c                          |  3 +-
 fs/nfs/blocklayout/blocklayout.c    |  4 +-
 fs/nilfs2/segbuf.c                  |  2 +-
 fs/ocfs2/cluster/heartbeat.c        |  6 +--
 fs/xfs/xfs_aops.c                   |  7 ++--
 fs/xfs/xfs_buf.c                    |  7 +++-
 include/linux/bio.h                 |  2 +-
 include/linux/blk_types.h           |  5 ++-
 include/linux/blkdev.h              |  2 +-
 include/linux/device-mapper.h       |  2 +-
 kernel/power/swap.c                 | 14 +++----
 kernel/trace/blktrace.c             |  4 +-
 mm/page_io.c                        |  4 +-
 106 files changed, 625 insertions(+), 603 deletions(-)

(limited to 'kernel')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5384713d48bc..17b9740e138b 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -221,7 +221,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
  * @bio:	bio to generate/verify integrity metadata for
  * @proc_fn:	Pointer to the relevant processing function
  */
-static int bio_integrity_process(struct bio *bio,
+static blk_status_t bio_integrity_process(struct bio *bio,
 				 integrity_processing_fn *proc_fn)
 {
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
@@ -229,7 +229,7 @@ static int bio_integrity_process(struct bio *bio,
 	struct bvec_iter bviter;
 	struct bio_vec bv;
 	struct bio_integrity_payload *bip = bio_integrity(bio);
-	unsigned int ret = 0;
+	blk_status_t ret = BLK_STS_OK;
 	void *prot_buf = page_address(bip->bip_vec->bv_page) +
 		bip->bip_vec->bv_offset;
 
@@ -366,7 +366,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio *bio = bip->bip_bio;
 	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
 
-	bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn);
+	bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
@@ -395,7 +395,7 @@ void bio_integrity_endio(struct bio *bio)
 	 * integrity metadata.  Restore original bio end_io handler
 	 * and run it.
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		bio->bi_end_io = bip->bip_end_io;
 		bio_endio(bio);
 
diff --git a/block/bio.c b/block/bio.c
index 888e7801c638..7a5c8ed27f42 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -309,8 +309,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
 {
 	struct bio *parent = bio->bi_private;
 
-	if (!parent->bi_error)
-		parent->bi_error = bio->bi_error;
+	if (!parent->bi_status)
+		parent->bi_status = bio->bi_status;
 	bio_put(bio);
 	return parent;
 }
@@ -918,7 +918,7 @@ static void submit_bio_wait_endio(struct bio *bio)
 {
 	struct submit_bio_ret *ret = bio->bi_private;
 
-	ret->error = bio->bi_error;
+	ret->error = blk_status_to_errno(bio->bi_status);
 	complete(&ret->event);
 }
 
@@ -1818,7 +1818,7 @@ again:
 
 	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
 		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
-					 bio, bio->bi_error);
+					 bio, bio->bi_status);
 		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 	}
 
diff --git a/block/blk-core.c b/block/blk-core.c
index e942a9f814c7..3d84820ace9e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -144,6 +144,9 @@ static const struct {
 	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
 	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
 
+	/* device mapper special case, should not leak out: */
+	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
+
 	/* everything else not covered above: */
 	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
 };
@@ -188,7 +191,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, blk_status_t error)
 {
 	if (error)
-		bio->bi_error = blk_status_to_errno(error);
+		bio->bi_status = error;
 
 	if (unlikely(rq->rq_flags & RQF_QUIET))
 		bio_set_flag(bio, BIO_QUIET);
@@ -1717,7 +1720,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	blk_queue_split(q, &bio, q->bio_split);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -1775,7 +1778,10 @@ get_rq:
 	req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
 		__wbt_done(q->rq_wb, wb_acct);
-		bio->bi_error = PTR_ERR(req);
+		if (PTR_ERR(req) == -ENOMEM)
+			bio->bi_status = BLK_STS_RESOURCE;
+		else
+			bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		goto out_unlock;
 	}
@@ -1930,7 +1936,7 @@ generic_make_request_checks(struct bio *bio)
 {
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
-	int err = -EIO;
+	blk_status_t status = BLK_STS_IOERR;
 	char b[BDEVNAME_SIZE];
 	struct hd_struct *part;
 
@@ -1973,7 +1979,7 @@ generic_make_request_checks(struct bio *bio)
 	    !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
 		bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
 		if (!nr_sectors) {
-			err = 0;
+			status = BLK_STS_OK;
 			goto end_io;
 		}
 	}
@@ -2025,9 +2031,9 @@ generic_make_request_checks(struct bio *bio)
 	return true;
 
 not_supported:
-	err = -EOPNOTSUPP;
+	status = BLK_STS_NOTSUPP;
 end_io:
-	bio->bi_error = err;
+	bio->bi_status = status;
 	bio_endio(bio);
 	return false;
 }
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 0f891a9aff4d..feb30570eaf5 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -384,9 +384,9 @@ static struct kobj_type integrity_ktype = {
 	.sysfs_ops	= &integrity_ops,
 };
 
-static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
+static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
 {
-	return 0;
+	return BLK_STS_OK;
 }
 
 static const struct blk_integrity_profile nop_profile = {
diff --git a/block/bounce.c b/block/bounce.c
index 1cb5dd3a5da1..e4703181d97f 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -143,7 +143,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
 		mempool_free(bvec->bv_page, pool);
 	}
 
-	bio_orig->bi_error = bio->bi_error;
+	bio_orig->bi_status = bio->bi_status;
 	bio_endio(bio_orig);
 	bio_put(bio);
 }
@@ -163,7 +163,7 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
 {
 	struct bio *bio_orig = bio->bi_private;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		copy_to_high_bio_irq(bio_orig, bio);
 
 	bounce_end_io(bio, pool);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 680c6d636298..350b3cbcf9e5 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -46,8 +46,8 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
  * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
  * tag.
  */
-static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
-			   unsigned int type)
+static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
+		csum_fn *fn, unsigned int type)
 {
 	unsigned int i;
 
@@ -67,11 +67,11 @@ static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
 		iter->seed++;
 	}
 
-	return 0;
+	return BLK_STS_OK;
 }
 
-static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
-				unsigned int type)
+static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
+		csum_fn *fn, unsigned int type)
 {
 	unsigned int i;
 
@@ -108,7 +108,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
 			       "(rcvd %04x, want %04x)\n", iter->disk_name,
 			       (unsigned long long)iter->seed,
 			       be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
-			return -EILSEQ;
+			return BLK_STS_PROTECTION;
 		}
 
 next:
@@ -117,45 +117,45 @@ next:
 		iter->seed++;
 	}
 
-	return 0;
+	return BLK_STS_OK;
 }
 
-static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_crc_fn, 1);
 }
 
-static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_ip_fn, 1);
 }
 
-static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_crc_fn, 1);
 }
 
-static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_ip_fn, 1);
 }
 
-static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_crc_fn, 3);
 }
 
-static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_generate(iter, t10_pi_ip_fn, 3);
 }
 
-static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_crc_fn, 3);
 }
 
-static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
+static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
 {
 	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 5bf0c9d21fc1..dc43254e05a4 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1070,7 +1070,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
 		d->ip.rq = NULL;
 	do {
 		bio = rq->bio;
-		bok = !fastfail && !bio->bi_error;
+		bok = !fastfail && !bio->bi_status;
 	} while (__blk_end_request(rq, bok ? BLK_STS_OK : BLK_STS_IOERR, bio->bi_iter.bi_size));
 
 	/* cf. http://lkml.org/lkml/2006/10/31/28 */
@@ -1131,7 +1131,7 @@ ktiocomplete(struct frame *f)
 			ahout->cmdstat, ahin->cmdstat,
 			d->aoemajor, d->aoeminor);
 noskb:		if (buf)
-			buf->bio->bi_error = -EIO;
+			buf->bio->bi_status = BLK_STS_IOERR;
 		goto out;
 	}
 
@@ -1144,7 +1144,7 @@ noskb:		if (buf)
 				"aoe: runt data size in read from",
 				(long) d->aoemajor, d->aoeminor,
 			       skb->len, n);
-			buf->bio->bi_error = -EIO;
+			buf->bio->bi_status = BLK_STS_IOERR;
 			break;
 		}
 		if (n > f->iter.bi_size) {
@@ -1152,7 +1152,7 @@ noskb:		if (buf)
 				"aoe: too-large data size in read from",
 				(long) d->aoemajor, d->aoeminor,
 				n, f->iter.bi_size);
-			buf->bio->bi_error = -EIO;
+			buf->bio->bi_status = BLK_STS_IOERR;
 			break;
 		}
 		bvcpy(skb, f->buf->bio, f->iter, n);
@@ -1654,7 +1654,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf)
 	if (buf == NULL)
 		return;
 	buf->iter.bi_size = 0;
-	buf->bio->bi_error = -EIO;
+	buf->bio->bi_status = BLK_STS_IOERR;
 	if (buf->nframesout == 0)
 		aoe_end_buf(d, buf);
 }
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index ffd1947500c6..b28fefb90391 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d)
 	if (rq == NULL)
 		return;
 	while ((bio = d->ip.nxbio)) {
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		d->ip.nxbio = bio->bi_next;
 		n = (unsigned long) rq->special;
 		rq->special = (void *) --n;
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 8d7bcfa49c12..e02c45cd3c5a 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -178,7 +178,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
 	else
 		submit_bio(bio);
 	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		err = device->md_io.error;
 
  out:
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index a804a4107fbc..809fd245c3dc 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -959,16 +959,16 @@ static void drbd_bm_endio(struct bio *bio)
 	    !bm_test_page_unchanged(b->bm_pages[idx]))
 		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		/* ctx error will hold the completed-last non-zero error code,
 		 * in case error codes differ. */
-		ctx->error = bio->bi_error;
+		ctx->error = blk_status_to_errno(bio->bi_status);
 		bm_set_page_io_err(b->bm_pages[idx]);
 		/* Not identical to on disk version of it.
 		 * Is BM_PAGE_IO_ERROR enough? */
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
-					bio->bi_error, idx);
+					bio->bi_status, idx);
 	} else {
 		bm_clear_page_io_err(b->bm_pages[idx]);
 		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d5da45bb03a6..76761b4ca13e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1627,7 +1627,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device,
 	__release(local);
 	if (!bio->bi_bdev) {
 		drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
-		bio->bi_error = -ENODEV;
+		bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return;
 	}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1b0a2be24f39..c7e95e6380fb 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1229,9 +1229,9 @@ void one_flush_endio(struct bio *bio)
 	struct drbd_device *device = octx->device;
 	struct issue_flush_context *ctx = octx->ctx;
 
-	if (bio->bi_error) {
-		ctx->error = bio->bi_error;
-		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
+	if (bio->bi_status) {
+		ctx->error = blk_status_to_errno(bio->bi_status);
+		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
 	}
 	kfree(octx);
 	bio_put(bio);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 656624314f0d..fca6b9914948 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -203,7 +203,7 @@ void start_new_tl_epoch(struct drbd_connection *connection)
 void complete_master_bio(struct drbd_device *device,
 		struct bio_and_error *m)
 {
-	m->bio->bi_error = m->error;
+	m->bio->bi_status = errno_to_blk_status(m->error);
 	bio_endio(m->bio);
 	dec_ap_bio(device);
 }
@@ -1157,7 +1157,7 @@ static void drbd_process_discard_req(struct drbd_request *req)
 
 	if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9,
 			GFP_NOIO, 0))
-		req->private_bio->bi_error = -EIO;
+		req->private_bio->bi_status = BLK_STS_IOERR;
 	bio_endio(req->private_bio);
 }
 
@@ -1225,7 +1225,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 		/* only pass the error to the upper layers.
 		 * if user cannot handle io errors, that's not our business. */
 		drbd_err(device, "could not kmalloc() req\n");
-		bio->bi_error = -ENOMEM;
+		bio->bi_status = BLK_STS_RESOURCE;
 		bio_endio(bio);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 1afcb4e02d8d..1d8726a8df34 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -63,7 +63,7 @@ void drbd_md_endio(struct bio *bio)
 	struct drbd_device *device;
 
 	device = bio->bi_private;
-	device->md_io.error = bio->bi_error;
+	device->md_io.error = blk_status_to_errno(bio->bi_status);
 
 	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
 	 * to timeout on the lower level device, and eventually detach from it.
@@ -177,13 +177,13 @@ void drbd_peer_request_endio(struct bio *bio)
 	bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
 			  bio_op(bio) == REQ_OP_DISCARD;
 
-	if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
+	if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
 				is_write ? (is_discard ? "discard" : "write")
-					: "read", bio->bi_error,
+					: "read", bio->bi_status,
 				(unsigned long long)peer_req->i.sector);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		set_bit(__EE_WAS_ERROR, &peer_req->flags);
 
 	bio_put(bio); /* no need for the bio anymore */
@@ -243,16 +243,16 @@ void drbd_request_endio(struct bio *bio)
 		if (__ratelimit(&drbd_ratelimit_state))
 			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
 
-		if (!bio->bi_error)
+		if (!bio->bi_status)
 			drbd_panic_after_delayed_completion_of_aborted_request(device);
 	}
 
 	/* to avoid recursion in __req_mod */
-	if (unlikely(bio->bi_error)) {
+	if (unlikely(bio->bi_status)) {
 		switch (bio_op(bio)) {
 		case REQ_OP_WRITE_ZEROES:
 		case REQ_OP_DISCARD:
-			if (bio->bi_error == -EOPNOTSUPP)
+			if (bio->bi_status == BLK_STS_NOTSUPP)
 				what = DISCARD_COMPLETED_NOTSUPP;
 			else
 				what = DISCARD_COMPLETED_WITH_ERROR;
@@ -272,7 +272,7 @@ void drbd_request_endio(struct bio *bio)
 	}
 
 	bio_put(req->private_bio);
-	req->private_bio = ERR_PTR(bio->bi_error);
+	req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status));
 
 	/* not req_mod(), we need irqsave here! */
 	spin_lock_irqsave(&device->resource->req_lock, flags);
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cc75a5176057..9e3cb32e365d 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3780,9 +3780,9 @@ static void floppy_rb0_cb(struct bio *bio)
 	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
 	int drive = cbdata->drive;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		pr_info("floppy: error %d while reading block 0\n",
-			bio->bi_error);
+			bio->bi_status);
 		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
 	}
 	complete(&cbdata->complete);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 42e3c880a8a5..e8a381161db6 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -952,9 +952,9 @@ static void pkt_end_io_read(struct bio *bio)
 
 	pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
 		bio, (unsigned long long)pkt->sector,
-		(unsigned long long)bio->bi_iter.bi_sector, bio->bi_error);
+		(unsigned long long)bio->bi_iter.bi_sector, bio->bi_status);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		atomic_inc(&pkt->io_errors);
 	if (atomic_dec_and_test(&pkt->io_wait)) {
 		atomic_inc(&pkt->run_sm);
@@ -969,7 +969,7 @@ static void pkt_end_io_packet_write(struct bio *bio)
 	struct pktcdvd_device *pd = pkt->pd;
 	BUG_ON(!pd);
 
-	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error);
+	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status);
 
 	pd->stats.pkt_ended++;
 
@@ -1305,16 +1305,16 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 	pkt_queue_bio(pd, pkt->w_bio);
 }
 
-static void pkt_finish_packet(struct packet_data *pkt, int error)
+static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status)
 {
 	struct bio *bio;
 
-	if (error)
+	if (status)
 		pkt->cache_valid = 0;
 
 	/* Finish all bios corresponding to this packet */
 	while ((bio = bio_list_pop(&pkt->orig_bios))) {
-		bio->bi_error = error;
+		bio->bi_status = status;
 		bio_endio(bio);
 	}
 }
@@ -1349,7 +1349,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
 			if (atomic_read(&pkt->io_wait) > 0)
 				return;
 
-			if (!pkt->w_bio->bi_error) {
+			if (!pkt->w_bio->bi_status) {
 				pkt_set_state(pkt, PACKET_FINISHED_STATE);
 			} else {
 				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
@@ -1366,7 +1366,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data
 			break;
 
 		case PACKET_FINISHED_STATE:
-			pkt_finish_packet(pkt, pkt->w_bio->bi_error);
+			pkt_finish_packet(pkt, pkt->w_bio->bi_status);
 			return;
 
 		default:
@@ -2301,7 +2301,7 @@ static void pkt_end_io_read_cloned(struct bio *bio)
 	struct packet_stacked_data *psd = bio->bi_private;
 	struct pktcdvd_device *pd = psd->pd;
 
-	psd->bio->bi_error = bio->bi_error;
+	psd->bio->bi_status = bio->bi_status;
 	bio_put(bio);
 	bio_endio(psd->bio);
 	mempool_free(psd, psd_pool);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 456b4fe21559..6fa2b8197013 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -428,7 +428,7 @@ static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev)
 	kfree(priv->cache.tags);
 }
 
-static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
+static blk_status_t ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 			size_t len, size_t *retlen, u_char *buf)
 {
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
@@ -438,7 +438,7 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 		(unsigned int)from, len);
 
 	if (from >= priv->size)
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	if (len > priv->size - from)
 		len = priv->size - from;
@@ -472,14 +472,14 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
 	return 0;
 }
 
-static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
+static blk_status_t ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
 			 size_t len, size_t *retlen, const u_char *buf)
 {
 	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
 	unsigned int cached, count;
 
 	if (to >= priv->size)
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	if (len > priv->size - to)
 		len = priv->size - to;
@@ -554,7 +554,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev,
 	int write = bio_data_dir(bio) == WRITE;
 	const char *op = write ? "write" : "read";
 	loff_t offset = bio->bi_iter.bi_sector << 9;
-	int error = 0;
+	blk_status_t error = 0;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
 	struct bio *next;
@@ -578,7 +578,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev,
 
 		if (retlen != len) {
 			dev_err(&dev->core, "Short %s\n", op);
-			error = -EIO;
+			error = BLK_STS_IOERR;
 			goto out;
 		}
 
@@ -593,7 +593,7 @@ out:
 	next = bio_list_peek(&priv->list);
 	spin_unlock_irq(&priv->lock);
 
-	bio->bi_error = error;
+	bio->bi_status = error;
 	bio_endio(bio);
 	return next;
 }
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 9c566364ac9c..0b0a0a902355 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -149,7 +149,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct rsxx_cardinfo *card = q->queuedata;
 	struct rsxx_bio_meta *bio_meta;
-	int st = -EINVAL;
+	blk_status_t st = BLK_STS_IOERR;
 
 	blk_queue_split(q, &bio, q->bio_split);
 
@@ -161,15 +161,11 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
 	if (bio_end_sector(bio) > get_capacity(card->gendisk))
 		goto req_err;
 
-	if (unlikely(card->halt)) {
-		st = -EFAULT;
+	if (unlikely(card->halt))
 		goto req_err;
-	}
 
-	if (unlikely(card->dma_fault)) {
-		st = (-EFAULT);
+	if (unlikely(card->dma_fault))
 		goto req_err;
-	}
 
 	if (bio->bi_iter.bi_size == 0) {
 		dev_err(CARD_TO_DEV(card), "size zero BIO!\n");
@@ -178,7 +174,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio)
 
 	bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL);
 	if (!bio_meta) {
-		st = -ENOMEM;
+		st = BLK_STS_RESOURCE;
 		goto req_err;
 	}
 
@@ -205,7 +201,7 @@ queue_err:
 	kmem_cache_free(bio_meta_pool, bio_meta);
 req_err:
 	if (st)
-		bio->bi_error = st;
+		bio->bi_status = st;
 	bio_endio(bio);
 	return BLK_QC_T_NONE;
 }
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 5a20385f87d0..6a1b2177951c 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -611,7 +611,7 @@ static void rsxx_schedule_done(struct work_struct *work)
 	mutex_unlock(&ctrl->work_lock);
 }
 
-static int rsxx_queue_discard(struct rsxx_cardinfo *card,
+static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card,
 				  struct list_head *q,
 				  unsigned int laddr,
 				  rsxx_dma_cb cb,
@@ -621,7 +621,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card,
 
 	dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
 	if (!dma)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	dma->cmd          = HW_CMD_BLK_DISCARD;
 	dma->laddr        = laddr;
@@ -640,7 +640,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card,
 	return 0;
 }
 
-static int rsxx_queue_dma(struct rsxx_cardinfo *card,
+static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card,
 			      struct list_head *q,
 			      int dir,
 			      unsigned int dma_off,
@@ -655,7 +655,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 
 	dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
 	if (!dma)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	dma->cmd          = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
 	dma->laddr        = laddr;
@@ -677,7 +677,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card,
 	return 0;
 }
 
-int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 			   struct bio *bio,
 			   atomic_t *n_dmas,
 			   rsxx_dma_cb cb,
@@ -694,7 +694,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 	unsigned int dma_len;
 	int dma_cnt[RSXX_MAX_TARGETS];
 	int tgt;
-	int st;
+	blk_status_t st;
 	int i;
 
 	addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */
@@ -769,7 +769,6 @@ bvec_err:
 	for (i = 0; i < card->n_targets; i++)
 		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
 					FREE_DMA);
-
 	return st;
 }
 
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 6bbc64d0f690..277f27e673a2 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -391,7 +391,7 @@ int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
 void rsxx_dma_cleanup(void);
 void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
 int rsxx_dma_configure(struct rsxx_cardinfo *card);
-int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 			   struct bio *bio,
 			   atomic_t *n_dmas,
 			   rsxx_dma_cb cb,
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index c141cc3be22b..4b3c947697b1 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -454,7 +454,7 @@ static void process_page(unsigned long data)
 				PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
 		if (control & DMASCR_HARD_ERROR) {
 			/* error */
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			dev_printk(KERN_WARNING, &card->dev->dev,
 				"I/O error on sector %d/%d\n",
 				le32_to_cpu(desc->local_addr)>>9,
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 726c32e35db9..746bd8c8c09a 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -1069,20 +1069,17 @@ static void xen_blk_drain_io(struct xen_blkif_ring *ring)
 	atomic_set(&blkif->drain, 0);
 }
 
-/*
- * Completion callback on the bio's. Called as bh->b_end_io()
- */
-
-static void __end_block_io_op(struct pending_req *pending_req, int error)
+static void __end_block_io_op(struct pending_req *pending_req,
+		blk_status_t error)
 {
 	/* An error fails the entire request. */
-	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
-	    (error == -EOPNOTSUPP)) {
+	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
+	    error == BLK_STS_NOTSUPP) {
 		pr_debug("flush diskcache op failed, not supported\n");
 		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
 		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
-	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
-		    (error == -EOPNOTSUPP)) {
+	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
+		   error == BLK_STS_NOTSUPP) {
 		pr_debug("write barrier op failed, not supported\n");
 		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
 		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
@@ -1106,7 +1103,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
  */
 static void end_block_io_op(struct bio *bio)
 {
-	__end_block_io_op(bio->bi_private, bio->bi_error);
+	__end_block_io_op(bio->bi_private, bio->bi_status);
 	bio_put(bio);
 }
 
@@ -1423,7 +1420,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 	for (i = 0; i < nbio; i++)
 		bio_put(biolist[i]);
 	atomic_set(&pending_req->pendcnt, 1);
-	__end_block_io_op(pending_req, -EINVAL);
+	__end_block_io_op(pending_req, BLK_STS_RESOURCE);
 	msleep(1); /* back off a bit */
 	return -EIO;
 }
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2f468cf86dcf..e3be666c2776 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2006,7 +2006,7 @@ static void split_bio_end(struct bio *bio)
 
 	if (atomic_dec_and_test(&split_bio->pending)) {
 		split_bio->bio->bi_phys_segments = 0;
-		split_bio->bio->bi_error = bio->bi_error;
+		split_bio->bio->bi_status = bio->bi_status;
 		bio_endio(split_bio->bio);
 		kfree(split_bio);
 	}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 5e44768ccffa..4e0de995cd90 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -296,8 +296,8 @@ void pblk_flush_writer(struct pblk *pblk)
 		pr_err("pblk: tear down bio failed\n");
 	}
 
-	if (bio->bi_error)
-		pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error);
+	if (bio->bi_status)
+		pr_err("pblk: flush sync write failed (%u)\n", bio->bi_status);
 
 	bio_put(bio);
 }
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 4a12f14d78c6..762c0b73cb67 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -114,7 +114,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
 		pblk_log_read_err(pblk, rqd);
 #ifdef CONFIG_NVM_DEBUG
 	else
-		WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
+		WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n");
 #endif
 
 	if (rqd->nr_ppas > 1)
@@ -123,7 +123,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
 	bio_put(bio);
 	if (r_ctx->orig_bio) {
 #ifdef CONFIG_NVM_DEBUG
-		WARN_ONCE(r_ctx->orig_bio->bi_error,
+		WARN_ONCE(r_ctx->orig_bio->bi_status,
 						"pblk: corrupted read bio\n");
 #endif
 		bio_endio(r_ctx->orig_bio);
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index aef6fd7c4a0c..79b90d8dbcb3 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -186,7 +186,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
 	}
 #ifdef CONFIG_NVM_DEBUG
 	else
-		WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
+		WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n");
 #endif
 
 	pblk_complete_write(pblk, rqd, c_ctx);
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index cf0e28a0ff61..8d3b53bb3307 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -279,8 +279,8 @@ static void rrpc_end_sync_bio(struct bio *bio)
 {
 	struct completion *waiting = bio->bi_private;
 
-	if (bio->bi_error)
-		pr_err("nvm: gc request failed (%u).\n", bio->bi_error);
+	if (bio->bi_status)
+		pr_err("nvm: gc request failed (%u).\n", bio->bi_status);
 
 	complete(waiting);
 }
@@ -359,7 +359,7 @@ try:
 			goto finished;
 		}
 		wait_for_completion_io(&wait);
-		if (bio->bi_error) {
+		if (bio->bi_status) {
 			rrpc_inflight_laddr_release(rrpc, rqd);
 			goto finished;
 		}
@@ -385,7 +385,7 @@ try:
 		wait_for_completion_io(&wait);
 
 		rrpc_inflight_laddr_release(rrpc, rqd);
-		if (bio->bi_error)
+		if (bio->bi_status)
 			goto finished;
 
 		bio_reset(bio);
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index c3ea03c9a1a8..dee542fff68e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c)
 
 /* Forward declarations */
 
-void bch_count_io_errors(struct cache *, int, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
-			      int, const char *);
-void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *);
+			      blk_status_t, const char *);
+void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
+		const char *);
 void bch_bbio_free(struct bio *, struct cache_set *);
 struct bio *bch_bbio_alloc(struct cache_set *);
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 450d0e848ae4..866dcf78ff8e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -307,7 +307,7 @@ static void bch_btree_node_read(struct btree *b)
 	bch_submit_bbio(bio, b->c, &b->key, 0);
 	closure_sync(&cl);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		set_btree_node_io_error(b);
 
 	bch_bbio_free(bio, b->c);
@@ -374,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct btree *b = container_of(cl, struct btree, io);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		set_btree_node_io_error(b);
 
-	bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree");
+	bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
 	closure_put(cl);
 }
 
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index db45a88c0ce9..6a9b85095e7b 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
 
 /* IO errors */
 
-void bch_count_io_errors(struct cache *ca, int error, const char *m)
+void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
 {
 	/*
 	 * The halflife of an error is:
@@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m)
 }
 
 void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
-			      int error, const char *m)
+			      blk_status_t error, const char *m)
 {
 	struct bbio *b = container_of(bio, struct bbio, bio);
 	struct cache *ca = PTR_CACHE(c, &b->key, 0);
@@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
 }
 
 void bch_bbio_endio(struct cache_set *c, struct bio *bio,
-		    int error, const char *m)
+		    blk_status_t error, const char *m)
 {
 	struct closure *cl = bio->bi_private;
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 1198e53d5670..0352d05e495c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio)
 {
 	struct journal_write *w = bio->bi_private;
 
-	cache_set_err_on(bio->bi_error, w->c, "journal io error");
+	cache_set_err_on(bio->bi_status, w->c, "journal io error");
 	closure_put(&w->c->journal.io);
 }
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 13b8a907006d..f633b30c962e 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio)
 	struct moving_io *io = container_of(bio->bi_private,
 					    struct moving_io, cl);
 
-	if (bio->bi_error)
-		io->op.error = bio->bi_error;
+	if (bio->bi_status)
+		io->op.status = bio->bi_status;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(io->op.c, &b->key, 0)) {
-		io->op.error = -EINTR;
+		io->op.status = BLK_STS_IOERR;
 	}
 
-	bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move");
+	bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move");
 }
 
 static void moving_init(struct moving_io *io)
@@ -92,7 +92,7 @@ static void write_moving(struct closure *cl)
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 	struct data_insert_op *op = &io->op;
 
-	if (!op->error) {
+	if (!op->status) {
 		moving_init(io);
 
 		io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 709c9cc34369..019b3df9f1c6 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl)
 	if (ret == -ESRCH) {
 		op->replace_collision = true;
 	} else if (ret) {
-		op->error		= -ENOMEM;
+		op->status		= BLK_STS_RESOURCE;
 		op->insert_data_done	= true;
 	}
 
@@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		/* TODO: We could try to recover from this. */
 		if (op->writeback)
-			op->error = bio->bi_error;
+			op->status = bio->bi_status;
 		else if (!op->replace)
 			set_closure_fn(cl, bch_data_insert_error, op->wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
 
-	bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache");
+	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
 }
 
 static void bch_data_insert_start(struct closure *cl)
@@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio)
 	 * from the backing device.
 	 */
 
-	if (bio->bi_error)
-		s->iop.error = bio->bi_error;
+	if (bio->bi_status)
+		s->iop.status = bio->bi_status;
 	else if (!KEY_DIRTY(&b->key) &&
 		 ptr_stale(s->iop.c, &b->key, 0)) {
 		atomic_long_inc(&s->iop.c->cache_read_races);
-		s->iop.error = -EINTR;
+		s->iop.status = BLK_STS_IOERR;
 	}
 
-	bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache");
+	bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
 }
 
 /*
@@ -593,9 +593,9 @@ static void request_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		struct search *s = container_of(cl, struct search, cl);
-		s->iop.error = bio->bi_error;
+		s->iop.status = bio->bi_status;
 		/* Only cache read errors are recoverable */
 		s->recoverable = false;
 	}
@@ -611,7 +611,7 @@ static void bio_complete(struct search *s)
 				    &s->d->disk->part0, s->start_time);
 
 		trace_bcache_request_end(s->d, s->orig_bio);
-		s->orig_bio->bi_error = s->iop.error;
+		s->orig_bio->bi_status = s->iop.status;
 		bio_endio(s->orig_bio);
 		s->orig_bio = NULL;
 	}
@@ -664,7 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.inode		= d->id;
 	s->iop.write_point	= hash_long((unsigned long) current, 16);
 	s->iop.write_prio	= 0;
-	s->iop.error		= 0;
+	s->iop.status		= 0;
 	s->iop.flags		= 0;
 	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
 	s->iop.wq		= bcache_wq;
@@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl)
 		/* Retry from the backing device: */
 		trace_bcache_read_retry(s->orig_bio);
 
-		s->iop.error = 0;
+		s->iop.status = 0;
 		do_bio_hook(s, s->orig_bio);
 
 		/* XXX: invalidate cache */
@@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
 				  !s->cache_miss, s->iop.bypass);
 	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
 
-	if (s->iop.error)
+	if (s->iop.status)
 		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
 	else if (s->iop.bio || verify(dc, &s->bio.bio))
 		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 1ff36875c2b3..7689176951ce 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -10,7 +10,7 @@ struct data_insert_op {
 	unsigned		inode;
 	uint16_t		write_point;
 	uint16_t		write_prio;
-	short			error;
+	blk_status_t		status;
 
 	union {
 		uint16_t	flags;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e57353e39168..fbc4f5412dec 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
+	bch_count_io_errors(ca, bio->bi_status, "writing superblock");
 	closure_put(&ca->set->sb_write);
 }
 
@@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio)
 	struct closure *cl = bio->bi_private;
 	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 
-	cache_set_err_on(bio->bi_error, c, "accessing uuids");
+	cache_set_err_on(bio->bi_status, c, "accessing uuids");
 	bch_bbio_free(bio, c);
 	closure_put(cl);
 }
@@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio)
 {
 	struct cache *ca = bio->bi_private;
 
-	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
+	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
 	bch_bbio_free(bio, ca->set);
 	closure_put(&ca->prio);
 }
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 6ac2e48b9235..42c66e76f05e 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -167,7 +167,7 @@ static void dirty_endio(struct bio *bio)
 	struct keybuf_key *w = bio->bi_private;
 	struct dirty_io *io = w->private;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		SET_KEY_DIRTY(&w->key, false);
 
 	closure_put(&io->cl);
@@ -195,7 +195,7 @@ static void read_dirty_endio(struct bio *bio)
 	struct dirty_io *io = w->private;
 
 	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
-			    bio->bi_error, "reading dirty data from cache");
+			    bio->bi_status, "reading dirty data from cache");
 
 	dirty_endio(bio);
 }
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index ae7da2c30a57..82d27384d31f 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -229,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
 void dm_cell_error(struct dm_bio_prison *prison,
-		   struct dm_bio_prison_cell *cell, int error)
+		   struct dm_bio_prison_cell *cell, blk_status_t error)
 {
 	struct bio_list bios;
 	struct bio *bio;
@@ -238,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison,
 	dm_cell_release(prison, cell, &bios);
 
 	while ((bio = bio_list_pop(&bios))) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	}
 }
diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h
index cddd4ac07e2c..cec52ac5e1ae 100644
--- a/drivers/md/dm-bio-prison-v1.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
 			       struct dm_bio_prison_cell *cell,
 			       struct bio_list *inmates);
 void dm_cell_error(struct dm_bio_prison *prison,
-		   struct dm_bio_prison_cell *cell, int error);
+		   struct dm_bio_prison_cell *cell, blk_status_t error);
 
 /*
  * Visits the cell and then releases.  Guarantees no new inmates are
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index cd8139593ccd..0902d2fd1743 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -145,8 +145,8 @@ struct dm_buffer {
 	enum data_mode data_mode;
 	unsigned char list_mode;		/* LIST_* */
 	unsigned hold_count;
-	int read_error;
-	int write_error;
+	blk_status_t read_error;
+	blk_status_t write_error;
 	unsigned long state;
 	unsigned long last_accessed;
 	struct dm_bufio_client *c;
@@ -555,7 +555,7 @@ static void dmio_complete(unsigned long error, void *context)
 {
 	struct dm_buffer *b = context;
 
-	b->bio.bi_error = error ? -EIO : 0;
+	b->bio.bi_status = error ? BLK_STS_IOERR : 0;
 	b->bio.bi_end_io(&b->bio);
 }
 
@@ -588,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 
 	r = dm_io(&io_req, 1, &region, NULL);
 	if (r) {
-		b->bio.bi_error = r;
+		b->bio.bi_status = errno_to_blk_status(r);
 		end_io(&b->bio);
 	}
 }
@@ -596,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 static void inline_endio(struct bio *bio)
 {
 	bio_end_io_t *end_fn = bio->bi_private;
-	int error = bio->bi_error;
+	blk_status_t status = bio->bi_status;
 
 	/*
 	 * Reset the bio to free any attached resources
@@ -604,7 +604,7 @@ static void inline_endio(struct bio *bio)
 	 */
 	bio_reset(bio);
 
-	bio->bi_error = error;
+	bio->bi_status = status;
 	end_fn(bio);
 }
 
@@ -685,11 +685,12 @@ static void write_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->write_error = bio->bi_error;
-	if (unlikely(bio->bi_error)) {
+	b->write_error = bio->bi_status;
+	if (unlikely(bio->bi_status)) {
 		struct dm_bufio_client *c = b->c;
-		int error = bio->bi_error;
-		(void)cmpxchg(&c->async_write_error, 0, error);
+
+		(void)cmpxchg(&c->async_write_error, 0,
+				blk_status_to_errno(bio->bi_status));
 	}
 
 	BUG_ON(!test_bit(B_WRITING, &b->state));
@@ -1063,7 +1064,7 @@ static void read_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->read_error = bio->bi_error;
+	b->read_error = bio->bi_status;
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
@@ -1107,7 +1108,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
 	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 
 	if (b->read_error) {
-		int error = b->read_error;
+		int error = blk_status_to_errno(b->read_error);
 
 		dm_bufio_release(b);
 
@@ -1257,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
  */
 int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
 {
-	int a, f;
+	blk_status_t a;
+	int f;
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index c48612e6d525..c5ea03fc7ee1 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -119,7 +119,7 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
  */
 struct continuation {
 	struct work_struct ws;
-	int input;
+	blk_status_t input;
 };
 
 static inline void init_continuation(struct continuation *k,
@@ -145,7 +145,7 @@ struct batcher {
 	/*
 	 * The operation that everyone is waiting for.
 	 */
-	int (*commit_op)(void *context);
+	blk_status_t (*commit_op)(void *context);
 	void *commit_context;
 
 	/*
@@ -171,8 +171,7 @@ struct batcher {
 static void __commit(struct work_struct *_ws)
 {
 	struct batcher *b = container_of(_ws, struct batcher, commit_work);
-
-	int r;
+	blk_status_t r;
 	unsigned long flags;
 	struct list_head work_items;
 	struct work_struct *ws, *tmp;
@@ -205,7 +204,7 @@ static void __commit(struct work_struct *_ws)
 
 	while ((bio = bio_list_pop(&bios))) {
 		if (r) {
-			bio->bi_error = r;
+			bio->bi_status = r;
 			bio_endio(bio);
 		} else
 			b->issue_op(bio, b->issue_context);
@@ -213,7 +212,7 @@ static void __commit(struct work_struct *_ws)
 }
 
 static void batcher_init(struct batcher *b,
-			 int (*commit_op)(void *),
+			 blk_status_t (*commit_op)(void *),
 			 void *commit_context,
 			 void (*issue_op)(struct bio *bio, void *),
 			 void *issue_context,
@@ -955,7 +954,7 @@ static void writethrough_endio(struct bio *bio)
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		bio_endio(bio);
 		return;
 	}
@@ -1220,7 +1219,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
 
 	if (read_err || write_err)
-		mg->k.input = -EIO;
+		mg->k.input = BLK_STS_IOERR;
 
 	queue_continuation(mg->cache->wq, &mg->k);
 }
@@ -1266,8 +1265,8 @@ static void overwrite_endio(struct bio *bio)
 
 	dm_unhook_bio(&pb->hook_info, bio);
 
-	if (bio->bi_error)
-		mg->k.input = bio->bi_error;
+	if (bio->bi_status)
+		mg->k.input = bio->bi_status;
 
 	queue_continuation(mg->cache->wq, &mg->k);
 }
@@ -1323,8 +1322,10 @@ static void mg_complete(struct dm_cache_migration *mg, bool success)
 		if (mg->overwrite_bio) {
 			if (success)
 				force_set_dirty(cache, cblock);
+			else if (mg->k.input)
+				mg->overwrite_bio->bi_status = mg->k.input;
 			else
-				mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO);
+				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
 			bio_endio(mg->overwrite_bio);
 		} else {
 			if (success)
@@ -1504,7 +1505,7 @@ static void mg_copy(struct work_struct *ws)
 		r = copy(mg, is_policy_promote);
 		if (r) {
 			DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache));
-			mg->k.input = -EIO;
+			mg->k.input = BLK_STS_IOERR;
 			mg_complete(mg, false);
 		}
 	}
@@ -1907,12 +1908,12 @@ static int commit(struct cache *cache, bool clean_shutdown)
 /*
  * Used by the batcher.
  */
-static int commit_op(void *context)
+static blk_status_t commit_op(void *context)
 {
 	struct cache *cache = context;
 
 	if (dm_cache_changed_this_transaction(cache->cmd))
-		return commit(cache, false);
+		return errno_to_blk_status(commit(cache, false));
 
 	return 0;
 }
@@ -2018,7 +2019,7 @@ static void requeue_deferred_bios(struct cache *cache)
 	bio_list_init(&cache->deferred_bios);
 
 	while ((bio = bio_list_pop(&bios))) {
-		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio->bi_status = BLK_STS_DM_REQUEUE;
 		bio_endio(bio);
 	}
 }
@@ -2820,7 +2821,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	return r;
 }
 
-static int cache_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int cache_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct cache *cache = ti->private;
 	unsigned long flags;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f4b51809db21..586cef085c6a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -71,7 +71,7 @@ struct dm_crypt_io {
 	struct convert_context ctx;
 
 	atomic_t io_pending;
-	int error;
+	blk_status_t error;
 	sector_t sector;
 
 	struct rb_node rb_node;
@@ -1292,7 +1292,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
 /*
  * Encrypt / decrypt data from one bio to another one (can be the same one)
  */
-static int crypt_convert(struct crypt_config *cc,
+static blk_status_t crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
 	unsigned int tag_offset = 0;
@@ -1343,13 +1343,13 @@ static int crypt_convert(struct crypt_config *cc,
 		 */
 		case -EBADMSG:
 			atomic_dec(&ctx->cc_pending);
-			return -EILSEQ;
+			return BLK_STS_PROTECTION;
 		/*
 		 * There was an error while processing the request.
 		 */
 		default:
 			atomic_dec(&ctx->cc_pending);
-			return -EIO;
+			return BLK_STS_IOERR;
 		}
 	}
 
@@ -1463,7 +1463,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
 	struct bio *base_bio = io->base_bio;
-	int error = io->error;
+	blk_status_t error = io->error;
 
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
@@ -1476,7 +1476,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	else
 		kfree(io->integrity_metadata);
 
-	base_bio->bi_error = error;
+	base_bio->bi_status = error;
 	bio_endio(base_bio);
 }
 
@@ -1502,7 +1502,7 @@ static void crypt_endio(struct bio *clone)
 	struct dm_crypt_io *io = clone->bi_private;
 	struct crypt_config *cc = io->cc;
 	unsigned rw = bio_data_dir(clone);
-	int error;
+	blk_status_t error;
 
 	/*
 	 * free the processed pages
@@ -1510,7 +1510,7 @@ static void crypt_endio(struct bio *clone)
 	if (rw == WRITE)
 		crypt_free_buffer_pages(cc, clone);
 
-	error = clone->bi_error;
+	error = clone->bi_status;
 	bio_put(clone);
 
 	if (rw == READ && !error) {
@@ -1570,7 +1570,7 @@ static void kcryptd_io_read_work(struct work_struct *work)
 
 	crypt_inc_pending(io);
 	if (kcryptd_io_read(io, GFP_NOIO))
-		io->error = -ENOMEM;
+		io->error = BLK_STS_RESOURCE;
 	crypt_dec_pending(io);
 }
 
@@ -1656,7 +1656,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
 	sector_t sector;
 	struct rb_node **rbp, *parent;
 
-	if (unlikely(io->error < 0)) {
+	if (unlikely(io->error)) {
 		crypt_free_buffer_pages(cc, clone);
 		bio_put(clone);
 		crypt_dec_pending(io);
@@ -1697,7 +1697,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 	struct bio *clone;
 	int crypt_finished;
 	sector_t sector = io->sector;
-	int r;
+	blk_status_t r;
 
 	/*
 	 * Prevent io from disappearing until this function completes.
@@ -1707,7 +1707,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 
 	clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
 	if (unlikely(!clone)) {
-		io->error = -EIO;
+		io->error = BLK_STS_IOERR;
 		goto dec;
 	}
 
@@ -1718,7 +1718,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
 
 	crypt_inc_pending(io);
 	r = crypt_convert(cc, &io->ctx);
-	if (r < 0)
+	if (r)
 		io->error = r;
 	crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending);
 
@@ -1740,7 +1740,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
 static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 {
 	struct crypt_config *cc = io->cc;
-	int r = 0;
+	blk_status_t r;
 
 	crypt_inc_pending(io);
 
@@ -1748,7 +1748,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
 			   io->sector);
 
 	r = crypt_convert(cc, &io->ctx);
-	if (r < 0)
+	if (r)
 		io->error = r;
 
 	if (atomic_dec_and_test(&io->ctx.cc_pending))
@@ -1781,9 +1781,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	if (error == -EBADMSG) {
 		DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
 			    (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
-		io->error = -EILSEQ;
+		io->error = BLK_STS_PROTECTION;
 	} else if (error < 0)
-		io->error = -EIO;
+		io->error = BLK_STS_IOERR;
 
 	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index c9539917a59b..3d04d5ce19d9 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -358,7 +358,8 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
-static int flakey_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct flakey_c *fc = ti->private;
 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
@@ -377,7 +378,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int *error)
 			 * Error read during the down_interval if drop_writes
 			 * and error_writes were not configured.
 			 */
-			*error = -EIO;
+			*error = BLK_STS_IOERR;
 		}
 	}
 
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index ee78fb471229..ccc6ef4d00b9 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -246,7 +246,7 @@ struct dm_integrity_io {
 	unsigned metadata_offset;
 
 	atomic_t in_flight;
-	int bi_error;
+	blk_status_t bi_status;
 
 	struct completion *completion;
 
@@ -1114,8 +1114,8 @@ static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *
 static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
 {
 	int r = dm_integrity_failed(ic);
-	if (unlikely(r) && !bio->bi_error)
-		bio->bi_error = r;
+	if (unlikely(r) && !bio->bi_status)
+		bio->bi_status = errno_to_blk_status(r);
 	bio_endio(bio);
 }
 
@@ -1123,7 +1123,7 @@ static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *di
 {
 	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 
-	if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic)))
+	if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic)))
 		submit_flush_bio(ic, dio);
 	else
 		do_endio(ic, bio);
@@ -1142,9 +1142,9 @@ static void dec_in_flight(struct dm_integrity_io *dio)
 
 		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 
-		if (unlikely(dio->bi_error) && !bio->bi_error)
-			bio->bi_error = dio->bi_error;
-		if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
+		if (unlikely(dio->bi_status) && !bio->bi_status)
+			bio->bi_status = dio->bi_status;
+		if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) {
 			dio->range.logical_sector += dio->range.n_sectors;
 			bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
 			INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1318,7 +1318,7 @@ skip_io:
 	dec_in_flight(dio);
 	return;
 error:
-	dio->bi_error = r;
+	dio->bi_status = errno_to_blk_status(r);
 	dec_in_flight(dio);
 }
 
@@ -1331,7 +1331,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	sector_t area, offset;
 
 	dio->ic = ic;
-	dio->bi_error = 0;
+	dio->bi_status = 0;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		submit_flush_bio(ic, dio);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3702e502466d..c8f8f3004085 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -124,7 +124,7 @@ static void complete_io(struct io *io)
 	fn(error_bits, context);
 }
 
-static void dec_count(struct io *io, unsigned int region, int error)
+static void dec_count(struct io *io, unsigned int region, blk_status_t error)
 {
 	if (error)
 		set_bit(region, &io->error_bits);
@@ -137,9 +137,9 @@ static void endio(struct bio *bio)
 {
 	struct io *io;
 	unsigned region;
-	int error;
+	blk_status_t error;
 
-	if (bio->bi_error && bio_data_dir(bio) == READ)
+	if (bio->bi_status && bio_data_dir(bio) == READ)
 		zero_fill_bio(bio);
 
 	/*
@@ -147,7 +147,7 @@ static void endio(struct bio *bio)
 	 */
 	retrieve_io_and_region_from_bio(bio, &io, &region);
 
-	error = bio->bi_error;
+	error = bio->bi_status;
 	bio_put(bio);
 
 	dec_count(io, region, error);
@@ -319,7 +319,7 @@ static void do_region(int op, int op_flags, unsigned region,
 	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
 	     op == REQ_OP_WRITE_SAME)  &&
 	    special_cmd_max_sectors == 0) {
-		dec_count(io, region, -EOPNOTSUPP);
+		dec_count(io, region, BLK_STS_NOTSUPP);
 		return;
 	}
 
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index cc57c7fa1268..a1da0eb58a93 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio)
 {
 	struct log_writes_c *lc = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		unsigned long flags;
 
-		DMERR("Error writing log block, error=%d", bio->bi_error);
+		DMERR("Error writing log block, error=%d", bio->bi_status);
 		spin_lock_irqsave(&lc->blocks_lock, flags);
 		lc->logging_enabled = false;
 		spin_unlock_irqrestore(&lc->blocks_lock, flags);
@@ -664,7 +664,8 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
-static int normal_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int normal_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct log_writes_c *lc = ti->private;
 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 39262e344ae1..a7d2e0840cc5 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -565,7 +565,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
-	bio->bi_error = 0;
+	bio->bi_status = 0;
 	bio->bi_bdev = pgpath->path.dev->bdev;
 	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
 
@@ -623,10 +623,12 @@ static void process_queued_bios(struct work_struct *work)
 		r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio));
 		switch (r) {
 		case DM_MAPIO_KILL:
-			r = -EIO;
-			/*FALLTHRU*/
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(bio);
+			/* bio is completed: do not fall into the requeue case */
+			break;
 		case DM_MAPIO_REQUEUE:
-			bio->bi_error = r;
+			bio->bi_status = BLK_STS_DM_REQUEUE;
 			bio_endio(bio);
 			break;
 		case DM_MAPIO_REMAPPED:
@@ -1510,7 +1510,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
 	return r;
 }
 
-static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *error)
+static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
+		blk_status_t *error)
 {
 	struct multipath *m = ti->private;
 	struct dm_mpath_io *mpio = get_mpio_from_bio(clone);
@@ -1518,7 +1519,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *er
 	unsigned long flags;
 	int r = DM_ENDIO_DONE;
 
-	if (!*error || noretry_error(errno_to_blk_status(*error)))
+	if (!*error || noretry_error(*error))
 		goto done;
 
 	if (pgpath)
@@ -1527,7 +1528,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *er
 	if (atomic_read(&m->nr_valid_paths) == 0 &&
 	    !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
 		dm_report_EIO(m);
-		*error = -EIO;
+		*error = BLK_STS_IOERR;
 		goto done;
 	}
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 77bcf50ce75f..0822e4a6f67d 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -490,9 +490,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio)
 		 * If device is suspended, complete the bio.
 		 */
 		if (dm_noflush_suspending(ms->ti))
-			bio->bi_error = DM_ENDIO_REQUEUE;
+			bio->bi_status = BLK_STS_DM_REQUEUE;
 		else
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 
 		bio_endio(bio);
 		return;
@@ -626,7 +626,7 @@ static void write_callback(unsigned long error, void *context)
 	 * degrade the array.
 	 */
 	if (bio_op(bio) == REQ_OP_DISCARD) {
-		bio->bi_error = -EOPNOTSUPP;
+		bio->bi_status = BLK_STS_NOTSUPP;
 		bio_endio(bio);
 		return;
 	}
@@ -1236,7 +1236,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
-static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int mirror_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	int rw = bio_data_dir(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1255,7 +1256,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error)
 		return DM_ENDIO_DONE;
 	}
 
-	if (*error == -EOPNOTSUPP)
+	if (*error == BLK_STS_NOTSUPP)
 		return DM_ENDIO_DONE;
 
 	if (bio->bi_opf & REQ_RAHEAD)
@@ -1277,7 +1278,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error)
 			bd = &bio_record->details;
 
 			dm_bio_restore(bd, bio);
-			bio->bi_error = 0;
+			bio->bi_status = 0;
 
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 63402f8a38de..fafd5326e572 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -119,7 +119,7 @@ static void end_clone_bio(struct bio *clone)
 	struct dm_rq_target_io *tio = info->tio;
 	struct bio *bio = info->orig;
 	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-	blk_status_t error = errno_to_blk_status(clone->bi_error);
+	blk_status_t error = clone->bi_status;
 
 	bio_put(clone);
 
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 79a845798e2f..1ba41048b438 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio)
 {
 	void *callback_data = bio->bi_private;
 
-	dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0);
+	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
 }
 
 static void start_full_bio(struct dm_snap_pending_exception *pe,
@@ -1851,7 +1851,8 @@ out_unlock:
 	return r;
 }
 
-static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	struct dm_snapshot *s = ti->private;
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 49888bc2c909..11621a0af887 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -375,7 +375,8 @@ static void stripe_status(struct dm_target *ti, status_type_t type,
 	}
 }
 
-static int stripe_end_io(struct dm_target *ti, struct bio *bio, int *error)
+static int stripe_end_io(struct dm_target *ti, struct bio *bio,
+		blk_status_t *error)
 {
 	unsigned i;
 	char major_minor[16];
@@ -387,7 +388,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int *error)
 	if (bio->bi_opf & REQ_RAHEAD)
 		return DM_ENDIO_DONE;
 
-	if (*error == -EOPNOTSUPP)
+	if (*error == BLK_STS_NOTSUPP)
 		return DM_ENDIO_DONE;
 
 	memset(major_minor, 0, sizeof(major_minor));
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 22b1a64c44b7..3490b300cbff 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r)
 	 * Even if r is set, there could be sub discards in flight that we
 	 * need to wait for.
 	 */
-	if (r && !op->parent_bio->bi_error)
-		op->parent_bio->bi_error = r;
+	if (r && !op->parent_bio->bi_status)
+		op->parent_bio->bi_status = errno_to_blk_status(r);
 	bio_endio(op->parent_bio);
 }
 
@@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool,
 }
 
 static void cell_error_with_code(struct pool *pool,
-				 struct dm_bio_prison_cell *cell, int error_code)
+		struct dm_bio_prison_cell *cell, blk_status_t error_code)
 {
 	dm_cell_error(pool->prison, cell, error_code);
 	dm_bio_prison_free_cell(pool->prison, cell);
 }
 
-static int get_pool_io_error_code(struct pool *pool)
+static blk_status_t get_pool_io_error_code(struct pool *pool)
 {
-	return pool->out_of_data_space ? -ENOSPC : -EIO;
+	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
 }
 
 static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
-	int error = get_pool_io_error_code(pool);
-
-	cell_error_with_code(pool, cell, error);
+	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
 }
 
 static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
@@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
 
 static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
 {
-	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
+	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
 }
 
 /*----------------------------------------------------------------*/
@@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
 	bio_list_init(master);
 }
 
-static void error_bio_list(struct bio_list *bios, int error)
+static void error_bio_list(struct bio_list *bios, blk_status_t error)
 {
 	struct bio *bio;
 
 	while ((bio = bio_list_pop(bios))) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	}
 }
 
-static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
+static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
+		blk_status_t error)
 {
 	struct bio_list bios;
 	unsigned long flags;
@@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc)
 	__merge_bio_list(&bios, &tc->retry_on_resume_list);
 	spin_unlock_irqrestore(&tc->lock, flags);
 
-	error_bio_list(&bios, DM_ENDIO_REQUEUE);
+	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
 	requeue_deferred_cells(tc);
 }
 
-static void error_retry_list_with_code(struct pool *pool, int error)
+static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
 {
 	struct thin_c *tc;
 
@@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error)
 
 static void error_retry_list(struct pool *pool)
 {
-	int error = get_pool_io_error_code(pool);
-
-	error_retry_list_with_code(pool, error);
+	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
 }
 
 /*
@@ -774,7 +771,7 @@ struct dm_thin_new_mapping {
 	 */
 	atomic_t prepare_actions;
 
-	int err;
+	blk_status_t status;
 	struct thin_c *tc;
 	dm_block_t virt_begin, virt_end;
 	dm_block_t data_block;
@@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 {
 	struct dm_thin_new_mapping *m = context;
 
-	m->err = read_err || write_err ? -EIO : 0;
+	m->status = read_err || write_err ? BLK_STS_IOERR : 0;
 	complete_mapping_preparation(m);
 }
 
@@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio)
 
 	bio->bi_end_io = m->saved_bi_end_io;
 
-	m->err = bio->bi_error;
+	m->status = bio->bi_status;
 	complete_mapping_preparation(m);
 }
 
@@ -925,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	struct bio *bio = m->bio;
 	int r;
 
-	if (m->err) {
+	if (m->status) {
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&tc->lock, flags);
 }
 
-static int should_error_unserviceable_bio(struct pool *pool)
+static blk_status_t should_error_unserviceable_bio(struct pool *pool)
 {
 	enum pool_mode m = get_pool_mode(pool);
 
@@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool)
 	case PM_WRITE:
 		/* Shouldn't get here */
 		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	case PM_OUT_OF_DATA_SPACE:
-		return pool->pf.error_if_no_space ? -ENOSPC : 0;
+		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
 
 	case PM_READ_ONLY:
 	case PM_FAIL:
-		return -EIO;
+		return BLK_STS_IOERR;
 	default:
 		/* Shouldn't get here */
 		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
-		return -EIO;
+		return BLK_STS_IOERR;
 	}
 }
 
 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
 {
-	int error = should_error_unserviceable_bio(pool);
+	blk_status_t error = should_error_unserviceable_bio(pool);
 
 	if (error) {
-		bio->bi_error = error;
+		bio->bi_status = error;
 		bio_endio(bio);
 	} else
 		retry_on_resume(bio);
@@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 {
 	struct bio *bio;
 	struct bio_list bios;
-	int error;
+	blk_status_t error;
 
 	error = should_error_unserviceable_bio(pool);
 	if (error) {
@@ -2071,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	unsigned count = 0;
 
 	if (tc->requeue_mode) {
-		error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
+		error_thin_bio_list(tc, &tc->deferred_bio_list,
+				BLK_STS_DM_REQUEUE);
 		return;
 	}
 
@@ -2322,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws)
 	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
 		pool->pf.error_if_no_space = true;
 		notify_of_pool_mode_change_to_oods(pool);
-		error_retry_list_with_code(pool, -ENOSPC);
+		error_retry_list_with_code(pool, BLK_STS_NOSPC);
 	}
 }
 
@@ -2624,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 	thin_hook_bio(tc, bio);
 
 	if (tc->requeue_mode) {
-		bio->bi_error = DM_ENDIO_REQUEUE;
+		bio->bi_status = BLK_STS_DM_REQUEUE;
 		bio_endio(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -4177,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio)
 	return thin_bio_map(ti, bio);
 }
 
-static int thin_endio(struct dm_target *ti, struct bio *bio, int *err)
+static int thin_endio(struct dm_target *ti, struct bio *bio,
+		blk_status_t *err)
 {
 	unsigned long flags;
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 9ed55468b98b..2dca66eb67e1 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -538,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io)
 /*
  * End one "io" structure with a given error.
  */
-static void verity_finish_io(struct dm_verity_io *io, int error)
+static void verity_finish_io(struct dm_verity_io *io, blk_status_t status)
 {
 	struct dm_verity *v = io->v;
 	struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
 
 	bio->bi_end_io = io->orig_bi_end_io;
-	bio->bi_error = error;
+	bio->bi_status = status;
 
 	verity_fec_finish_io(io);
 
@@ -555,15 +555,15 @@ static void verity_work(struct work_struct *w)
 {
 	struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
 
-	verity_finish_io(io, verity_verify_io(io));
+	verity_finish_io(io, errno_to_blk_status(verity_verify_io(io)));
 }
 
 static void verity_end_io(struct bio *bio)
 {
 	struct dm_verity_io *io = bio->bi_private;
 
-	if (bio->bi_error && !verity_fec_is_enabled(io->v)) {
-		verity_finish_io(io, bio->bi_error);
+	if (bio->bi_status && !verity_fec_is_enabled(io->v)) {
+		verity_finish_io(io, bio->bi_status);
 		return;
 	}
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7a7047211c64..f38f9dd5cbdd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -63,7 +63,7 @@ static struct workqueue_struct *deferred_remove_workqueue;
  */
 struct dm_io {
 	struct mapped_device *md;
-	int error;
+	blk_status_t status;
 	atomic_t io_count;
 	struct bio *bio;
 	unsigned long start_time;
@@ -768,23 +768,24 @@ static int __noflush_suspending(struct mapped_device *md)
  * Decrements the number of outstanding ios that a bio has been
  * cloned into, completing the original io if necc.
  */
-static void dec_pending(struct dm_io *io, int error)
+static void dec_pending(struct dm_io *io, blk_status_t error)
 {
 	unsigned long flags;
-	int io_error;
+	blk_status_t io_error;
 	struct bio *bio;
 	struct mapped_device *md = io->md;
 
 	/* Push-back supersedes any I/O errors */
 	if (unlikely(error)) {
 		spin_lock_irqsave(&io->endio_lock, flags);
-		if (!(io->error > 0 && __noflush_suspending(md)))
-			io->error = error;
+		if (!(io->status == BLK_STS_DM_REQUEUE &&
+				__noflush_suspending(md)))
+			io->status = error;
 		spin_unlock_irqrestore(&io->endio_lock, flags);
 	}
 
 	if (atomic_dec_and_test(&io->io_count)) {
-		if (io->error == DM_ENDIO_REQUEUE) {
+		if (io->status == BLK_STS_DM_REQUEUE) {
 			/*
 			 * Target requested pushing back the I/O.
 			 */
@@ -793,16 +794,16 @@ static void dec_pending(struct dm_io *io, int error)
 				bio_list_add_head(&md->deferred, io->bio);
 			else
 				/* noflush suspend was interrupted. */
-				io->error = -EIO;
+				io->status = BLK_STS_IOERR;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
 		}
 
-		io_error = io->error;
+		io_error = io->status;
 		bio = io->bio;
 		end_io_acct(io);
 		free_io(md, io);
 
-		if (io_error == DM_ENDIO_REQUEUE)
+		if (io_error == BLK_STS_DM_REQUEUE)
 			return;
 
 		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
@@ -814,7 +815,7 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			bio->bi_error = io_error;
+			bio->bi_status = io_error;
 			bio_endio(bio);
 		}
 	}
@@ -838,14 +839,13 @@ void disable_write_zeroes(struct mapped_device *md)
 
 static void clone_endio(struct bio *bio)
 {
-	int error = bio->bi_error;
-	int r = error;
+	blk_status_t error = bio->bi_status;
 	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = tio->io->md;
 	dm_endio_fn endio = tio->ti->type->end_io;
 
-	if (unlikely(error == -EREMOTEIO)) {
+	if (unlikely(error == BLK_STS_TARGET)) {
 		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
 			disable_write_same(md);
@@ -855,10 +855,10 @@ static void clone_endio(struct bio *bio)
 	}
 
 	if (endio) {
-		r = endio(tio->ti, bio, &error);
+		int r = endio(tio->ti, bio, &error);
 		switch (r) {
 		case DM_ENDIO_REQUEUE:
-			error = DM_ENDIO_REQUEUE;
+			error = BLK_STS_DM_REQUEUE;
 			/*FALLTHRU*/
 		case DM_ENDIO_DONE:
 			break;
@@ -1094,11 +1094,11 @@ static void __map_bio(struct dm_target_io *tio)
 		generic_make_request(clone);
 		break;
 	case DM_MAPIO_KILL:
-		r = -EIO;
-		/*FALLTHRU*/
+		dec_pending(tio->io, BLK_STS_IOERR);
+		free_tio(tio);
+		break;
 	case DM_MAPIO_REQUEUE:
-		/* error the io and bail out, or requeue it if needed */
-		dec_pending(tio->io, r);
+		dec_pending(tio->io, BLK_STS_DM_REQUEUE);
 		free_tio(tio);
 		break;
 	default:
@@ -1366,7 +1366,7 @@ static void __split_and_process_bio(struct mapped_device *md,
 	ci.map = map;
 	ci.md = md;
 	ci.io = alloc_io(md);
-	ci.io->error = 0;
+	ci.io->status = 0;
 	atomic_set(&ci.io->io_count, 1);
 	ci.io->bio = bio;
 	ci.io->md = md;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 10367ffe92e3..6452e83fd650 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -273,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	}
 	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
 		if (bio_sectors(bio) != 0)
-			bio->bi_error = -EROFS;
+			bio->bi_status = BLK_STS_IOERR;
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
@@ -719,8 +719,8 @@ static void super_written(struct bio *bio)
 	struct md_rdev *rdev = bio->bi_private;
 	struct mddev *mddev = rdev->mddev;
 
-	if (bio->bi_error) {
-		pr_err("md: super_written gets error=%d\n", bio->bi_error);
+	if (bio->bi_status) {
+		pr_err("md: super_written gets error=%d\n", bio->bi_status);
 		md_error(mddev, rdev);
 		if (!test_bit(Faulty, &rdev->flags)
 		    && (bio->bi_opf & MD_FAILFAST)) {
@@ -801,7 +801,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 
 	submit_bio_wait(bio);
 
-	ret = !bio->bi_error;
+	ret = !bio->bi_status;
 	bio_put(bio);
 	return ret;
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e95d521d93e9..68d036e64041 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
-static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
+static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
 {
 	struct bio *bio = mp_bh->master_bio;
 	struct mpconf *conf = mp_bh->mddev->private;
 
-	bio->bi_error = err;
+	bio->bi_status = status;
 	bio_endio(bio);
 	mempool_free(mp_bh, conf->pool);
 }
@@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio)
 	struct mpconf *conf = mp_bh->mddev->private;
 	struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		multipath_end_bh_io(mp_bh, 0);
 	else if (!(bio->bi_opf & REQ_RAHEAD)) {
 		/*
@@ -102,7 +102,7 @@ static void multipath_end_request(struct bio *bio)
 			(unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
 	} else
-		multipath_end_bh_io(mp_bh, bio->bi_error);
+		multipath_end_bh_io(mp_bh, bio->bi_status);
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
@@ -347,7 +347,7 @@ static void multipathd(struct md_thread *thread)
 			pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
 			       bdevname(bio->bi_bdev,b),
 			       (unsigned long long)bio->bi_iter.bi_sector);
-			multipath_end_bh_io(mp_bh, -EIO);
+			multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
 		} else {
 			pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
 			       bdevname(bio->bi_bdev,b),
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af5056d56878..94b87c4d0f7b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -277,7 +277,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	struct r1conf *conf = r1_bio->mddev->private;
 
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 
 	bio_endio(bio);
 	/*
@@ -335,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
 
 static void raid1_end_read_request(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r1bio *r1_bio = bio->bi_private;
 	struct r1conf *conf = r1_bio->mddev->private;
 	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
@@ -426,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio)
 	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
 	bool discard_error;
 
-	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 
 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error && !discard_error) {
+	if (bio->bi_status && !discard_error) {
 		set_bit(WriteErrorSeen,	&rdev->flags);
 		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
@@ -802,7 +802,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio)
 		bio->bi_next = NULL;
 		bio->bi_bdev = rdev->bdev;
 		if (test_bit(Faulty, &rdev->flags)) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
 				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1856,7 +1856,7 @@ static void end_sync_read(struct bio *bio)
 	 * or re-read if the read failed.
 	 * We don't do much here, just schedule handling by raid1d
 	 */
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 
 	if (atomic_dec_and_test(&r1_bio->remaining))
@@ -1865,7 +1865,7 @@ static void end_sync_read(struct bio *bio)
 
 static void end_sync_write(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r1bio *r1_bio = get_resync_r1bio(bio);
 	struct mddev *mddev = r1_bio->mddev;
 	struct r1conf *conf = mddev->private;
@@ -2058,7 +2058,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 		idx ++;
 	}
 	set_bit(R1BIO_Uptodate, &r1_bio->state);
-	bio->bi_error = 0;
+	bio->bi_status = 0;
 	return 1;
 }
 
@@ -2082,16 +2082,16 @@ static void process_checks(struct r1bio *r1_bio)
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		int j;
 		int size;
-		int error;
+		blk_status_t status;
 		struct bio_vec *bi;
 		struct bio *b = r1_bio->bios[i];
 		struct resync_pages *rp = get_resync_pages(b);
 		if (b->bi_end_io != end_sync_read)
 			continue;
 		/* fixup the bio for reuse, but preserve errno */
-		error = b->bi_error;
+		status = b->bi_status;
 		bio_reset(b);
-		b->bi_error = error;
+		b->bi_status = status;
 		b->bi_vcnt = vcnt;
 		b->bi_iter.bi_size = r1_bio->sectors << 9;
 		b->bi_iter.bi_sector = r1_bio->sector +
@@ -2113,7 +2113,7 @@ static void process_checks(struct r1bio *r1_bio)
 	}
 	for (primary = 0; primary < conf->raid_disks * 2; primary++)
 		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-		    !r1_bio->bios[primary]->bi_error) {
+		    !r1_bio->bios[primary]->bi_status) {
 			r1_bio->bios[primary]->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
 			break;
@@ -2123,7 +2123,7 @@ static void process_checks(struct r1bio *r1_bio)
 		int j;
 		struct bio *pbio = r1_bio->bios[primary];
 		struct bio *sbio = r1_bio->bios[i];
-		int error = sbio->bi_error;
+		blk_status_t status = sbio->bi_status;
 		struct page **ppages = get_resync_pages(pbio)->pages;
 		struct page **spages = get_resync_pages(sbio)->pages;
 		struct bio_vec *bi;
@@ -2132,12 +2132,12 @@ static void process_checks(struct r1bio *r1_bio)
 		if (sbio->bi_end_io != end_sync_read)
 			continue;
 		/* Now we can 'fixup' the error value */
-		sbio->bi_error = 0;
+		sbio->bi_status = 0;
 
 		bio_for_each_segment_all(bi, sbio, j)
 			page_len[j] = bi->bv_len;
 
-		if (!error) {
+		if (!status) {
 			for (j = vcnt; j-- ; ) {
 				if (memcmp(page_address(ppages[j]),
 					   page_address(spages[j]),
@@ -2149,7 +2149,7 @@ static void process_checks(struct r1bio *r1_bio)
 		if (j >= 0)
 			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
 		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-			      && !error)) {
+			      && !status)) {
 			/* No need to write to this device. */
 			sbio->bi_end_io = NULL;
 			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
@@ -2400,11 +2400,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 		struct bio *bio = r1_bio->bios[m];
 		if (bio->bi_end_io == NULL)
 			continue;
-		if (!bio->bi_error &&
+		if (!bio->bi_status &&
 		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
 			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
 		}
-		if (bio->bi_error &&
+		if (bio->bi_status &&
 		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
 			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
 				md_error(conf->mddev, rdev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4343d7ff9916..89ad1cd29037 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -336,7 +336,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 
 	bio_endio(bio);
 	/*
@@ -389,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 
 static void raid10_end_read_request(struct bio *bio)
 {
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
 	struct md_rdev *rdev;
@@ -477,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio)
 	struct bio *to_put = NULL;
 	bool discard_error;
 
-	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
+	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
 
 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
 
@@ -491,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error && !discard_error) {
+	if (bio->bi_status && !discard_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
 			 * just fail it.
@@ -913,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf)
 			bio->bi_next = NULL;
 			bio->bi_bdev = rdev->bdev;
 			if (test_bit(Faulty, &rdev->flags)) {
-				bio->bi_error = -EIO;
+				bio->bi_status = BLK_STS_IOERR;
 				bio_endio(bio);
 			} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 					    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1098,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio->bi_next = NULL;
 		bio->bi_bdev = rdev->bdev;
 		if (test_bit(Faulty, &rdev->flags)) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
@@ -1888,7 +1888,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
 {
 	struct r10conf *conf = r10_bio->mddev->private;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 	else
 		/* The write handler will notice the lack of
@@ -1972,7 +1972,7 @@ static void end_sync_write(struct bio *bio)
 	else
 		rdev = conf->mirrors[d].rdev;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		if (repl)
 			md_error(mddev, rdev);
 		else {
@@ -2021,7 +2021,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 
 	/* find the first device with a block */
 	for (i=0; i<conf->copies; i++)
-		if (!r10_bio->devs[i].bio->bi_error)
+		if (!r10_bio->devs[i].bio->bi_status)
 			break;
 
 	if (i == conf->copies)
@@ -2050,7 +2050,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		tpages = get_resync_pages(tbio)->pages;
 		d = r10_bio->devs[i].devnum;
 		rdev = conf->mirrors[d].rdev;
-		if (!r10_bio->devs[i].bio->bi_error) {
+		if (!r10_bio->devs[i].bio->bi_status) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
@@ -2633,7 +2633,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			rdev = conf->mirrors[dev].rdev;
 			if (r10_bio->devs[m].bio == NULL)
 				continue;
-			if (!r10_bio->devs[m].bio->bi_error) {
+			if (!r10_bio->devs[m].bio->bi_status) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2649,7 +2649,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 			if (r10_bio->devs[m].repl_bio == NULL)
 				continue;
 
-			if (!r10_bio->devs[m].repl_bio->bi_error) {
+			if (!r10_bio->devs[m].repl_bio->bi_status) {
 				rdev_clear_badblocks(
 					rdev,
 					r10_bio->devs[m].addr,
@@ -2675,7 +2675,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 					r10_bio->devs[m].addr,
 					r10_bio->sectors, 0);
 				rdev_dec_pending(rdev, conf->mddev);
-			} else if (bio != NULL && bio->bi_error) {
+			} else if (bio != NULL && bio->bi_status) {
 				fail = true;
 				if (!narrow_write_error(r10_bio, m)) {
 					md_error(conf->mddev, rdev);
@@ -3267,7 +3267,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
 			bio = r10_bio->devs[i].bio;
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			rcu_read_lock();
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
@@ -3309,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 			/* Need to set up for writing to the replacement */
 			bio = r10_bio->devs[i].repl_bio;
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 
 			sector = r10_bio->devs[i].addr;
 			bio->bi_next = biolist;
@@ -3375,7 +3375,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
-			bio->bi_error = 0;
+			bio->bi_status = 0;
 			generic_make_request(bio);
 		}
 	}
@@ -4394,7 +4394,7 @@ read_more:
 	read_bio->bi_end_io = end_reshape_read;
 	bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
 	read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
-	read_bio->bi_error = 0;
+	read_bio->bi_status = 0;
 	read_bio->bi_vcnt = 0;
 	read_bio->bi_iter.bi_size = 0;
 	r10_bio->master_bio = read_bio;
@@ -4638,7 +4638,7 @@ static void end_reshape_write(struct bio *bio)
 		rdev = conf->mirrors[d].rdev;
 	}
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		/* FIXME should record badblock */
 		md_error(mddev, rdev);
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 4c00bc248287..3ed6a0d89db8 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -572,7 +572,7 @@ static void r5l_log_endio(struct bio *bio)
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(log->rdev->mddev, log->rdev);
 
 	bio_put(bio);
@@ -1247,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio)
 	unsigned long flags;
 	struct r5l_io_unit *io;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(log->rdev->mddev, log->rdev);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 5d25bebf3328..09e04be34e5f 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -397,7 +397,7 @@ static void ppl_log_endio(struct bio *bio)
 
 	pr_debug("%s: seq: %llu\n", __func__, io->seq);
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		md_error(ppl_conf->mddev, log->rdev);
 
 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9c4f7659f8b1..e1bdc320f664 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2476,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi)
 
 	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		bi->bi_error);
+		bi->bi_status);
 	if (i == disks) {
 		bio_reset(bi);
 		BUG();
@@ -2496,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi)
 		s = sh->sector + rdev->new_data_offset;
 	else
 		s = sh->sector + rdev->data_offset;
-	if (!bi->bi_error) {
+	if (!bi->bi_status) {
 		set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			/* Note that this cannot happen on a
@@ -2613,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
 		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
-		bi->bi_error);
+		bi->bi_status);
 	if (i == disks) {
 		bio_reset(bi);
 		BUG();
@@ -2621,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 
 	if (replacement) {
-		if (bi->bi_error)
+		if (bi->bi_status)
 			md_error(conf->mddev, rdev);
 		else if (is_badblock(rdev, sh->sector,
 				     STRIPE_SECTORS,
 				     &first_bad, &bad_sectors))
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
-		if (bi->bi_error) {
+		if (bi->bi_status) {
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2649,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && bi->bi_error && !replacement)
+	if (sh->batch_head && bi->bi_status && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	bio_reset(bi);
@@ -3381,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			md_write_end(conf->mddev);
 			bio_endio(bi);
 			bi = nextbi;
@@ -3403,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			md_write_end(conf->mddev);
 			bio_endio(bi);
 			bi = bi2;
@@ -3429,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 
-				bi->bi_error = -EIO;
+				bi->bi_status = BLK_STS_IOERR;
 				bio_endio(bi);
 				bi = nextbi;
 			}
@@ -5144,7 +5144,7 @@ static void raid5_align_endio(struct bio *bi)
 	struct mddev *mddev;
 	struct r5conf *conf;
 	struct md_rdev *rdev;
-	int error = bi->bi_error;
+	blk_status_t error = bi->bi_status;
 
 	bio_put(bi);
 
@@ -5721,7 +5721,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			release_stripe_plug(mddev, sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
-			bi->bi_error = -EIO;
+			bi->bi_status = BLK_STS_IOERR;
 			break;
 		}
 	}
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 822198a75e96..79eb9fb358d5 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -186,7 +186,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	 * another kernel subsystem, and we just pass it through.
 	 */
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		goto out;
 	}
 
@@ -205,7 +205,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
-			bio->bi_error = err;
+			bio->bi_status = errno_to_blk_status(err);
 			break;
 		}
 	}
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 983718b8fd9b..31b2d14e210d 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1210,7 +1210,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 	 * another kernel subsystem, and we just pass it through.
 	 */
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		goto out;
 	}
 
@@ -1232,7 +1232,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 					(op_is_write(bio_op(bio))) ? "WRITE" :
 					"READ",
 					(unsigned long long) iter.bi_sector, len);
-			bio->bi_error = err;
+			bio->bi_status = errno_to_blk_status(err);
 			break;
 		}
 	}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index c544d466ea51..7bd383aeea14 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -49,19 +49,19 @@ static struct nd_region *to_region(struct pmem_device *pmem)
 	return to_nd_region(to_dev(pmem)->parent);
 }
 
-static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
-		unsigned int len)
+static blk_status_t pmem_clear_poison(struct pmem_device *pmem,
+		phys_addr_t offset, unsigned int len)
 {
 	struct device *dev = to_dev(pmem);
 	sector_t sector;
 	long cleared;
-	int rc = 0;
+	blk_status_t rc = BLK_STS_OK;
 
 	sector = (offset - pmem->data_offset) / 512;
 
 	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
 	if (cleared < len)
-		rc = -EIO;
+		rc = BLK_STS_IOERR;
 	if (cleared > 0 && cleared / 512) {
 		cleared /= 512;
 		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
@@ -84,7 +84,7 @@ static void write_pmem(void *pmem_addr, struct page *page,
 	kunmap_atomic(mem);
 }
 
-static int read_pmem(struct page *page, unsigned int off,
+static blk_status_t read_pmem(struct page *page, unsigned int off,
 		void *pmem_addr, unsigned int len)
 {
 	int rc;
@@ -93,15 +93,15 @@ static int read_pmem(struct page *page, unsigned int off,
 	rc = memcpy_mcsafe(mem + off, pmem_addr, len);
 	kunmap_atomic(mem);
 	if (rc)
-		return -EIO;
-	return 0;
+		return BLK_STS_IOERR;
+	return BLK_STS_OK;
 }
 
-static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, bool is_write,
 			sector_t sector)
 {
-	int rc = 0;
+	blk_status_t rc = BLK_STS_OK;
 	bool bad_pmem = false;
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void *pmem_addr = pmem->virt_addr + pmem_off;
@@ -111,7 +111,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 
 	if (!is_write) {
 		if (unlikely(bad_pmem))
-			rc = -EIO;
+			rc = BLK_STS_IOERR;
 		else {
 			rc = read_pmem(page, off, pmem_addr, len);
 			flush_dcache_page(page);
@@ -149,7 +149,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
-	int rc = 0;
+	blk_status_t rc = 0;
 	bool do_acct;
 	unsigned long start;
 	struct bio_vec bvec;
@@ -166,7 +166,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 				bvec.bv_offset, op_is_write(bio_op(bio)),
 				iter.bi_sector);
 		if (rc) {
-			bio->bi_error = rc;
+			bio->bi_status = rc;
 			break;
 		}
 	}
@@ -184,7 +184,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, bool is_write)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
-	int rc;
+	blk_status_t rc;
 
 	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);
 
@@ -197,7 +197,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	if (rc == 0)
 		page_endio(page, is_write, 0);
 
-	return rc;
+	return blk_status_to_errno(rc);
 }
 
 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index c77940d80fc8..40128793e613 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -21,7 +21,7 @@ static void nvmet_bio_done(struct bio *bio)
 	struct nvmet_req *req = bio->bi_private;
 
 	nvmet_req_complete(req,
-		bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+		bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 
 	if (bio != &req->inline_bio)
 		bio_put(bio);
@@ -145,7 +145,7 @@ static void nvmet_execute_discard(struct nvmet_req *req)
 		bio->bi_private = req;
 		bio->bi_end_io = nvmet_bio_done;
 		if (status) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 		} else {
 			submit_bio(bio);
diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index bb069ebe4aa6..75373624604b 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -296,8 +296,8 @@ static void iblock_bio_done(struct bio *bio)
 	struct se_cmd *cmd = bio->bi_private;
 	struct iblock_req *ibr = cmd->priv;
 
-	if (bio->bi_error) {
-		pr_err("bio error: %p,  err: %d\n", bio, bio->bi_error);
+	if (bio->bi_status) {
+		pr_err("bio error: %p,  err: %d\n", bio, bio->bi_status);
 		/*
 		 * Bump the ib_bio_err_cnt and release bio.
 		 */
@@ -354,11 +354,11 @@ static void iblock_end_io_flush(struct bio *bio)
 {
 	struct se_cmd *cmd = bio->bi_private;
 
-	if (bio->bi_error)
-		pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_error);
+	if (bio->bi_status)
+		pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_status);
 
 	if (cmd) {
-		if (bio->bi_error)
+		if (bio->bi_status)
 			target_complete_cmd(cmd, SAM_STAT_CHECK_CONDITION);
 		else
 			target_complete_cmd(cmd, SAM_STAT_GOOD);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c1dc393ad6b9..bcd8e16a34e1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -262,8 +262,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 	if (vecs != inline_vecs)
 		kfree(vecs);
 
-	if (unlikely(bio.bi_error))
-		return bio.bi_error;
+	if (unlikely(bio.bi_status))
+		return blk_status_to_errno(bio.bi_status);
 	return ret;
 }
 
@@ -288,16 +288,18 @@ static void blkdev_bio_end_io(struct bio *bio)
 	bool should_dirty = dio->should_dirty;
 
 	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
-		if (bio->bi_error && !dio->bio.bi_error)
-			dio->bio.bi_error = bio->bi_error;
+		if (bio->bi_status && !dio->bio.bi_status)
+			dio->bio.bi_status = bio->bi_status;
 	} else {
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
-			ssize_t ret = dio->bio.bi_error;
+			ssize_t ret;
 
-			if (likely(!ret)) {
+			if (likely(!dio->bio.bi_status)) {
 				ret = dio->size;
 				iocb->ki_pos += ret;
+			} else {
+				ret = blk_status_to_errno(dio->bio.bi_status);
 			}
 
 			dio->iocb->ki_complete(iocb, ret, 0);
@@ -363,7 +365,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 		ret = bio_iov_iter_get_pages(bio, iter);
 		if (unlikely(ret)) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 			bio_endio(bio);
 			break;
 		}
@@ -413,7 +415,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	__set_current_state(TASK_RUNNING);
 
 	if (!ret)
-		ret = dio->bio.bi_error;
+		ret = blk_status_to_errno(dio->bio.bi_status);
 	if (likely(!ret))
 		ret = dio->size;
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b8622e4d1744..d87ac27a5f2b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -310,7 +310,8 @@ struct btrfs_dio_private {
 	 * The original bio may be split to several sub-bios, this is
 	 * done during endio of sub-bios
 	 */
-	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
+	blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *,
+			blk_status_t);
 };
 
 /*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ab14c2e635ca..4ded1c3f92b8 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2129,7 +2129,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
 	/* mutex is not held! This is not save if IO is not yet completed
 	 * on umount */
 	iodone_w_error = 0;
-	if (bp->bi_error)
+	if (bp->bi_status)
 		iodone_w_error = 1;
 
 	BUG_ON(NULL == block);
@@ -2143,7 +2143,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
 		if ((dev_state->state->print_mask &
 		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
 			pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
-			       bp->bi_error,
+			       bp->bi_status,
 			       btrfsic_get_block_type(dev_state->state, block),
 			       block->logical_bytenr, dev_state->name,
 			       block->dev_bytenr, block->mirror_num);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 10e6b282d09d..9ac55b266e78 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -155,7 +155,7 @@ static void end_compressed_bio_read(struct bio *bio)
 	unsigned long index;
 	int ret;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		cb->errors = 1;
 
 	/* if there are more bios still pending for this compressed
@@ -268,7 +268,7 @@ static void end_compressed_bio_write(struct bio *bio)
 	struct page *page;
 	unsigned long index;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		cb->errors = 1;
 
 	/* if there are more bios still pending for this compressed
@@ -287,7 +287,7 @@ static void end_compressed_bio_write(struct bio *bio)
 					 cb->start,
 					 cb->start + cb->len - 1,
 					 NULL,
-					 bio->bi_error ? 0 : 1);
+					 bio->bi_status ? 0 : 1);
 	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb);
@@ -320,7 +320,7 @@ out:
  * This also checksums the file bytes and gets things ready for
  * the end io hooks.
  */
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				 unsigned long len, u64 disk_start,
 				 unsigned long compressed_len,
 				 struct page **compressed_pages,
@@ -335,13 +335,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	struct page *page;
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
-	int ret;
+	blk_status_t ret;
 	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 	if (!cb)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 	refcount_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
@@ -358,7 +358,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
 	if (!bio) {
 		kfree(cb);
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 	}
 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	bio->bi_private = cb;
@@ -368,17 +368,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	/* create and submit bios for the compressed pages */
 	bytes_left = compressed_len;
 	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+		int submit = 0;
+
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_iter.bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			submit = io_tree->ops->merge_bio_hook(page, 0,
 							   PAGE_SIZE,
 							   bio, 0);
-		else
-			ret = 0;
 
 		page->mapping = NULL;
-		if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+		if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
 		    PAGE_SIZE) {
 			bio_get(bio);
 
@@ -400,7 +400,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 			ret = btrfs_map_bio(fs_info, bio, 0, 1);
 			if (ret) {
-				bio->bi_error = ret;
+				bio->bi_status = ret;
 				bio_endio(bio);
 			}
 
@@ -434,7 +434,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	ret = btrfs_map_bio(fs_info, bio, 0, 1);
 	if (ret) {
-		bio->bi_error = ret;
+		bio->bi_status = ret;
 		bio_endio(bio);
 	}
 
@@ -569,7 +569,7 @@ next:
  * After the compressed pages are read, we copy the bytes into the
  * bio we were passed and then call the bio end_io calls
  */
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -586,7 +586,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
-	int ret = -ENOMEM;
+	blk_status_t ret = BLK_STS_RESOURCE;
 	int faili = 0;
 	u32 *sums;
 
@@ -600,7 +600,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				   PAGE_SIZE);
 	read_unlock(&em_tree->lock);
 	if (!em)
-		return -EIO;
+		return BLK_STS_IOERR;
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -659,19 +659,19 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	refcount_set(&cb->pending_bios, 1);
 
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+		int submit = 0;
+
 		page = cb->compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		page->index = em_start >> PAGE_SHIFT;
 
 		if (comp_bio->bi_iter.bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			submit = tree->ops->merge_bio_hook(page, 0,
 							PAGE_SIZE,
 							comp_bio, 0);
-		else
-			ret = 0;
 
 		page->mapping = NULL;
-		if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+		if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
 		    PAGE_SIZE) {
 			bio_get(comp_bio);
 
@@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
 			if (ret) {
-				comp_bio->bi_error = ret;
+				comp_bio->bi_status = ret;
 				bio_endio(comp_bio);
 			}
 
@@ -726,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0);
 	if (ret) {
-		comp_bio->bi_error = ret;
+		comp_bio->bi_status = ret;
 		bio_endio(comp_bio);
 	}
 
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 39ec43ab8df1..680d4265d601 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,12 +48,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
 			      unsigned long total_out, u64 disk_start,
 			      struct bio *bio);
 
-int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
 				  unsigned long len, u64 disk_start,
 				  unsigned long compressed_len,
 				  struct page **compressed_pages,
 				  unsigned long nr_pages);
-int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags);
 
 enum btrfs_compression_type {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 643c70d2b2e6..d2da0a52d560 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3078,8 +3078,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
 struct btrfs_dio_private;
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
 			      u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -3094,7 +3094,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 		       u64 file_start, int contig);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67185d0..46accc75ad5a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -87,7 +87,7 @@ struct btrfs_end_io_wq {
 	bio_end_io_t *end_io;
 	void *private;
 	struct btrfs_fs_info *info;
-	int error;
+	blk_status_t status;
 	enum btrfs_wq_endio_type metadata;
 	struct list_head list;
 	struct btrfs_work work;
@@ -131,7 +131,7 @@ struct async_submit_bio {
 	 */
 	u64 bio_offset;
 	struct btrfs_work work;
-	int error;
+	blk_status_t status;
 };
 
 /*
@@ -799,7 +799,7 @@ static void end_workqueue_bio(struct bio *bio)
 	btrfs_work_func_t func;
 
 	fs_info = end_io_wq->info;
-	end_io_wq->error = bio->bi_error;
+	end_io_wq->status = bio->bi_status;
 
 	if (bio_op(bio) == REQ_OP_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
@@ -836,19 +836,19 @@ static void end_workqueue_bio(struct bio *bio)
 	btrfs_queue_work(wq, &end_io_wq->work);
 }
 
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			enum btrfs_wq_endio_type metadata)
 {
 	struct btrfs_end_io_wq *end_io_wq;
 
 	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
 	if (!end_io_wq)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	end_io_wq->private = bio->bi_private;
 	end_io_wq->end_io = bio->bi_end_io;
 	end_io_wq->info = info;
-	end_io_wq->error = 0;
+	end_io_wq->status = 0;
 	end_io_wq->bio = bio;
 	end_io_wq->metadata = metadata;
 
@@ -868,14 +868,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 static void run_one_async_start(struct btrfs_work *work)
 {
 	struct async_submit_bio *async;
-	int ret;
+	blk_status_t ret;
 
 	async = container_of(work, struct  async_submit_bio, work);
 	ret = async->submit_bio_start(async->inode, async->bio,
 				      async->mirror_num, async->bio_flags,
 				      async->bio_offset);
 	if (ret)
-		async->error = ret;
+		async->status = ret;
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -898,8 +898,8 @@ static void run_one_async_done(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	/* If an error occurred we just want to clean up the bio and move on */
-	if (async->error) {
-		async->bio->bi_error = async->error;
+	if (async->status) {
+		async->bio->bi_status = async->status;
 		bio_endio(async->bio);
 		return;
 	}
@@ -916,18 +916,17 @@ static void run_one_async_free(struct btrfs_work *work)
 	kfree(async);
 }
 
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
-			u64 bio_offset,
-			extent_submit_bio_hook_t *submit_bio_start,
-			extent_submit_bio_hook_t *submit_bio_done)
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info,
+		struct inode *inode, struct bio *bio, int mirror_num,
+		unsigned long bio_flags, u64 bio_offset,
+		extent_submit_bio_hook_t *submit_bio_start,
+		extent_submit_bio_hook_t *submit_bio_done)
 {
 	struct async_submit_bio *async;
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	async->inode = inode;
 	async->bio = bio;
@@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 	async->bio_offset = bio_offset;
 
-	async->error = 0;
+	async->status = 0;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	return 0;
 }
 
-static int btree_csum_one_bio(struct bio *bio)
+static blk_status_t btree_csum_one_bio(struct bio *bio)
 {
 	struct bio_vec *bvec;
 	struct btrfs_root *root;
@@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio)
 			break;
 	}
 
-	return ret;
+	return errno_to_blk_status(ret);
 }
 
-static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
-				    int mirror_num, unsigned long bio_flags,
-				    u64 bio_offset)
+static blk_status_t __btree_submit_bio_start(struct inode *inode,
+		struct bio *bio, int mirror_num, unsigned long bio_flags,
+		u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -986,11 +985,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio,
 	return btree_csum_one_bio(bio);
 }
 
-static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags,
-				 u64 bio_offset)
+static blk_status_t __btree_submit_bio_done(struct inode *inode,
+		struct bio *bio, int mirror_num, unsigned long bio_flags,
+		u64 bio_offset)
 {
-	int ret;
+	blk_status_t ret;
 
 	/*
 	 * when we're called for a write, we're already in the async
@@ -998,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio,
 	 */
 	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
 	if (ret) {
-		bio->bi_error = ret;
+		bio->bi_status = ret;
 		bio_endio(bio);
 	}
 	return ret;
@@ -1015,13 +1014,13 @@ static int check_async_write(unsigned long bio_flags)
 	return 1;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
+static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int async = check_async_write(bio_flags);
-	int ret;
+	blk_status_t ret;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
 		/*
@@ -1054,7 +1053,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio,
 	return 0;
 
 out_w_error:
-	bio->bi_error = ret;
+	bio->bi_status = ret;
 	bio_endio(bio);
 	return ret;
 }
@@ -1820,7 +1819,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
 	bio = end_io_wq->bio;
 
-	bio->bi_error = end_io_wq->error;
+	bio->bi_status = end_io_wq->status;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
@@ -3495,11 +3494,11 @@ static void btrfs_end_empty_barrier(struct bio *bio)
  * any device where the flush fails with eopnotsupp are flagged as not-barrier
  * capable
  */
-static int write_dev_flush(struct btrfs_device *device, int wait)
+static blk_status_t write_dev_flush(struct btrfs_device *device, int wait)
 {
 	struct request_queue *q = bdev_get_queue(device->bdev);
 	struct bio *bio;
-	int ret = 0;
+	blk_status_t ret = 0;
 
 	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
 		return 0;
@@ -3511,8 +3510,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 
 		wait_for_completion(&device->flush_wait);
 
-		if (bio->bi_error) {
-			ret = bio->bi_error;
+		if (bio->bi_status) {
+			ret = bio->bi_status;
 			btrfs_dev_stat_inc_and_print(device,
 				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
@@ -3531,7 +3530,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
 	device->flush_bio = NULL;
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
 	if (!bio)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	bio->bi_end_io = btrfs_end_empty_barrier;
 	bio->bi_bdev = device->bdev;
@@ -3556,7 +3555,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	struct btrfs_device *dev;
 	int errors_send = 0;
 	int errors_wait = 0;
-	int ret;
+	blk_status_t ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 21f1ceb85b76..c581927555f3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -118,13 +118,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, u8 *result);
-int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			enum btrfs_wq_endio_type metadata);
-int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
-			struct bio *bio, int mirror_num,
-			unsigned long bio_flags, u64 bio_offset,
-			extent_submit_bio_hook_t *submit_bio_start,
-			extent_submit_bio_hook_t *submit_bio_done);
+blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info,
+		struct inode *inode, struct bio *bio, int mirror_num,
+		unsigned long bio_flags, u64 bio_offset,
+		extent_submit_bio_hook_t *submit_bio_start,
+		extent_submit_bio_hook_t *submit_bio_done);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d8da3edf2ac3..35cbb6ceb70d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2399,6 +2399,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct bio *bio;
 	int read_mode = 0;
+	blk_status_t status;
 	int ret;
 
 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2431,11 +2432,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
 		read_mode, failrec->this_mirror, failrec->in_validation);
 
-	ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+	status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
 					 failrec->bio_flags, 0);
-	if (ret) {
+	if (status) {
 		free_io_failure(BTRFS_I(inode), failrec);
 		bio_put(bio);
+		ret = blk_status_to_errno(status);
 	}
 
 	return ret;
@@ -2474,6 +2476,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  */
 static void end_bio_extent_writepage(struct bio *bio)
 {
+	int error = blk_status_to_errno(bio->bi_status);
 	struct bio_vec *bvec;
 	u64 start;
 	u64 end;
@@ -2503,7 +2506,7 @@ static void end_bio_extent_writepage(struct bio *bio)
 		start = page_offset(page);
 		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
-		end_extent_writepage(page, bio->bi_error, start, end);
+		end_extent_writepage(page, error, start, end);
 		end_page_writeback(page);
 	}
 
@@ -2536,7 +2539,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
 static void end_bio_extent_readpage(struct bio *bio)
 {
 	struct bio_vec *bvec;
-	int uptodate = !bio->bi_error;
+	int uptodate = !bio->bi_status;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct extent_io_tree *tree;
 	u64 offset = 0;
@@ -2556,7 +2559,7 @@ static void end_bio_extent_readpage(struct bio *bio)
 
 		btrfs_debug(fs_info,
 			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
-			(u64)bio->bi_iter.bi_sector, bio->bi_error,
+			(u64)bio->bi_iter.bi_sector, bio->bi_status,
 			io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;
 
@@ -2615,7 +2618,7 @@ static void end_bio_extent_readpage(struct bio *bio)
 				ret = bio_readpage_error(bio, offset, page,
 							 start, end, mirror);
 				if (ret == 0) {
-					uptodate = !bio->bi_error;
+					uptodate = !bio->bi_status;
 					offset += len;
 					continue;
 				}
@@ -2673,7 +2676,7 @@ readpage_ok:
 		endio_readpage_release_extent(tree, extent_start, extent_len,
 					      uptodate);
 	if (io_bio->end_io)
-		io_bio->end_io(io_bio, bio->bi_error);
+		io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
 	bio_put(bio);
 }
 
@@ -2743,7 +2746,7 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 				       unsigned long bio_flags)
 {
-	int ret = 0;
+	blk_status_t ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
@@ -2761,7 +2764,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 		btrfsic_submit_bio(bio);
 
 	bio_put(bio);
-	return ret;
+	return blk_status_to_errno(ret);
 }
 
 static int merge_bio(struct extent_io_tree *tree, struct page *page,
@@ -3707,7 +3710,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
 		BUG_ON(!eb);
 		done = atomic_dec_and_test(&eb->io_pages);
 
-		if (bio->bi_error ||
+		if (bio->bi_status ||
 		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
 			ClearPageUptodate(page);
 			set_btree_ioerr(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1eafa2f0ede3..487ca0207cb6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -92,9 +92,9 @@ struct btrfs_inode;
 struct btrfs_io_bio;
 struct io_failure_record;
 
-typedef	int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio,
-				       int mirror_num, unsigned long bio_flags,
-				       u64 bio_offset);
+typedef	blk_status_t (extent_submit_bio_hook_t)(struct inode *inode,
+		struct bio *bio, int mirror_num, unsigned long bio_flags,
+		u64 bio_offset);
 struct extent_io_ops {
 	/*
 	 * The following callbacks must be allways defined, the function
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 64fcb31d7163..5b1c7090e546 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -160,7 +160,7 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
 	kfree(bio->csum_allocated);
 }
 
-static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
+static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
 				   u64 logical_offset, u32 *dst, int dio)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
 
 	path = btrfs_alloc_path();
 	if (!path)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
 	if (!dst) {
@@ -191,7 +191,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
 					csum_size, GFP_NOFS);
 			if (!btrfs_bio->csum_allocated) {
 				btrfs_free_path(path);
-				return -ENOMEM;
+				return BLK_STS_RESOURCE;
 			}
 			btrfs_bio->csum = btrfs_bio->csum_allocated;
 			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
@@ -303,12 +303,12 @@ next:
 	return 0;
 }
 
-int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
+blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst)
 {
 	return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0);
 }
 
-int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
+blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset)
 {
 	return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1);
 }
@@ -433,7 +433,7 @@ fail:
 	return ret;
 }
 
-int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
+blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 		       u64 file_start, int contig)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -452,7 +452,7 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
 	sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size),
 		       GFP_NOFS);
 	if (!sums)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 
 	sums->len = bio->bi_iter.bi_size;
 	INIT_LIST_HEAD(&sums->list);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 758b2666885e..ea7cae1003eb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -842,13 +842,12 @@ retry:
 				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
 				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
 				PAGE_SET_WRITEBACK);
-		ret = btrfs_submit_compressed_write(inode,
+		if (btrfs_submit_compressed_write(inode,
 				    async_extent->start,
 				    async_extent->ram_size,
 				    ins.objectid,
 				    ins.offset, async_extent->pages,
-				    async_extent->nr_pages);
-		if (ret) {
+				    async_extent->nr_pages)) {
 			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 			struct page *p = async_extent->pages[0];
 			const u64 start = async_extent->start;
@@ -1901,11 +1900,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
-				    int mirror_num, unsigned long bio_flags,
-				    u64 bio_offset)
+static blk_status_t __btrfs_submit_bio_start(struct inode *inode,
+		struct bio *bio, int mirror_num, unsigned long bio_flags,
+		u64 bio_offset)
 {
-	int ret = 0;
+	blk_status_t ret = 0;
 
 	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
 	BUG_ON(ret); /* -ENOMEM */
@@ -1920,16 +1919,16 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags,
-			  u64 bio_offset)
+static blk_status_t __btrfs_submit_bio_done(struct inode *inode,
+		struct bio *bio, int mirror_num, unsigned long bio_flags,
+		u64 bio_offset)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	int ret;
+	blk_status_t ret;
 
 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
 	if (ret) {
-		bio->bi_error = ret;
+		bio->bi_status = ret;
 		bio_endio(bio);
 	}
 	return ret;
@@ -1939,14 +1938,14 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation
  * on write, or reading the csums from the tree before a read
  */
-static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
+static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags,
 			  u64 bio_offset)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
-	int ret = 0;
+	blk_status_t ret = 0;
 	int skip_sum;
 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
@@ -1991,8 +1990,8 @@ mapit:
 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
 
 out:
-	if (ret < 0) {
-		bio->bi_error = ret;
+	if (ret) {
+		bio->bi_status = ret;
 		bio_endio(bio);
 	}
 	return ret;
@@ -8037,7 +8036,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		goto end;
 
 	ASSERT(bio->bi_vcnt == 1);
@@ -8116,7 +8115,7 @@ static void btrfs_retry_endio(struct bio *bio)
 	int ret;
 	int i;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		goto end;
 
 	uptodate = 1;
@@ -8141,8 +8140,8 @@ end:
 	bio_put(bio);
 }
 
-static int __btrfs_subio_endio_read(struct inode *inode,
-				    struct btrfs_io_bio *io_bio, int err)
+static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
+		struct btrfs_io_bio *io_bio, blk_status_t err)
 {
 	struct btrfs_fs_info *fs_info;
 	struct bio_vec *bvec;
@@ -8184,7 +8183,7 @@ try_again:
 				io_bio->mirror_num,
 				btrfs_retry_endio, &done);
 		if (ret) {
-			err = ret;
+			err = errno_to_blk_status(ret);
 			goto next;
 		}
 
@@ -8211,8 +8210,8 @@ next:
 	return err;
 }
 
-static int btrfs_subio_endio_read(struct inode *inode,
-				  struct btrfs_io_bio *io_bio, int err)
+static blk_status_t btrfs_subio_endio_read(struct inode *inode,
+		struct btrfs_io_bio *io_bio, blk_status_t err)
 {
 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -8232,7 +8231,7 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	struct inode *inode = dip->inode;
 	struct bio *dio_bio;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
-	int err = bio->bi_error;
+	blk_status_t err = bio->bi_status;
 
 	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
 		err = btrfs_subio_endio_read(inode, io_bio, err);
@@ -8243,11 +8242,11 @@ static void btrfs_endio_direct_read(struct bio *bio)
 
 	kfree(dip);
 
-	dio_bio->bi_error = bio->bi_error;
+	dio_bio->bi_status = bio->bi_status;
 	dio_end_io(dio_bio);
 
 	if (io_bio->end_io)
-		io_bio->end_io(io_bio, err);
+		io_bio->end_io(io_bio, blk_status_to_errno(err));
 	bio_put(bio);
 }
 
@@ -8299,20 +8298,20 @@ static void btrfs_endio_direct_write(struct bio *bio)
 	struct bio *dio_bio = dip->dio_bio;
 
 	__endio_write_update_ordered(dip->inode, dip->logical_offset,
-				     dip->bytes, !bio->bi_error);
+				     dip->bytes, !bio->bi_status);
 
 	kfree(dip);
 
-	dio_bio->bi_error = bio->bi_error;
+	dio_bio->bi_status = bio->bi_status;
 	dio_end_io(dio_bio);
 	bio_put(bio);
 }
 
-static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
+static blk_status_t __btrfs_submit_bio_start_direct_io(struct inode *inode,
 				    struct bio *bio, int mirror_num,
 				    unsigned long bio_flags, u64 offset)
 {
-	int ret;
+	blk_status_t ret;
 	ret = btrfs_csum_one_bio(inode, bio, offset, 1);
 	BUG_ON(ret); /* -ENOMEM */
 	return 0;
@@ -8321,7 +8320,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
 static void btrfs_end_dio_bio(struct bio *bio)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
-	int err = bio->bi_error;
+	blk_status_t err = bio->bi_status;
 
 	if (err)
 		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -8351,7 +8350,7 @@ static void btrfs_end_dio_bio(struct bio *bio)
 	if (dip->errors) {
 		bio_io_error(dip->orig_bio);
 	} else {
-		dip->dio_bio->bi_error = 0;
+		dip->dio_bio->bi_status = 0;
 		bio_endio(dip->orig_bio);
 	}
 out:
@@ -8368,14 +8367,14 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 	return bio;
 }
 
-static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
+static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
 						 struct btrfs_dio_private *dip,
 						 struct bio *bio,
 						 u64 file_offset)
 {
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
-	int ret;
+	blk_status_t ret;
 
 	/*
 	 * We load all the csum data we need when we submit
@@ -8406,7 +8405,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_dio_private *dip = bio->bi_private;
 	bool write = bio_op(bio) == REQ_OP_WRITE;
-	int ret;
+	blk_status_t ret;
 
 	if (async_submit)
 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
@@ -8649,7 +8648,7 @@ free_ordered:
 	 * callbacks - they require an allocated dip and a clone of dio_bio.
 	 */
 	if (io_bio && dip) {
-		io_bio->bi_error = -EIO;
+		io_bio->bi_status = BLK_STS_IOERR;
 		bio_endio(io_bio);
 		/*
 		 * The end io callbacks free our dip, do the final put on io_bio
@@ -8668,7 +8667,7 @@ free_ordered:
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 			      file_offset + dio_bio->bi_iter.bi_size - 1);
 
-		dio_bio->bi_error = -EIO;
+		dio_bio->bi_status = BLK_STS_IOERR;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
 		 * nor bio_endio()/bio_io_error() against dio_bio.
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d8ea0eb76325..f3d30d9ea8f9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -871,7 +871,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio)
  * this frees the rbio and runs through all the bios in the
  * bio_list and calls end_io on them
  */
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 {
 	struct bio *cur = bio_list_get(&rbio->bio_list);
 	struct bio *next;
@@ -884,7 +884,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
 	while (cur) {
 		next = cur->bi_next;
 		cur->bi_next = NULL;
-		cur->bi_error = err;
+		cur->bi_status = err;
 		bio_endio(cur);
 		cur = next;
 	}
@@ -897,7 +897,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err)
 static void raid_write_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
-	int err = bio->bi_error;
+	blk_status_t err = bio->bi_status;
 	int max_errors;
 
 	if (err)
@@ -914,7 +914,7 @@ static void raid_write_end_io(struct bio *bio)
 	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
 		     0 : rbio->bbio->max_errors;
 	if (atomic_read(&rbio->error) > max_errors)
-		err = -EIO;
+		err = BLK_STS_IOERR;
 
 	rbio_orig_end_io(rbio, err);
 }
@@ -1092,7 +1092,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 		 * devices or if they are not contiguous
 		 */
 		if (last_end == disk_start && stripe->dev->bdev &&
-		    !last->bi_error &&
+		    !last->bi_status &&
 		    last->bi_bdev == stripe->dev->bdev) {
 			ret = bio_add_page(last, page, PAGE_SIZE, 0);
 			if (ret == PAGE_SIZE)
@@ -1448,7 +1448,7 @@ static void raid_rmw_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
@@ -1991,7 +1991,7 @@ static void raid_recover_end_io(struct bio *bio)
 	 * we only read stripe pages off the disk, set them
 	 * up to date if there were no errors
 	 */
-	if (bio->bi_error)
+	if (bio->bi_status)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
@@ -2530,7 +2530,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		fail_bio_stripe(rbio, bio);
 	else
 		set_bio_pages_uptodate(bio);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c7b45eb2403d..ba5595d19de1 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -95,7 +95,7 @@ struct scrub_bio {
 	struct scrub_ctx	*sctx;
 	struct btrfs_device	*dev;
 	struct bio		*bio;
-	int			err;
+	blk_status_t		status;
 	u64			logical;
 	u64			physical;
 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
@@ -1668,14 +1668,14 @@ leave_nomem:
 
 struct scrub_bio_ret {
 	struct completion event;
-	int error;
+	blk_status_t status;
 };
 
 static void scrub_bio_wait_endio(struct bio *bio)
 {
 	struct scrub_bio_ret *ret = bio->bi_private;
 
-	ret->error = bio->bi_error;
+	ret->status = bio->bi_status;
 	complete(&ret->event);
 }
 
@@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	int ret;
 
 	init_completion(&done.event);
-	done.error = 0;
+	done.status = 0;
 	bio->bi_iter.bi_sector = page->logical >> 9;
 	bio->bi_private = &done;
 	bio->bi_end_io = scrub_bio_wait_endio;
@@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 		return ret;
 
 	wait_for_completion(&done.event);
-	if (done.error)
+	if (done.status)
 		return -EIO;
 
 	return 0;
@@ -1937,7 +1937,7 @@ again:
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-		sbio->err = 0;
+		sbio->status = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical_for_dev_replace ||
 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -1992,7 +1992,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
 	struct scrub_bio *sbio = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
 
-	sbio->err = bio->bi_error;
+	sbio->status = bio->bi_status;
 	sbio->bio = bio;
 
 	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
@@ -2007,7 +2007,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
 	int i;
 
 	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
-	if (sbio->err) {
+	if (sbio->status) {
 		struct btrfs_dev_replace *dev_replace =
 			&sbio->sctx->fs_info->dev_replace;
 
@@ -2341,7 +2341,7 @@ again:
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		bio_set_op_attrs(bio, REQ_OP_READ, 0);
-		sbio->err = 0;
+		sbio->status = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical ||
 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
@@ -2377,7 +2377,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
 	struct scrub_block *sblock = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		sblock->no_io_error_seen = 0;
 
 	bio_put(bio);
@@ -2588,7 +2588,7 @@ static void scrub_bio_end_io(struct bio *bio)
 	struct scrub_bio *sbio = bio->bi_private;
 	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
 
-	sbio->err = bio->bi_error;
+	sbio->status = bio->bi_status;
 	sbio->bio = bio;
 
 	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
@@ -2601,7 +2601,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 	int i;
 
 	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
-	if (sbio->err) {
+	if (sbio->status) {
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
 
@@ -3004,7 +3004,7 @@ static void scrub_parity_bio_endio(struct bio *bio)
 	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
 	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
 			  sparity->nsectors);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 017b67daa3bb..84a495967e0a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6042,9 +6042,10 @@ static void btrfs_end_bio(struct bio *bio)
 	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		atomic_inc(&bbio->error);
-		if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) {
+		if (bio->bi_status == BLK_STS_IOERR ||
+		    bio->bi_status == BLK_STS_TARGET) {
 			unsigned int stripe_index =
 				btrfs_io_bio(bio)->stripe_index;
 			struct btrfs_device *dev;
@@ -6082,13 +6083,13 @@ static void btrfs_end_bio(struct bio *bio)
 		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
-			bio->bi_error = -EIO;
+			bio->bi_status = BLK_STS_IOERR;
 		} else {
 			/*
 			 * this bio is actually up to date, we didn't
 			 * go over the max number of errors
 			 */
-			bio->bi_error = 0;
+			bio->bi_status = 0;
 		}
 
 		btrfs_end_bbio(bbio, bio);
@@ -6199,7 +6200,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 
 		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		bio->bi_iter.bi_sector = logical >> 9;
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 		btrfs_end_bbio(bbio, bio);
 	}
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 161be58c5cb0..306b720f7383 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3038,7 +3038,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
 	if (unlikely(bio_flagged(bio, BIO_QUIET)))
 		set_bit(BH_Quiet, &bh->b_state);
 
-	bh->b_end_io(bh, !bio->bi_error);
+	bh->b_end_io(bh, !bio->bi_status);
 	bio_put(bio);
 }
 
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index a409a84f1bca..6181e9526860 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 			goto errout;
 		}
 		err = submit_bio_wait(bio);
-		if ((err == 0) && bio->bi_error)
+		if (err == 0 && bio->bi_status)
 			err = -EIO;
 		bio_put(bio);
 		if (err)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index bb711e4b86c2..e8baaabebf13 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work)
 	dio_complete(dio, 0, true);
 }
 
-static int dio_bio_complete(struct dio *dio, struct bio *bio);
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio);
 
 /*
  * Asynchronous IO callback. 
@@ -473,11 +473,11 @@ static struct bio *dio_await_one(struct dio *dio)
 /*
  * Process one completed BIO.  No locks are held.
  */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
+static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio)
 {
 	struct bio_vec *bvec;
 	unsigned i;
-	int err = bio->bi_error;
+	blk_status_t err = bio->bi_status;
 
 	if (err)
 		dio->io_error = -EIO;
@@ -536,7 +536,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 			bio = dio->bio_list;
 			dio->bio_list = bio->bi_private;
 			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			ret2 = dio_bio_complete(dio, bio);
+			ret2 = blk_status_to_errno(dio_bio_complete(dio, bio));
 			if (ret == 0)
 				ret = ret2;
 		}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 1a82138ba739..930ca0fc9a0f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio)
 		}
 #endif
 
-		if (bio->bi_error) {
+		if (bio->bi_status) {
 			SetPageError(page);
 			mapping_set_error(page->mapping, -EIO);
 		}
@@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio)
 				continue;
 			}
 			clear_buffer_async_write(bh);
-			if (bio->bi_error)
+			if (bio->bi_status)
 				buffer_io_error(bh);
 		} while ((bh = bh->b_this_page) != head);
 		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
@@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio)
 		      bdevname(bio->bi_bdev, b),
 		      (long long) bio->bi_iter.bi_sector,
 		      (unsigned) bio_sectors(bio),
-		      bio->bi_error)) {
+		      bio->bi_status)) {
 		ext4_finish_bio(bio);
 		bio_put(bio);
 		return;
 	}
 	bio->bi_end_io = NULL;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		struct inode *inode = io_end->inode;
 
 		ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
 			     "(offset %llu size %ld starting block %llu)",
-			     bio->bi_error, inode->i_ino,
+			     bio->bi_status, inode->i_ino,
 			     (unsigned long long) io_end->offset,
 			     (long) io_end->size,
 			     (unsigned long long)
 			     bi_sector >> (inode->i_blkbits - 9));
-		mapping_set_error(inode->i_mapping, bio->bi_error);
+		mapping_set_error(inode->i_mapping,
+				blk_status_to_errno(bio->bi_status));
 	}
 
 	if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index a81b829d56de..40a5497b0f60 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio)
 	int i;
 
 	if (ext4_bio_encrypted(bio)) {
-		if (bio->bi_error) {
+		if (bio->bi_status) {
 			fscrypt_release_ctx(bio->bi_private);
 		} else {
 			fscrypt_decrypt_bio_pages(bio->bi_private, bio);
@@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio)
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
 
-		if (!bio->bi_error) {
+		if (!bio->bi_status) {
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7c0f6bdf817d..36fe82012a33 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio)
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 	if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) {
 		f2fs_show_injection_info(FAULT_IO);
-		bio->bi_error = -EIO;
+		bio->bi_status = BLK_STS_IOERR;
 	}
 #endif
 
 	if (f2fs_bio_encrypted(bio)) {
-		if (bio->bi_error) {
+		if (bio->bi_status) {
 			fscrypt_release_ctx(bio->bi_private);
 		} else {
 			fscrypt_decrypt_bio_pages(bio->bi_private, bio);
@@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 
-		if (!bio->bi_error) {
+		if (!bio->bi_status) {
 			if (!PageUptodate(page))
 				SetPageUptodate(page);
 		} else {
@@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio)
 			unlock_page(page);
 			mempool_free(page, sbi->write_io_dummy);
 
-			if (unlikely(bio->bi_error))
+			if (unlikely(bio->bi_status))
 				f2fs_stop_checkpoint(sbi, true);
 			continue;
 		}
 
 		fscrypt_pullback_bio_page(&page, true);
 
-		if (unlikely(bio->bi_error)) {
+		if (unlikely(bio->bi_status)) {
 			mapping_set_error(page->mapping, -EIO);
 			f2fs_stop_checkpoint(sbi, true);
 		}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 96845854e7ee..ea9f455d94ba 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -749,7 +749,7 @@ static void f2fs_submit_discard_endio(struct bio *bio)
 {
 	struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
 
-	dc->error = bio->bi_error;
+	dc->error = blk_status_to_errno(bio->bi_status);
 	dc->state = D_DONE;
 	complete(&dc->wait);
 	bio_put(bio);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 13ebf15a4db0..885d36e7a29f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -170,7 +170,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
  */
 
 static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
-				  int error)
+				  blk_status_t error)
 {
 	struct buffer_head *bh, *next;
 	struct page *page = bvec->bv_page;
@@ -209,13 +209,13 @@ static void gfs2_end_log_write(struct bio *bio)
 	struct page *page;
 	int i;
 
-	if (bio->bi_error)
-		fs_err(sdp, "Error %d writing to log\n", bio->bi_error);
+	if (bio->bi_status)
+		fs_err(sdp, "Error %d writing to log\n", bio->bi_status);
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		page = bvec->bv_page;
 		if (page_has_buffers(page))
-			gfs2_end_log_write_bh(sdp, bvec, bio->bi_error);
+			gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
 		else
 			mempool_free(page, gfs2_page_pool);
 	}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 663ffc135ef3..fabe1614f879 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio)
 		do {
 			struct buffer_head *next = bh->b_this_page;
 			len -= bh->b_size;
-			bh->b_end_io(bh, !bio->bi_error);
+			bh->b_end_io(bh, !bio->bi_status);
 			bh = next;
 		} while (bh && len);
 	}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ed67548b286c..83953cdbbc6c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio)
 {
 	struct page *page = bio->bi_private;
 
-	if (!bio->bi_error)
+	if (!bio->bi_status)
 		SetPageUptodate(page);
 	else
-		pr_warn("error %d reading superblock\n", bio->bi_error);
+		pr_warn("error %d reading superblock\n", bio->bi_status);
 	unlock_page(page);
 }
 
diff --git a/fs/iomap.c b/fs/iomap.c
index 4b10892967a5..18f2f2b8ba2c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -672,8 +672,8 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 	struct iomap_dio *dio = bio->bi_private;
 	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
 
-	if (bio->bi_error)
-		iomap_dio_set_error(dio, bio->bi_error);
+	if (bio->bi_status)
+		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
 
 	if (atomic_dec_and_test(&dio->ref)) {
 		if (is_sync_kiocb(dio->iocb)) {
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index bb1da1feafeb..a21f0e9eecd4 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio)
 
 	bp->l_flag |= lbmDONE;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		bp->l_flag |= lbmERROR;
 
 		jfs_err("lbmIODone: I/O error in JFS log");
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 489aaa1403e5..ce93db3aef3c 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio)
 {
 	struct page *page = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		printk(KERN_ERR "metapage_read_end_io: I/O error\n");
 		SetPageError(page);
 	}
@@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio)
 
 	BUG_ON(!PagePrivate(page));
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		printk(KERN_ERR "metapage_write_end_io: I/O error\n");
 		SetPageError(page);
 	}
diff --git a/fs/mpage.c b/fs/mpage.c
index baff8f820c29..9524fdde00c2 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		page_endio(page, op_is_write(bio_op(bio)), bio->bi_error);
+		page_endio(page, op_is_write(bio_op(bio)),
+				blk_status_to_errno(bio->bi_status));
 	}
 
 	bio_put(bio);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 0ca370d23ddb..d8863a804b15 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio)
 {
 	struct parallel_io *par = bio->bi_private;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		struct nfs_pgio_header *header = par->data;
 
 		if (!header->pnfs_error)
@@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio)
 	struct parallel_io *par = bio->bi_private;
 	struct nfs_pgio_header *header = par->data;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		if (!header->pnfs_error)
 			header->pnfs_error = -EIO;
 		pnfs_set_lo_fail(header->lseg);
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6f87b2ac1aeb..e73c86d9855c 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio)
 {
 	struct nilfs_segment_buffer *segbuf = bio->bi_private;
 
-	if (bio->bi_error)
+	if (bio->bi_status)
 		atomic_inc(&segbuf->sb_err);
 
 	bio_put(bio);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 0da0332725aa..ffe003982d95 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio)
 {
 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
 
-	if (bio->bi_error) {
-		mlog(ML_ERROR, "IO Error %d\n", bio->bi_error);
-		wc->wc_error = bio->bi_error;
+	if (bio->bi_status) {
+		mlog(ML_ERROR, "IO Error %d\n", bio->bi_status);
+		wc->wc_error = blk_status_to_errno(bio->bi_status);
 	}
 
 	o2hb_bio_wait_dec(wc, 1);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 09af0f7cd55e..76b6f988e2fa 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -276,7 +276,7 @@ xfs_end_io(
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
-	int			error = ioend->io_bio->bi_error;
+	int			error;
 
 	/*
 	 * Just clean up the in-memory strutures if the fs has been shut down.
@@ -289,6 +289,7 @@ xfs_end_io(
 	/*
 	 * Clean up any COW blocks on an I/O error.
 	 */
+	error = blk_status_to_errno(ioend->io_bio->bi_status);
 	if (unlikely(error)) {
 		switch (ioend->io_type) {
 		case XFS_IO_COW:
@@ -332,7 +333,7 @@ xfs_end_bio(
 	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
 	else
-		xfs_destroy_ioend(ioend, bio->bi_error);
+		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 }
 
 STATIC int
@@ -500,7 +501,7 @@ xfs_submit_ioend(
 	 * time.
 	 */
 	if (status) {
-		ioend->io_bio->bi_error = status;
+		ioend->io_bio->bi_status = errno_to_blk_status(status);
 		bio_endio(ioend->io_bio);
 		return status;
 	}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 62fa39276a24..15c7a484a5d2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1213,8 +1213,11 @@ xfs_buf_bio_end_io(
 	 * don't overwrite existing errors - otherwise we can lose errors on
 	 * buffers that require multiple bios to complete.
 	 */
-	if (bio->bi_error)
-		cmpxchg(&bp->b_io_error, 0, bio->bi_error);
+	if (bio->bi_status) {
+		int error = blk_status_to_errno(bio->bi_status);
+
+		cmpxchg(&bp->b_io_error, 0, error);
+	}
 
 	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d1b04b0e99cf..9455aada1399 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -414,7 +414,7 @@ extern void bio_endio(struct bio *);
 
 static inline void bio_io_error(struct bio *bio)
 {
-	bio->bi_error = -EIO;
+	bio->bi_status = BLK_STS_IOERR;
 	bio_endio(bio);
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 59378939a8cd..dcd45b15a3a5 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -33,6 +33,9 @@ typedef u8 __bitwise blk_status_t;
 #define BLK_STS_RESOURCE	((__force blk_status_t)9)
 #define BLK_STS_IOERR		((__force blk_status_t)10)
 
+/* hack for device mapper, don't use elsewhere: */
+#define BLK_STS_DM_REQUEUE    ((__force blk_status_t)11)
+
 struct blk_issue_stat {
 	u64 stat;
 };
@@ -44,7 +47,7 @@ struct blk_issue_stat {
 struct bio {
 	struct bio		*bi_next;	/* request queue link */
 	struct block_device	*bi_bdev;
-	int			bi_error;
+	blk_status_t		bi_status;
 	unsigned int		bi_opf;		/* bottom bits req flags,
 						 * top bits REQ_OP. Use
 						 * accessors.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2a8871638453..76b6df862a12 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1782,7 +1782,7 @@ struct blk_integrity_iter {
 	const char		*disk_name;
 };
 
-typedef int (integrity_processing_fn) (struct blk_integrity_iter *);
+typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *);
 
 struct blk_integrity_profile {
 	integrity_processing_fn		*generate_fn;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 5de5c53251ec..456da5017b32 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -72,7 +72,7 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone);
  * 2   : The target wants to push back the io
  */
 typedef int (*dm_endio_fn) (struct dm_target *ti,
-			    struct bio *bio, int *error);
+			    struct bio *bio, blk_status_t *error);
 typedef int (*dm_request_endio_fn) (struct dm_target *ti,
 				    struct request *clone, blk_status_t error,
 				    union map_info *map_context);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f80fd33639e0..57d22571f306 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev;
 struct hib_bio_batch {
 	atomic_t		count;
 	wait_queue_head_t	wait;
-	int			error;
+	blk_status_t		error;
 };
 
 static void hib_init_batch(struct hib_bio_batch *hb)
 {
 	atomic_set(&hb->count, 0);
 	init_waitqueue_head(&hb->wait);
-	hb->error = 0;
+	hb->error = BLK_STS_OK;
 }
 
 static void hib_end_io(struct bio *bio)
@@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio)
 	struct hib_bio_batch *hb = bio->bi_private;
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
 				imajor(bio->bi_bdev->bd_inode),
 				iminor(bio->bi_bdev->bd_inode),
@@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio)
 		flush_icache_range((unsigned long)page_address(page),
 				   (unsigned long)page_address(page) + PAGE_SIZE);
 
-	if (bio->bi_error && !hb->error)
-		hb->error = bio->bi_error;
+	if (bio->bi_status && !hb->error)
+		hb->error = bio->bi_status;
 	if (atomic_dec_and_test(&hb->count))
 		wake_up(&hb->wait);
 
@@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	return error;
 }
 
-static int hib_wait_io(struct hib_bio_batch *hb)
+static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
 {
 	wait_event(hb->wait, atomic_read(&hb->count) == 0);
-	return hb->error;
+	return blk_status_to_errno(hb->error);
 }
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 193c5f5e3f79..bc364f86100a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore,
 
 		__blk_add_trace(bt, bio->bi_iter.bi_sector,
 				bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
-				BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu),
+				BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
 				&rpdu);
 	}
 }
@@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error,
+			bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
 			sizeof(r), &r);
 }
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..2da71e627812 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio)
 {
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		SetPageError(page);
 		/*
 		 * We failed to write the page out to swap-space.
@@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio)
 {
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
-	if (bio->bi_error) {
+	if (bio->bi_status) {
 		SetPageError(page);
 		ClearPageUptodate(page);
 		pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
-- 
cgit v1.3-14-g43fede


From 1e1fc133483ef3b56c20bf3cd9241146c41042f8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 May 2017 00:29:38 -0400
Subject: compat_{get,put}_bitmap(): use unsafe_{get,put}_user()

unroll the inner loops, while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/compat.h |  3 +-
 kernel/compat.c        | 81 +++++++++++++++++---------------------------------
 2 files changed, 29 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1c5f3152cbb5..94ceb0348a25 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -388,8 +388,7 @@ asmlinkage long compat_sys_wait4(compat_pid_t pid,
 
 #define BITS_PER_COMPAT_LONG    (8*sizeof(compat_long_t))
 
-#define BITS_TO_COMPAT_LONGS(bits) \
-	(((bits)+BITS_PER_COMPAT_LONG-1)/BITS_PER_COMPAT_LONG)
+#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG)
 
 long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 		       unsigned long bitmap_size);
diff --git a/kernel/compat.c b/kernel/compat.c
index 860f674fa556..9c2a8f3788d5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -871,84 +871,59 @@ int get_compat_sigevent(struct sigevent *event,
 long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
 		       unsigned long bitmap_size)
 {
-	int i, j;
-	unsigned long m;
-	compat_ulong_t um;
 	unsigned long nr_compat_longs;
 
 	/* align bitmap up to nearest compat_long_t boundary */
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
 	if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
 		return -EFAULT;
 
-	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
-	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
-		m = 0;
-
-		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
-			/*
-			 * We dont want to read past the end of the userspace
-			 * bitmap. We must however ensure the end of the
-			 * kernel bitmap is zeroed.
-			 */
-			if (nr_compat_longs) {
-				nr_compat_longs--;
-				if (__get_user(um, umask))
-					return -EFAULT;
-			} else {
-				um = 0;
-			}
-
-			umask++;
-			m |= (long)um << (j * BITS_PER_COMPAT_LONG);
-		}
-		*mask++ = m;
+	user_access_begin();
+	while (nr_compat_longs > 1) {
+		compat_ulong_t l1, l2;
+		unsafe_get_user(l1, umask++, Efault);
+		unsafe_get_user(l2, umask++, Efault);
+		*mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
+		nr_compat_longs -= 2;
 	}
-
+	if (nr_compat_longs)
+		unsafe_get_user(*mask, umask++, Efault);
+	user_access_end();
 	return 0;
+
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 
 long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
 		       unsigned long bitmap_size)
 {
-	int i, j;
-	unsigned long m;
-	compat_ulong_t um;
 	unsigned long nr_compat_longs;
 
 	/* align bitmap up to nearest compat_long_t boundary */
 	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
 
 	if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
 		return -EFAULT;
 
-	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
-	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
-		m = *mask++;
-
-		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
-			um = m;
-
-			/*
-			 * We dont want to write past the end of the userspace
-			 * bitmap.
-			 */
-			if (nr_compat_longs) {
-				nr_compat_longs--;
-				if (__put_user(um, umask))
-					return -EFAULT;
-			}
-
-			umask++;
-			m >>= 4*sizeof(um);
-			m >>= 4*sizeof(um);
-		}
+	user_access_begin();
+	while (nr_compat_longs > 1) {
+		unsigned long m = *mask++;
+		unsafe_put_user((compat_ulong_t)m, umask++, Efault);
+		unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
+		nr_compat_longs -= 2;
 	}
-
+	if (nr_compat_longs)
+		unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
+	user_access_end();
 	return 0;
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 
 void
-- 
cgit v1.3-14-g43fede


From ca2406ed58fef3f7c8ef6470cba807bfc3415605 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 May 2017 04:22:44 -0400
Subject: times(2): move compat to native

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/time.h |  3 ---
 kernel/compat.c      | 24 ------------------------
 kernel/sys.c         | 28 +++++++++++++++++++++++++++-
 3 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index c0543f5f25de..f769ea88250d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -171,9 +171,6 @@ extern int do_getitimer(int which, struct itimerval *value);
 
 extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags);
 
-struct tms;
-extern void do_sys_times(struct tms *);
-
 /*
  * Similar to the struct tm in userspace <time.h>, but it needs to be here so
  * that the kernel source is self contained.
diff --git a/kernel/compat.c b/kernel/compat.c
index 9c2a8f3788d5..99408252762e 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -350,30 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
 	return 0;
 }
 
-static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
-{
-	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
-}
-
-COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
-{
-	if (tbuf) {
-		struct tms tms;
-		struct compat_tms tmp;
-
-		do_sys_times(&tms);
-		/* Convert our struct tms to the compat version. */
-		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
-		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
-		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
-		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
-		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
-			return -EFAULT;
-	}
-	force_successful_syscall_return();
-	return compat_jiffies_to_clock_t(jiffies);
-}
-
 #ifdef __ARCH_WANT_SYS_SIGPENDING
 
 /*
diff --git a/kernel/sys.c b/kernel/sys.c
index 3778a8a417b6..161b5eae9c77 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
 	return from_kgid_munged(current_user_ns(), current_egid());
 }
 
-void do_sys_times(struct tms *tms)
+static void do_sys_times(struct tms *tms)
 {
 	u64 tgutime, tgstime, cutime, cstime;
 
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
 }
 
+#ifdef CONFIG_COMPAT
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+	return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
+{
+	if (tbuf) {
+		struct tms tms;
+		struct compat_tms tmp;
+
+		do_sys_times(&tms);
+		/* Convert our struct tms to the compat version. */
+		tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+		tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+		tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+		tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+		if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+			return -EFAULT;
+	}
+	force_successful_syscall_return();
+	return compat_jiffies_to_clock_t(jiffies);
+}
+#endif
+
 /*
  * This needs some heavy checking ...
  * I just haven't the stomach for it. I also don't fully
-- 
cgit v1.3-14-g43fede


From d9e968cb9f849770288f5fde3d8d3a5f7e339052 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 May 2017 04:33:51 -0400
Subject: getrlimit()/setrlimit(): move compat to native

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c | 38 --------------------------------------
 kernel/sys.c    | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 99408252762e..58b8e57398d1 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -427,44 +427,6 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
 
 #endif
 
-COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
-		       struct compat_rlimit __user *, rlim)
-{
-	struct rlimit r;
-
-	if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
-	    __get_user(r.rlim_cur, &rlim->rlim_cur) ||
-	    __get_user(r.rlim_max, &rlim->rlim_max))
-		return -EFAULT;
-
-	if (r.rlim_cur == COMPAT_RLIM_INFINITY)
-		r.rlim_cur = RLIM_INFINITY;
-	if (r.rlim_max == COMPAT_RLIM_INFINITY)
-		r.rlim_max = RLIM_INFINITY;
-	return do_prlimit(current, resource, &r, NULL);
-}
-
-COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
-		       struct compat_rlimit __user *, rlim)
-{
-	struct rlimit r;
-	int ret;
-
-	ret = do_prlimit(current, resource, NULL, &r);
-	if (!ret) {
-		if (r.rlim_cur > COMPAT_RLIM_INFINITY)
-			r.rlim_cur = COMPAT_RLIM_INFINITY;
-		if (r.rlim_max > COMPAT_RLIM_INFINITY)
-			r.rlim_max = COMPAT_RLIM_INFINITY;
-
-		if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
-		    __put_user(r.rlim_cur, &rlim->rlim_cur) ||
-		    __put_user(r.rlim_max, &rlim->rlim_max))
-			return -EFAULT;
-	}
-	return ret;
-}
-
 int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
 {
 	if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
diff --git a/kernel/sys.c b/kernel/sys.c
index 161b5eae9c77..873e6eaa314f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1332,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
+{
+	struct rlimit r;
+	struct compat_rlimit r32;
+
+	if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
+		return -EFAULT;
+
+	if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
+		r.rlim_cur = RLIM_INFINITY;
+	else
+		r.rlim_cur = r32.rlim_cur;
+	if (r32.rlim_max == COMPAT_RLIM_INFINITY)
+		r.rlim_max = RLIM_INFINITY;
+	else
+		r.rlim_max = r32.rlim_max;
+	return do_prlimit(current, resource, &r, NULL);
+}
+
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+		       struct compat_rlimit __user *, rlim)
+{
+	struct rlimit r;
+	int ret;
+
+	ret = do_prlimit(current, resource, NULL, &r);
+	if (!ret) {
+		struct compat_rlimit r32;	/* same type as the user buffer */
+		if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+			r32.rlim_cur = COMPAT_RLIM_INFINITY;
+		else
+			r32.rlim_cur = r.rlim_cur;
+		if (r.rlim_max > COMPAT_RLIM_INFINITY)
+			r32.rlim_max = COMPAT_RLIM_INFINITY;
+		else
+			r32.rlim_max = r.rlim_max;
+
+		if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
+			return -EFAULT;
+	}
+	return ret;
+}
+
+#endif
+
 #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
 
 /*
-- 
cgit v1.3-14-g43fede


From 8f13621abcedb278cfecf9703583743f9c474c97 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 May 2017 04:42:07 -0400
Subject: sigpending(): move compat to native

... and kill set_fs() use

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c | 23 -----------------------
 kernel/signal.c | 12 ++++++++++++
 2 files changed, 12 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 58b8e57398d1..195e23469854 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -350,29 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
 	return 0;
 }
 
-#ifdef __ARCH_WANT_SYS_SIGPENDING
-
-/*
- * Assumption: old_sigset_t and compat_old_sigset_t are both
- * types that can be passed to put_user()/get_user().
- */
-
-COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
-{
-	old_sigset_t s;
-	long ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_sigpending((old_sigset_t __user *) &s);
-	set_fs(old_fs);
-	if (ret == 0)
-		ret = put_user(s, set);
-	return ret;
-}
-
-#endif
-
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index d1eed0d7ca64..6237f492adfc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3254,6 +3254,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 	return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); 
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
+{
+	sigset_t set;
+	int err = do_sigpending(&set, sizeof(old_sigset_t));
+	/* Store only the low word, sized as compat_old_sigset_t. */
+	if (err == 0)
+		err = put_user(set.sig[0], set32);
+	return err;
+}
+#endif
+
 #endif
 
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
-- 
cgit v1.3-14-g43fede


From 7668b679c3b7931f6436d1eef50904831209e749 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 May 2017 06:39:31 -0400
Subject: put_compat_rusage(): switch to copy_to_user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 195e23469854..64e772aabdb5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -406,25 +406,27 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
 
 int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
 {
-	if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
-	    __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
-	    __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
-	    __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
-	    __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
-	    __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
-	    __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
-	    __put_user(r->ru_idrss, &ru->ru_idrss) ||
-	    __put_user(r->ru_isrss, &ru->ru_isrss) ||
-	    __put_user(r->ru_minflt, &ru->ru_minflt) ||
-	    __put_user(r->ru_majflt, &ru->ru_majflt) ||
-	    __put_user(r->ru_nswap, &ru->ru_nswap) ||
-	    __put_user(r->ru_inblock, &ru->ru_inblock) ||
-	    __put_user(r->ru_oublock, &ru->ru_oublock) ||
-	    __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
-	    __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
-	    __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
-	    __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
-	    __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+	struct compat_rusage r32;
+	memset(&r32, 0, sizeof(r32));
+	r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+	r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+	r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+	r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+	r32.ru_maxrss = r->ru_maxrss;
+	r32.ru_ixrss = r->ru_ixrss;
+	r32.ru_idrss = r->ru_idrss;
+	r32.ru_isrss = r->ru_isrss;
+	r32.ru_minflt = r->ru_minflt;
+	r32.ru_majflt = r->ru_majflt;
+	r32.ru_nswap = r->ru_nswap;
+	r32.ru_inblock = r->ru_inblock;
+	r32.ru_oublock = r->ru_oublock;
+	r32.ru_msgsnd = r->ru_msgsnd;
+	r32.ru_msgrcv = r->ru_msgrcv;
+	r32.ru_nsignals = r->ru_nsignals;
+	r32.ru_nvcsw = r->ru_nvcsw;
+	r32.ru_nivcsw = r->ru_nivcsw;
+	if (copy_to_user(ru, &r32, sizeof(r32)))
 		return -EFAULT;
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From 1b3c872c8342803d0fcd8042e4e007d173191db6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 31 May 2017 04:46:17 -0400
Subject: rt_sigtimedwait(): move compat to native

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/signal.h |  2 --
 kernel/compat.c        | 32 --------------------------------
 kernel/signal.c        | 36 +++++++++++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 1f5a16620693..231603ac20a3 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -246,8 +246,6 @@ extern int do_send_sig_info(int sig, struct siginfo *info,
 				struct task_struct *p, bool group);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
-extern int do_sigtimedwait(const sigset_t *, siginfo_t *,
-				const struct timespec *);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern void set_current_blocked(sigset_t *);
 extern void __set_current_blocked(const sigset_t *);
diff --git a/kernel/compat.c b/kernel/compat.c
index 64e772aabdb5..36e6e7c405e3 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -866,38 +866,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
 	}
 }
 
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
-		struct compat_siginfo __user *, uinfo,
-		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
-{
-	compat_sigset_t s32;
-	sigset_t s;
-	struct timespec t;
-	siginfo_t info;
-	long ret;
-
-	if (sigsetsize != sizeof(sigset_t))
-		return -EINVAL;
-
-	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
-		return -EFAULT;
-	sigset_from_compat(&s, &s32);
-
-	if (uts) {
-		if (compat_get_timespec(&t, uts))
-			return -EFAULT;
-	}
-
-	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
-
-	if (ret > 0 && uinfo) {
-		if (copy_siginfo_to_user32(uinfo, &info))
-			ret = -EFAULT;
-	}
-
-	return ret;
-}
-
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/signal.c b/kernel/signal.c
index 6237f492adfc..fe5b3608c31c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2768,7 +2768,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
  *  @info: if non-null, the signal's siginfo is returned here
  *  @ts: upper bound on process time suspension
  */
-int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
 		    const struct timespec *ts)
 {
 	ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2857,6 +2857,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+		struct compat_siginfo __user *, uinfo,
+		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+	compat_sigset_t s32;
+	sigset_t s;
+	struct timespec t;
+	siginfo_t info;
+	long ret;
+
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&s, &s32);
+
+	if (uts) {
+		if (compat_get_timespec(&t, uts))
+			return -EFAULT;
+	}
+
+	ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+	if (ret > 0 && uinfo) {
+		if (copy_siginfo_to_user32(uinfo, &info))
+			ret = -EFAULT;
+	}
+
+	return ret;
+}
+#endif
+
 /**
  *  sys_kill - send a signal to a process
  *  @pid: the PID of the process
-- 
cgit v1.3-14-g43fede


From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:40 +0200
Subject: bpf: avoid excessive stack usage for perf_sample_data

perf_sample_data consumes 386 bytes on stack, reduce excessive stack
usage and move it to per cpu buffer. It's allowed due to preemption
being disabled for tracing, xdp and tc programs, thus at all times
only one program can run on a specific CPU and programs cannot run
from interrupt. We similarly also handle bpf_pt_regs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 08eb072430b9..051d7fca0c09 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 			u64 flags, struct perf_raw_record *raw)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
-	struct perf_sample_data sample_data;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 
@@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	perf_sample_data_init(&sample_data, 0, 0);
-	sample_data.raw = raw;
-	perf_event_output(event, &sample_data, regs);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = raw;
+	perf_event_output(event, sd, regs);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:41 +0200
Subject: bpf: don't check spilled reg state for non-STACK_SPILLed type slots

spilled_regs[] state is only used for stack slots of type STACK_SPILL,
never for STACK_MISC. Right now, in states_equal(), even if we have
old and current stack state of type STACK_MISC, we compare spilled_regs[]
for that particular offset. Just skip these like we do everywhere else.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..d031b3b0752e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env,
 			return false;
 		if (i % BPF_REG_SIZE)
 			continue;
+		if (old->stack_slot_type[i] != STACK_SPILL)
+			continue;
 		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
 			   &cur->spilled_regs[i / BPF_REG_SIZE],
 			   sizeof(old->spilled_regs[0])))
-- 
cgit v1.3-14-g43fede


From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:42 +0200
Subject: bpf: reset id on CONST_IMM transition

Whenever we set the register to the type CONST_IMM, we currently don't
reset the id to 0. id member is not used in CONST_IMM case, so don't
let it become stale, where pruning won't be able to match later on.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d031b3b0752e..d195d825515a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 */
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = insn->imm;
+			regs[insn->dst_reg].id = 0;
 			regs[insn->dst_reg].max_value = insn->imm;
 			regs[insn->dst_reg].min_value = insn->imm;
 			regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		regs[insn->dst_reg].type = CONST_IMM;
 		regs[insn->dst_reg].imm = imm;
+		regs[insn->dst_reg].id = 0;
 		return 0;
 	}
 
-- 
cgit v1.3-14-g43fede


From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:43 +0200
Subject: bpf: reset id on spilled regs in clear_all_pkt_pointers

Right now, we don't reset the id of spilled registers in case of
clear_all_pkt_pointers(). Given pkt_pointers are highly likely to
contain an id, do so by reusing __mark_reg_unknown_value().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d195d825515a..519a6144d3d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		if (reg->type != PTR_TO_PACKET &&
 		    reg->type != PTR_TO_PACKET_END)
 			continue;
-		reg->type = UNKNOWN_VALUE;
-		reg->imm = 0;
+		__mark_reg_unknown_value(state->spilled_regs,
+					 i / BPF_REG_SIZE);
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From c6503be587e9c5c0aac4e2b45de982352f676a5b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 Jun 2017 17:21:26 +0200
Subject: posix-timers: Fix inverted SIGEV_NONE logic in common_timer_get()

The refactoring of the posix-timer core to allow better code sharing
introduced inverted logic vs. SIGEV_NONE timers in common_timer_get().

That causes hrtimer_forward() to be called on active timers, which
rightfully triggers the warning in hrtimer_forward().

Make sig_none what it says: signal mode == SIGEV_NONE.

Fixes: 91d57bae0868 ("posix-timers: Make use of forward/remaining callbacks")
Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170609104457.GA39907@inn.lkp.intel.com
---
 kernel/time/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 6e7a70b1bf37..b53a0b562516 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -644,7 +644,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 	struct timespec64 ts64;
 	bool sig_none;
 
-	sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE;
+	sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
 	iv = timr->it_interval;
 
 	/* interval timer ? */
-- 
cgit v1.3-14-g43fede


From 57de72125d34f83bfd39615fcc3cc25ca3b9c0ec Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 8 Jun 2017 10:55:33 +0200
Subject: cpu/hotplug: Remove unused check_for_tasks() function

clang -Wunused-function found one remaining function that was
apparently meant to be removed in a recent code cleanup:

kernel/cpu.c:565:20: warning: unused function 'check_for_tasks' [-Wunused-function]

Sebastian explained: The function became unused unintentionally, but there
is already a failure check, when a task cannot be removed from the outgoing
cpu in the scheduler code, so bringing it back is not really giving any
extra value.

Fixes: 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: http://lkml.kernel.org/r/20170608085544.2257132-1-arnd@arndb.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7435ffc6163b..d0f5f54aa087 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -562,30 +562,6 @@ void clear_tasks_mm_cpumask(int cpu)
 	rcu_read_unlock();
 }
 
-static inline void check_for_tasks(int dead_cpu)
-{
-	struct task_struct *g, *p;
-
-	read_lock(&tasklist_lock);
-	for_each_process_thread(g, p) {
-		if (!p->on_rq)
-			continue;
-		/*
-		 * We do the check with unlocked task_rq(p)->lock.
-		 * Order the reading to do not warn about a task,
-		 * which was running on this cpu in the past, and
-		 * it's just been woken on another cpu.
-		 */
-		rmb();
-		if (task_cpu(p) != dead_cpu)
-			continue;
-
-		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
-			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-	}
-	read_unlock(&tasklist_lock);
-}
-
 /* Take this CPU down. */
 static int take_cpu_down(void *_param)
 {
-- 
cgit v1.3-14-g43fede


From 5c7a3a3d20a4e175304c0e23809e3d70be8fed8a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 Jun 2017 19:44:09 +0200
Subject: posix-timers: Zero out oldval itimerspec

The recent posix timer rework moved the clearing of the itimerspec to the
real syscall implementation, but forgot that the kclock->timer_get() is
used by timer_settime() as well. That results in an uninitialized variable
and bogus values returned to user space.

Add the missing memset to timer_settime().

Fixes: eabdec043853 ("posix-timers: Zero settings value in common code")
Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com
---
 kernel/time/posix-timers.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b53a0b562516..88517dcfe0ca 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -828,6 +828,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 	if (!timespec64_valid(&new_spec64.it_interval) ||
 	    !timespec64_valid(&new_spec64.it_value))
 		return -EINVAL;
+	if (rtn)
+		memset(rtn, 0, sizeof(*rtn));
 retry:
 	timr = lock_timer(timer_id, &flag);
 	if (!timr)
-- 
cgit v1.3-14-g43fede


From 67edab48caeb75d412706f4b9d3107afd1e07623 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 12 Jun 2017 19:39:49 +0200
Subject: posix-timers: Handle relative posix-timers correctly

The recent rework of the posix timer internals broke the magic posix
mechanism, which requires that relative timers are not affected by
modifications of the underlying clock. That means relative CLOCK_REALTIME
timers cannot use CLOCK_REALTIME, because that can be set and adjusted. The
underlying hrtimer switches the clock for these timers to CLOCK_MONOTONIC.

That still works, but reading the remaining time of such a timer has been
broken in the rework. The old code used the hrtimer internals directly and
avoided the posix clock callbacks. Now common_timer_get() uses the
underlying kclock->timer_get() callback, which is still CLOCK_REALTIME
based. So the remaining time of such a timer is calculated against the
wrong time base.

Handle it by switching the k_itimer->kclock pointer according to the
resulting hrtimer mode. k_itimer->it_clock still contains CLOCK_REALTIME
because the timer might be set with ABSTIME later and then it needs to
switch back to the realtime posix clock implementation.

Fixes: eae1c4ae275f ("posix-timers: Make use of cancel/arm callbacks")
Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com
---
 kernel/time/posix-timers.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 88517dcfe0ca..58c0f60b132f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -72,6 +72,7 @@ static DEFINE_SPINLOCK(hash_lock);
 
 static const struct k_clock * const posix_clocks[];
 static const struct k_clock *clockid_to_kclock(const clockid_t id);
+static const struct k_clock clock_realtime, clock_monotonic;
 
 /*
  * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -750,6 +751,18 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
 	enum hrtimer_mode mode;
 
 	mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL;
+	/*
+	 * Posix magic: Relative CLOCK_REALTIME timers are not affected by
+	 * clock modifications, so they become CLOCK_MONOTONIC based under the
+	 * hood. See hrtimer_init(). Update timr->kclock, so the generic
+	 * functions which use timr->kclock->clock_get() work.
+	 *
+	 * Note: it_clock stays unmodified, because the next timer_set() might
+	 * use ABSTIME, so it needs to switch back.
+	 */
+	if (timr->it_clock == CLOCK_REALTIME)
+		timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
+
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
 	timr->it.real.timer.function = posix_timer_fn;
 
-- 
cgit v1.3-14-g43fede


From e4c1a0d15b62a3ed2f8cd560e9fa589dbe880c13 Mon Sep 17 00:00:00 2001
From: Derek Robson <robsonde@gmail.com>
Date: Mon, 12 Jun 2017 14:33:03 +1200
Subject: audit: style fix

Fixed checkpatch.pl warnings of "function definition argument FOO
should also have an identifier name"

Signed-off-by: Derek Robson <robsonde@gmail.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index bb3a4e14b7e5..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -248,13 +248,13 @@ struct audit_netlink_list {
 	struct sk_buff_head q;
 };
 
-int audit_send_list(void *);
+int audit_send_list(void *_dest);
 
 extern int selinux_audit_rule_update(void);
 
 extern struct mutex audit_filter_mutex;
-extern int audit_del_rule(struct audit_entry *);
-extern void audit_free_rule_rcu(struct rcu_head *);
+extern int audit_del_rule(struct audit_entry *entry);
+extern void audit_free_rule_rcu(struct rcu_head *head);
 extern struct list_head audit_filter_list[];
 
 extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -302,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
 #endif /* CONFIG_AUDIT_WATCH */
 
 #ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *);
-extern void audit_put_chunk(struct audit_chunk *);
-extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
-extern int audit_make_tree(struct audit_krule *, char *, u32);
-extern int audit_add_tree_rule(struct audit_krule *);
-extern int audit_remove_tree_rule(struct audit_krule *);
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
 extern void audit_trim_trees(void);
 extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *);
-extern void audit_put_tree(struct audit_tree *);
-extern void audit_kill_trees(struct list_head *);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct list_head *list);
 #else
 #define audit_remove_tree_rule(rule) BUG()
 #define audit_add_tree_rule(rule) -EINVAL
@@ -324,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
 #define audit_kill_trees(list) BUG()
 #endif
 
-extern char *audit_unpack_string(void **, size_t *, size_t);
+extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
 
 extern pid_t audit_sig_pid;
 extern kuid_t audit_sig_uid;
@@ -334,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
 
 #ifdef CONFIG_AUDITSYSCALL
 extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
 extern struct list_head *audit_killed_trees(void);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
-- 
cgit v1.3-14-g43fede


From d4af6d933ccffd24286528f04d5c39e702c8580f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 13 Jun 2017 06:04:14 +0200
Subject: nohz: Fix spurious warning when hrtimer and clockevent get out of
 sync

The sanity check ensuring that the tick expiry cache (ts->next_tick)
is actually in sync with the hardware clock (dev->next_event) makes the
wrong assumption that the clock can't be programmed later than the
hrtimer deadline.

In fact the clock hardware can be programmed later under some conditions
such as:

    * The hrtimer deadline is already in the past.
    * The hrtimer deadline is earlier than the minimum delay supported
      by the hardware.

Such conditions can be met when we program the tick, for example if the
last jiffies update hasn't been seen by the current CPU yet, we may
program the hrtimer to a deadline that is earlier than ktime_get()
because last_jiffies_update is our timestamp base to compute the next
tick.

As a result, we can randomly observe such a warning:

	WARNING: CPU: 5 PID: 0 at kernel/time/tick-sched.c:794 tick_nohz_stop_sched_tick kernel/time/tick-sched.c:791 [inline]
	Call Trace:
	 tick_nohz_irq_exit
	 tick_irq_exit
	 irq_exit
	 exiting_irq
	 smp_call_function_interrupt
	 smp_call_function_single_interrupt
	 call_function_single_interrupt

Therefore, let's rather make sure that the tick expiry cache is sync'ed
with the tick hrtimer deadline, against which it is not supposed to
drift away. The clock hardware instead has its own will and can't be
used as a reliable comparison point.

Reported-and-tested-by: Sasha Levin <alexander.levin@verizon.com>
Reported-and-tested-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: James Hartsock <hartsjc@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Wright <tim@binbash.co.uk>
Link: http://lkml.kernel.org/r/1497326654-14122-1-git-send-email-fweisbec@gmail.com
[ Minor readability edit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9d31f1e0067b..204600986e0d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -768,7 +768,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	/* Skip reprogram of event if its not changed */
 	if (ts->tick_stopped && (expires == ts->next_tick)) {
 		/* Sanity check: make sure clockevent is actually programmed */
-		if (likely(dev->next_event <= ts->next_tick))
+		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
 			goto out;
 
 		WARN_ON_ONCE(1);
@@ -806,8 +806,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		goto out;
 	}
 
+	hrtimer_set_expires(&ts->sched_timer, tick);
+
 	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
+		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 	else
 		tick_program_event(tick, 1);
 out:
-- 
cgit v1.3-14-g43fede


From 6a5ae63a0cc5d3ac73a96cb412fde66f3a71f98e Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 8 Jun 2017 19:53:24 -0700
Subject: tracing: Remove unused declaration of trace_stop_cmdline_recording

trace_stop_cmdline_recording declaration isn't in use, remove it.

Link: http://lkml.kernel.org/r/20170609025327.9508-2-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1122f151466f..63deff9cdf2c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1910,8 +1910,6 @@ static void tracing_stop_tr(struct trace_array *tr)
 	raw_spin_unlock_irqrestore(&tr->start_lock, flags);
 }
 
-void trace_stop_cmdline_recording(void);
-
 static int trace_save_cmdline(struct task_struct *tsk)
 {
 	unsigned pid, idx;
-- 
cgit v1.3-14-g43fede


From f4e981cba2dec675d40ac4f270b7e8ac164c9004 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Wed, 24 May 2017 07:49:50 +0200
Subject: printk: add __printf attributes to internal functions

When compiling with -Wsuggest-attribute=format, gcc complains that some
functions in kernel/printk/printk_safe.c transmit their argument to
printf-like functions without having a printf attribute. Silence these
warnings by adding relevant __printf attributes.

Link: http://lkml.kernel.org/r/20170524054950.6722-1-nicolas.iooss_linux@m4x.org
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk_safe.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 03a42a539b20..3cdaeaef9ce1 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
  * happen, printk_safe_log_store() will notice the buffer->len mismatch
  * and repeat the write.
  */
-static int printk_safe_log_store(struct printk_safe_seq_buf *s,
-				 const char *fmt, va_list args)
+static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
+						const char *fmt, va_list args)
 {
 	int add;
 	size_t len;
@@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void)
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
 {
 	struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 
@@ -330,7 +330,7 @@ void printk_nmi_exit(void)
 
 #else
 
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
 {
 	return 0;
 }
@@ -342,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
  * into itself. It uses a per-CPU buffer to store the message, just like
  * NMI.
  */
-static int vprintk_safe(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
 {
 	struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
 
-- 
cgit v1.3-14-g43fede


From c81be52a3ac0267aa830a2c4cb769030ea3483c9 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 12 Jun 2017 09:35:24 -0400
Subject: audit: fix a race condition with the auditd tracking code

Originally reported by Adam and Dusty, it appears we have a small
race window in kauditd_thread(), as documented in the Fedora BZ:

 * https://bugzilla.redhat.com/show_bug.cgi?id=1459326#c35

 "This issue is partly due to the read-copy nature of RCU, and
  partly due to how we sync the auditd_connection state across
  kauditd_thread and the audit control channel.  The kauditd_thread
  thread is always running so it can service the record queues and
  emit the multicast messages, if it happens to be just past the
  "main_queue" label, but before the "if (sk == NULL || ...)"
  if-statement which calls auditd_reset() when the new auditd
  connection is registered it could end up resetting the auditd
  connection, regardless of if it is valid or not.  This is a rather
  small window and the variable nature of multi-core scheduling
  explains why this is proving rather difficult to reproduce."

The fix is to have functions only call auditd_reset() when they
believe that the kernel/auditd connection is still valid, e.g.
non-NULL, and to have these callers pass their local copy of the
auditd_connection pointer to auditd_reset() where it can be compared
with the current connection state before resetting.  If the caller
has a stale state tracking pointer then the reset is ignored.

We also make a small change to kauditd_thread() so that if the
kernel/auditd connection is dead we skip the retry queue and send the
records straight to the hold queue.  This is necessary as we used to
rely on auditd_reset() to occasionally purge the retry queue but we
are going to be calling the reset function much less now and we want
to make sure the retry queue doesn't grow unbounded.

Reported-by: Adam Williamson <awilliam@redhat.com>
Reported-by: Dusty Mabe <dustymabe@redhat.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index b2e877100242..e1e2b3abfb93 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
 
 /**
  * auditd_reset - Disconnect the auditd connection
+ * @ac: auditd connection state
  *
  * Description:
  * Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
+ * hold queue in case auditd reconnects.  It is important to note that the @ac
+ * pointer should never be dereferenced inside this function as it may be NULL
+ * or invalid, you can only compare the memory address!  If @ac is NULL then
+ * the connection will always be reset.
  */
-static void auditd_reset(void)
+static void auditd_reset(const struct auditd_connection *ac)
 {
 	unsigned long flags;
 	struct sk_buff *skb;
@@ -590,6 +594,11 @@ static void auditd_reset(void)
 	spin_lock_irqsave(&auditd_conn_lock, flags);
 	ac_old = rcu_dereference_protected(auditd_conn,
 					   lockdep_is_held(&auditd_conn_lock));
+	if (ac && ac != ac_old) {
+		/* someone already registered a new auditd connection */
+		spin_unlock_irqrestore(&auditd_conn_lock, flags);
+		return;
+	}
 	rcu_assign_pointer(auditd_conn, NULL);
 	spin_unlock_irqrestore(&auditd_conn_lock, flags);
 
@@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
 	return rc;
 
 err:
-	if (rc == -ECONNREFUSED)
-		auditd_reset();
+	if (ac && rc == -ECONNREFUSED)
+		auditd_reset(ac);
 	return rc;
 }
 
@@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy)
 		rc = kauditd_send_queue(sk, portid,
 					&audit_hold_queue, UNICAST_RETRIES,
 					NULL, kauditd_rehold_skb);
-		if (rc < 0) {
+		if (ac && rc < 0) {
 			sk = NULL;
-			auditd_reset();
+			auditd_reset(ac);
 			goto main_queue;
 		}
 
@@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy)
 		rc = kauditd_send_queue(sk, portid,
 					&audit_retry_queue, UNICAST_RETRIES,
 					NULL, kauditd_hold_skb);
-		if (rc < 0) {
+		if (ac && rc < 0) {
 			sk = NULL;
-			auditd_reset();
+			auditd_reset(ac);
 			goto main_queue;
 		}
 
@@ -815,12 +824,13 @@ main_queue:
 		/* process the main queue - do the multicast send and attempt
 		 * unicast, dump failed record sends to the retry queue; if
 		 * sk == NULL due to previous failures we will just do the
-		 * multicast send and move the record to the retry queue */
+		 * multicast send and move the record to the hold queue */
 		rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
 					kauditd_send_multicast_skb,
-					kauditd_retry_skb);
-		if (sk == NULL || rc < 0)
-			auditd_reset();
+					(sk ?
+					 kauditd_retry_skb : kauditd_hold_skb));
+		if (ac && rc < 0)
+			auditd_reset(ac);
 		sk = NULL;
 
 		/* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 								auditd_pid, 1);
 
 				/* unregister the auditd connection */
-				auditd_reset();
+				auditd_reset(NULL);
 			}
 		}
 		if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
-- 
cgit v1.3-14-g43fede


From 02fd7f68f5342bc7e8054cb05ea4a07f26d41d12 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:42 -0500
Subject: trace: rename kernel enum section to eval

The kernel and its modules have sections containing the enum
string-to-value conversions. Rename this section because we
intend to store more than enums in it.

Link: http://lkml.kernel.org/r/20170531215653.3240-2-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/asm-generic/vmlinux.lds.h | 6 +++---
 include/trace/trace_events.h      | 2 +-
 kernel/module.c                   | 2 +-
 kernel/trace/trace.c              | 8 ++++----
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 314a0b9219c6..800f9f9677a6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -125,9 +125,9 @@
 			VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
 			KEEP(*(_ftrace_events))				\
 			VMLINUX_SYMBOL(__stop_ftrace_events) = .;	\
-			VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .;	\
-			KEEP(*(_ftrace_enum_map))			\
-			VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .;
+			VMLINUX_SYMBOL(__start_ftrace_eval_maps) = .;	\
+			KEEP(*(_ftrace_eval_map))			\
+			VMLINUX_SYMBOL(__stop_ftrace_eval_maps) = .;
 #else
 #define FTRACE_EVENTS()
 #endif
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 00f643164ca2..4bdd84023f5b 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -43,7 +43,7 @@ TRACE_MAKE_SYSTEM_STR();
 		.enum_value = a				\
 	};						\
 	static struct trace_enum_map __used		\
-	__attribute__((section("_ftrace_enum_map")))	\
+	__attribute__((section("_ftrace_eval_map")))	\
 	*TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a
 
 /*
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..9ec4713c5eee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3077,7 +3077,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
-	mod->trace_enums = section_objs(info, "_ftrace_enum_map",
+	mod->trace_enums = section_objs(info, "_ftrace_eval_map",
 					sizeof(*mod->trace_enums),
 					&mod->num_trace_enums);
 #endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 63deff9cdf2c..acd3eb4d56a0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7732,15 +7732,15 @@ struct dentry *tracing_init_dentry(void)
 	return NULL;
 }
 
-extern struct trace_enum_map *__start_ftrace_enum_maps[];
-extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+extern struct trace_enum_map *__start_ftrace_eval_maps[];
+extern struct trace_enum_map *__stop_ftrace_eval_maps[];
 
 static void __init trace_enum_init(void)
 {
 	int len;
 
-	len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
-	trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+	len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
+	trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len);
 }
 
 #ifdef CONFIG_MODULES
-- 
cgit v1.3-14-g43fede


From 00f4b652b6f1dbfd4e1d5419d7f1cc23b1374da8 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:43 -0500
Subject: trace: rename trace_enum_map to trace_eval_map

Each enum is loaded into the trace_enum_map. As we
are now using this for more than enums, rename it.

Link: http://lkml.kernel.org/r/20170531215653.3240-3-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/module.h       |  2 +-
 include/linux/tracepoint.h   |  6 +++---
 include/trace/trace_events.h |  8 ++++----
 kernel/trace/trace.c         | 24 ++++++++++++------------
 kernel/trace/trace.h         |  4 ++--
 kernel/trace/trace_events.c  | 14 +++++++-------
 6 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 21f56393602f..46b48043d741 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -442,7 +442,7 @@ struct module {
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call **trace_events;
 	unsigned int num_trace_events;
-	struct trace_enum_map **trace_enums;
+	struct trace_eval_map **trace_enums;
 	unsigned int num_trace_enums;
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index cc48cb2ce209..f7b0f5525e46 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -25,10 +25,10 @@ struct module;
 struct tracepoint;
 struct notifier_block;
 
-struct trace_enum_map {
+struct trace_eval_map {
 	const char		*system;
-	const char		*enum_string;
-	unsigned long		enum_value;
+	const char		*eval_string;
+	unsigned long		eval_value;
 };
 
 #define TRACEPOINT_DEFAULT_PRIO	10
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 4bdd84023f5b..49cce5fb54ee 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -35,14 +35,14 @@ TRACE_MAKE_SYSTEM_STR();
 
 #undef TRACE_DEFINE_ENUM
 #define TRACE_DEFINE_ENUM(a)				\
-	static struct trace_enum_map __used __initdata	\
+	static struct trace_eval_map __used __initdata	\
 	__##TRACE_SYSTEM##_##a =			\
 	{						\
 		.system = TRACE_SYSTEM_STRING,		\
-		.enum_string = #a,			\
-		.enum_value = a				\
+		.eval_string = #a,			\
+		.eval_value = a				\
 	};						\
-	static struct trace_enum_map __used		\
+	static struct trace_eval_map __used		\
 	__attribute__((section("_ftrace_eval_map")))	\
 	*TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index acd3eb4d56a0..46fac3f63af1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ union trace_enum_map_item;
 struct trace_enum_map_tail {
 	/*
 	 * "end" is first and points to NULL as it must be different
-	 * than "mod" or "enum_string"
+	 * than "mod" or "eval_string"
 	 */
 	union trace_enum_map_item	*next;
 	const char			*end;	/* points to NULL */
@@ -148,7 +148,7 @@ static DEFINE_MUTEX(trace_enum_mutex);
  * pointer to the next array of saved enum_map items.
  */
 union trace_enum_map_item {
-	struct trace_enum_map		map;
+	struct trace_eval_map		map;
 	struct trace_enum_map_head	head;
 	struct trace_enum_map_tail	tail;
 };
@@ -4748,7 +4748,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
 static union trace_enum_map_item *
 update_enum_map(union trace_enum_map_item *ptr)
 {
-	if (!ptr->map.enum_string) {
+	if (!ptr->map.eval_string) {
 		if (ptr->tail.next) {
 			ptr = ptr->tail.next;
 			/* Set ptr to the next real item (skip head) */
@@ -4808,7 +4808,7 @@ static int enum_map_show(struct seq_file *m, void *v)
 	union trace_enum_map_item *ptr = v;
 
 	seq_printf(m, "%s %ld (%s)\n",
-		   ptr->map.enum_string, ptr->map.enum_value,
+		   ptr->map.eval_string, ptr->map.eval_value,
 		   ptr->map.system);
 
 	return 0;
@@ -4844,11 +4844,11 @@ trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
 }
 
 static void
-trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 			   int len)
 {
-	struct trace_enum_map **stop;
-	struct trace_enum_map **map;
+	struct trace_eval_map **stop;
+	struct trace_eval_map **map;
 	union trace_enum_map_item *map_array;
 	union trace_enum_map_item *ptr;
 
@@ -4902,13 +4902,13 @@ static void trace_create_enum_file(struct dentry *d_tracer)
 #else /* CONFIG_TRACE_ENUM_MAP_FILE */
 static inline void trace_create_enum_file(struct dentry *d_tracer) { }
 static inline void trace_insert_enum_map_file(struct module *mod,
-			      struct trace_enum_map **start, int len) { }
+			      struct trace_eval_map **start, int len) { }
 #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
 
 static void trace_insert_enum_map(struct module *mod,
-				  struct trace_enum_map **start, int len)
+				  struct trace_eval_map **start, int len)
 {
-	struct trace_enum_map **map;
+	struct trace_eval_map **map;
 
 	if (len <= 0)
 		return;
@@ -7732,8 +7732,8 @@ struct dentry *tracing_init_dentry(void)
 	return NULL;
 }
 
-extern struct trace_enum_map *__start_ftrace_eval_maps[];
-extern struct trace_enum_map *__stop_ftrace_eval_maps[];
+extern struct trace_eval_map *__start_ftrace_eval_maps[];
+extern struct trace_eval_map *__stop_ftrace_eval_maps[];
 
 static void __init trace_enum_init(void)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..a9667297ae49 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall)
 
 #ifdef CONFIG_EVENT_TRACING
 void trace_event_init(void);
-void trace_event_enum_update(struct trace_enum_map **map, int len);
+void trace_event_enum_update(struct trace_eval_map **map, int len);
 #else
 static inline void __init trace_event_init(void) { }
-static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { }
 #endif
 
 extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..cf5b9aa4d732 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2067,18 +2067,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
 	return 0;
 }
 
-static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+static char *enum_replace(char *ptr, struct trace_eval_map *map, int len)
 {
 	int rlen;
 	int elen;
 
 	/* Find the length of the enum value as a string */
-	elen = snprintf(ptr, 0, "%ld", map->enum_value);
+	elen = snprintf(ptr, 0, "%ld", map->eval_value);
 	/* Make sure there's enough room to replace the string with the value */
 	if (len < elen)
 		return NULL;
 
-	snprintf(ptr, elen + 1, "%ld", map->enum_value);
+	snprintf(ptr, elen + 1, "%ld", map->eval_value);
 
 	/* Get the rest of the string of ptr */
 	rlen = strlen(ptr + len);
@@ -2090,11 +2090,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
 }
 
 static void update_event_printk(struct trace_event_call *call,
-				struct trace_enum_map *map)
+				struct trace_eval_map *map)
 {
 	char *ptr;
 	int quote = 0;
-	int len = strlen(map->enum_string);
+	int len = strlen(map->eval_string);
 
 	for (ptr = call->print_fmt; *ptr; ptr++) {
 		if (*ptr == '\\') {
@@ -2125,7 +2125,7 @@ static void update_event_printk(struct trace_event_call *call,
 			continue;
 		}
 		if (isalpha(*ptr) || *ptr == '_') {
-			if (strncmp(map->enum_string, ptr, len) == 0 &&
+			if (strncmp(map->eval_string, ptr, len) == 0 &&
 			    !isalnum(ptr[len]) && ptr[len] != '_') {
 				ptr = enum_replace(ptr, map, len);
 				/* Hmm, enum string smaller than value */
@@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call,
 	}
 }
 
-void trace_event_enum_update(struct trace_enum_map **map, int len)
+void trace_event_enum_update(struct trace_eval_map **map, int len)
 {
 	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
-- 
cgit v1.3-14-g43fede


From 99be647c5841d570a23b5dfa65bfecada8b6e6b5 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:44 -0500
Subject: trace: rename struct module entry for trace enums

Each module has a list of enums it's contributing to the
enum map. Rename that entry to reflect its use by more than
enums.

Link: http://lkml.kernel.org/r/20170531215653.3240-4-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/module.h | 4 ++--
 kernel/module.c        | 6 +++---
 kernel/trace/trace.c   | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 46b48043d741..8eb9a1e693e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -442,8 +442,8 @@ struct module {
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call **trace_events;
 	unsigned int num_trace_events;
-	struct trace_eval_map **trace_enums;
-	unsigned int num_trace_enums;
+	struct trace_eval_map **trace_evals;
+	unsigned int num_trace_evals;
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 	unsigned int num_ftrace_callsites;
diff --git a/kernel/module.c b/kernel/module.c
index 9ec4713c5eee..df1c4a9e7abb 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3077,9 +3077,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_events = section_objs(info, "_ftrace_events",
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
-	mod->trace_enums = section_objs(info, "_ftrace_eval_map",
-					sizeof(*mod->trace_enums),
-					&mod->num_trace_enums);
+	mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+					sizeof(*mod->trace_evals),
+					&mod->num_trace_evals);
 #endif
 #ifdef CONFIG_TRACING
 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46fac3f63af1..061abd8ba101 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7746,7 +7746,7 @@ static void __init trace_enum_init(void)
 #ifdef CONFIG_MODULES
 static void trace_module_add_enums(struct module *mod)
 {
-	if (!mod->num_trace_enums)
+	if (!mod->num_trace_evals)
 		return;
 
 	/*
@@ -7756,7 +7756,7 @@ static void trace_module_add_enums(struct module *mod)
 	if (trace_module_has_bad_taint(mod))
 		return;
 
-	trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+	trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals);
 }
 
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
@@ -7765,7 +7765,7 @@ static void trace_module_remove_enums(struct module *mod)
 	union trace_enum_map_item *map;
 	union trace_enum_map_item **last = &trace_enum_maps;
 
-	if (!mod->num_trace_enums)
+	if (!mod->num_trace_evals)
 		return;
 
 	mutex_lock(&trace_enum_mutex);
-- 
cgit v1.3-14-g43fede


From 23bf8cb8dc86c0368d2471ebb4622e7edd38190b Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:45 -0500
Subject: trace: rename trace enum data structures in trace.c

The enum map entries can be exported to userspace
via a sys enum_map file. Rename those functions
and structures to reflect the fact that we are using
them for more than enums.

Link: http://lkml.kernel.org/r/20170531215653.3240-5-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 061abd8ba101..13c81f4f2bd7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -122,38 +122,38 @@ int __disable_trace_on_warning;
 
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
 /* Map of enums to their values, for "enum_map" file */
-struct trace_enum_map_head {
+struct trace_eval_map_head {
 	struct module			*mod;
 	unsigned long			length;
 };
 
-union trace_enum_map_item;
+union trace_eval_map_item;
 
-struct trace_enum_map_tail {
+struct trace_eval_map_tail {
 	/*
 	 * "end" is first and points to NULL as it must be different
 	 * than "mod" or "eval_string"
 	 */
-	union trace_enum_map_item	*next;
+	union trace_eval_map_item	*next;
 	const char			*end;	/* points to NULL */
 };
 
 static DEFINE_MUTEX(trace_enum_mutex);
 
 /*
- * The trace_enum_maps are saved in an array with two extra elements,
+ * The trace_eval_maps are saved in an array with two extra elements,
  * one at the beginning, and one at the end. The beginning item contains
  * the count of the saved maps (head.length), and the module they
  * belong to if not built in (head.mod). The ending item contains a
  * pointer to the next array of saved enum_map items.
  */
-union trace_enum_map_item {
+union trace_eval_map_item {
 	struct trace_eval_map		map;
-	struct trace_enum_map_head	head;
-	struct trace_enum_map_tail	tail;
+	struct trace_eval_map_head	head;
+	struct trace_eval_map_tail	tail;
 };
 
-static union trace_enum_map_item *trace_enum_maps;
+static union trace_eval_map_item *trace_eval_maps;
 #endif /* CONFIG_TRACE_ENUM_MAP_FILE */
 
 static int tracing_set_tracer(struct trace_array *tr, const char *buf);
@@ -4745,8 +4745,8 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
 };
 
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static union trace_enum_map_item *
-update_enum_map(union trace_enum_map_item *ptr)
+static union trace_eval_map_item *
+update_enum_map(union trace_eval_map_item *ptr)
 {
 	if (!ptr->map.eval_string) {
 		if (ptr->tail.next) {
@@ -4761,7 +4761,7 @@ update_enum_map(union trace_enum_map_item *ptr)
 
 static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	union trace_enum_map_item *ptr = v;
+	union trace_eval_map_item *ptr = v;
 
 	/*
 	 * Paranoid! If ptr points to end, we don't want to increment past it.
@@ -4782,12 +4782,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *enum_map_start(struct seq_file *m, loff_t *pos)
 {
-	union trace_enum_map_item *v;
+	union trace_eval_map_item *v;
 	loff_t l = 0;
 
 	mutex_lock(&trace_enum_mutex);
 
-	v = trace_enum_maps;
+	v = trace_eval_maps;
 	if (v)
 		v++;
 
@@ -4805,7 +4805,7 @@ static void enum_map_stop(struct seq_file *m, void *v)
 
 static int enum_map_show(struct seq_file *m, void *v)
 {
-	union trace_enum_map_item *ptr = v;
+	union trace_eval_map_item *ptr = v;
 
 	seq_printf(m, "%s %ld (%s)\n",
 		   ptr->map.eval_string, ptr->map.eval_value,
@@ -4836,8 +4836,8 @@ static const struct file_operations tracing_enum_map_fops = {
 	.release	= seq_release,
 };
 
-static inline union trace_enum_map_item *
-trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+static inline union trace_eval_map_item *
+trace_enum_jmp_to_tail(union trace_eval_map_item *ptr)
 {
 	/* Return tail of array given the head */
 	return ptr + ptr->head.length + 1;
@@ -4849,13 +4849,13 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 {
 	struct trace_eval_map **stop;
 	struct trace_eval_map **map;
-	union trace_enum_map_item *map_array;
-	union trace_enum_map_item *ptr;
+	union trace_eval_map_item *map_array;
+	union trace_eval_map_item *ptr;
 
 	stop = start + len;
 
 	/*
-	 * The trace_enum_maps contains the map plus a head and tail item,
+	 * The trace_eval_maps contains the map plus a head and tail item,
 	 * where the head holds the module and length of array, and the
 	 * tail holds a pointer to the next list.
 	 */
@@ -4867,10 +4867,10 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 
 	mutex_lock(&trace_enum_mutex);
 
-	if (!trace_enum_maps)
-		trace_enum_maps = map_array;
+	if (!trace_eval_maps)
+		trace_eval_maps = map_array;
 	else {
-		ptr = trace_enum_maps;
+		ptr = trace_eval_maps;
 		for (;;) {
 			ptr = trace_enum_jmp_to_tail(ptr);
 			if (!ptr->tail.next)
@@ -7762,15 +7762,15 @@ static void trace_module_add_enums(struct module *mod)
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
 static void trace_module_remove_enums(struct module *mod)
 {
-	union trace_enum_map_item *map;
-	union trace_enum_map_item **last = &trace_enum_maps;
+	union trace_eval_map_item *map;
+	union trace_eval_map_item **last = &trace_eval_maps;
 
 	if (!mod->num_trace_evals)
 		return;
 
 	mutex_lock(&trace_enum_mutex);
 
-	map = trace_enum_maps;
+	map = trace_eval_maps;
 
 	while (map) {
 		if (map->head.mod == mod)
-- 
cgit v1.3-14-g43fede


From 1793ed939b2a0e18b06467e10d15b66925d75d5f Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:46 -0500
Subject: trace: rename trace_enum_mutex to trace_eval_mutex

There is a lock protecting the trace_enum_map. Rename
it to reflect its use by more than enums.

Link: http://lkml.kernel.org/r/20170531215653.3240-6-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 13c81f4f2bd7..e0279f5dc83f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -138,7 +138,7 @@ struct trace_eval_map_tail {
 	const char			*end;	/* points to NULL */
 };
 
-static DEFINE_MUTEX(trace_enum_mutex);
+static DEFINE_MUTEX(trace_eval_mutex);
 
 /*
  * The trace_eval_maps are saved in an array with two extra elements,
@@ -4785,7 +4785,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos)
 	union trace_eval_map_item *v;
 	loff_t l = 0;
 
-	mutex_lock(&trace_enum_mutex);
+	mutex_lock(&trace_eval_mutex);
 
 	v = trace_eval_maps;
 	if (v)
@@ -4800,7 +4800,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos)
 
 static void enum_map_stop(struct seq_file *m, void *v)
 {
-	mutex_unlock(&trace_enum_mutex);
+	mutex_unlock(&trace_eval_mutex);
 }
 
 static int enum_map_show(struct seq_file *m, void *v)
@@ -4865,7 +4865,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 		return;
 	}
 
-	mutex_lock(&trace_enum_mutex);
+	mutex_lock(&trace_eval_mutex);
 
 	if (!trace_eval_maps)
 		trace_eval_maps = map_array;
@@ -4890,7 +4890,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 	}
 	memset(map_array, 0, sizeof(*map_array));
 
-	mutex_unlock(&trace_enum_mutex);
+	mutex_unlock(&trace_eval_mutex);
 }
 
 static void trace_create_enum_file(struct dentry *d_tracer)
@@ -7768,7 +7768,7 @@ static void trace_module_remove_enums(struct module *mod)
 	if (!mod->num_trace_evals)
 		return;
 
-	mutex_lock(&trace_enum_mutex);
+	mutex_lock(&trace_eval_mutex);
 
 	map = trace_eval_maps;
 
@@ -7785,7 +7785,7 @@ static void trace_module_remove_enums(struct module *mod)
 	*last = trace_enum_jmp_to_tail(map)->tail.next;
 	kfree(map);
  out:
-	mutex_unlock(&trace_enum_mutex);
+	mutex_unlock(&trace_eval_mutex);
 }
 #else
 static inline void trace_module_remove_enums(struct module *mod) { }
-- 
cgit v1.3-14-g43fede


From 5f60b351a7e37bf243725bcc131708be2c8ea497 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:47 -0500
Subject: trace: rename trace.c enum functions

Rename the init and trace_enum_jmp_to_tail() routines
to reflect their use by more than enumerated types.

Link: http://lkml.kernel.org/r/20170531215653.3240-7-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e0279f5dc83f..d703f429bbd9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4837,7 +4837,7 @@ static const struct file_operations tracing_enum_map_fops = {
 };
 
 static inline union trace_eval_map_item *
-trace_enum_jmp_to_tail(union trace_eval_map_item *ptr)
+trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
 {
 	/* Return tail of array given the head */
 	return ptr + ptr->head.length + 1;
@@ -4872,7 +4872,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 	else {
 		ptr = trace_eval_maps;
 		for (;;) {
-			ptr = trace_enum_jmp_to_tail(ptr);
+			ptr = trace_eval_jmp_to_tail(ptr);
 			if (!ptr->tail.next)
 				break;
 			ptr = ptr->tail.next;
@@ -7735,7 +7735,7 @@ struct dentry *tracing_init_dentry(void)
 extern struct trace_eval_map *__start_ftrace_eval_maps[];
 extern struct trace_eval_map *__stop_ftrace_eval_maps[];
 
-static void __init trace_enum_init(void)
+static void __init trace_eval_init(void)
 {
 	int len;
 
@@ -7775,14 +7775,14 @@ static void trace_module_remove_enums(struct module *mod)
 	while (map) {
 		if (map->head.mod == mod)
 			break;
-		map = trace_enum_jmp_to_tail(map);
+		map = trace_eval_jmp_to_tail(map);
 		last = &map->tail.next;
 		map = map->tail.next;
 	}
 	if (!map)
 		goto out;
 
-	*last = trace_enum_jmp_to_tail(map)->tail.next;
+	*last = trace_eval_jmp_to_tail(map)->tail.next;
 	kfree(map);
  out:
 	mutex_unlock(&trace_eval_mutex);
@@ -7839,7 +7839,7 @@ static __init int tracer_init_tracefs(void)
 	trace_create_file("saved_cmdlines_size", 0644, d_tracer,
 			  NULL, &tracing_saved_cmdlines_size_fops);
 
-	trace_enum_init();
+	trace_eval_init();
 
 	trace_create_enum_file(d_tracer);
 
-- 
cgit v1.3-14-g43fede


From f57a41434fc51732dd5e35e0e1aa9e607f1a05d6 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:48 -0500
Subject: trace: rename enum_map functions

Rename the core trace enum routines to use eval, to
reflect their use by more than just enum to value mapping.

Link: http://lkml.kernel.org/r/20170531215653.3240-8-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 74 ++++++++++++++++++++++-----------------------
 kernel/trace/trace.h        |  4 +--
 kernel/trace/trace_events.c |  2 +-
 3 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d703f429bbd9..d830be7f0ba6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex);
  * one at the beginning, and one at the end. The beginning item contains
  * the count of the saved maps (head.length), and the module they
  * belong to if not built in (head.mod). The ending item contains a
- * pointer to the next array of saved enum_map items.
+ * pointer to the next array of saved enum_eval/enum_map items.
  */
 union trace_eval_map_item {
 	struct trace_eval_map		map;
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
 
 /*
  * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
- * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
  * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
- * of strings in the order that the enums were defined.
+ * of strings in the order that the evals (enum) were defined.
  */
 #undef C
 #define C(a, b) b
@@ -4746,7 +4746,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
 
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
 static union trace_eval_map_item *
-update_enum_map(union trace_eval_map_item *ptr)
+update_eval_map(union trace_eval_map_item *ptr)
 {
 	if (!ptr->map.eval_string) {
 		if (ptr->tail.next) {
@@ -4759,7 +4759,7 @@ update_enum_map(union trace_eval_map_item *ptr)
 	return ptr;
 }
 
-static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	union trace_eval_map_item *ptr = v;
 
@@ -4767,7 +4767,7 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
 	 * Paranoid! If ptr points to end, we don't want to increment past it.
 	 * This really should never happen.
 	 */
-	ptr = update_enum_map(ptr);
+	ptr = update_eval_map(ptr);
 	if (WARN_ON_ONCE(!ptr))
 		return NULL;
 
@@ -4775,12 +4775,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 
-	ptr = update_enum_map(ptr);
+	ptr = update_eval_map(ptr);
 
 	return ptr;
 }
 
-static void *enum_map_start(struct seq_file *m, loff_t *pos)
+static void *eval_map_start(struct seq_file *m, loff_t *pos)
 {
 	union trace_eval_map_item *v;
 	loff_t l = 0;
@@ -4792,18 +4792,18 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos)
 		v++;
 
 	while (v && l < *pos) {
-		v = enum_map_next(m, v, &l);
+		v = eval_map_next(m, v, &l);
 	}
 
 	return v;
 }
 
-static void enum_map_stop(struct seq_file *m, void *v)
+static void eval_map_stop(struct seq_file *m, void *v)
 {
 	mutex_unlock(&trace_eval_mutex);
 }
 
-static int enum_map_show(struct seq_file *m, void *v)
+static int eval_map_show(struct seq_file *m, void *v)
 {
 	union trace_eval_map_item *ptr = v;
 
@@ -4814,23 +4814,23 @@ static int enum_map_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static const struct seq_operations tracing_enum_map_seq_ops = {
-	.start		= enum_map_start,
-	.next		= enum_map_next,
-	.stop		= enum_map_stop,
-	.show		= enum_map_show,
+static const struct seq_operations tracing_eval_map_seq_ops = {
+	.start		= eval_map_start,
+	.next		= eval_map_next,
+	.stop		= eval_map_stop,
+	.show		= eval_map_show,
 };
 
-static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+static int tracing_eval_map_open(struct inode *inode, struct file *filp)
 {
 	if (tracing_disabled)
 		return -ENODEV;
 
-	return seq_open(filp, &tracing_enum_map_seq_ops);
+	return seq_open(filp, &tracing_eval_map_seq_ops);
 }
 
-static const struct file_operations tracing_enum_map_fops = {
-	.open		= tracing_enum_map_open,
+static const struct file_operations tracing_eval_map_fops = {
+	.open		= tracing_eval_map_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
@@ -4844,7 +4844,7 @@ trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
 }
 
 static void
-trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
+trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
 			   int len)
 {
 	struct trace_eval_map **stop;
@@ -4861,7 +4861,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 	 */
 	map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
 	if (!map_array) {
-		pr_warn("Unable to allocate trace enum mapping\n");
+		pr_warn("Unable to allocate trace eval mapping\n");
 		return;
 	}
 
@@ -4893,19 +4893,19 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start,
 	mutex_unlock(&trace_eval_mutex);
 }
 
-static void trace_create_enum_file(struct dentry *d_tracer)
+static void trace_create_eval_file(struct dentry *d_tracer)
 {
 	trace_create_file("enum_map", 0444, d_tracer,
-			  NULL, &tracing_enum_map_fops);
+			  NULL, &tracing_eval_map_fops);
 }
 
 #else /* CONFIG_TRACE_ENUM_MAP_FILE */
-static inline void trace_create_enum_file(struct dentry *d_tracer) { }
-static inline void trace_insert_enum_map_file(struct module *mod,
+static inline void trace_create_eval_file(struct dentry *d_tracer) { }
+static inline void trace_insert_eval_map_file(struct module *mod,
 			      struct trace_eval_map **start, int len) { }
 #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
 
-static void trace_insert_enum_map(struct module *mod,
+static void trace_insert_eval_map(struct module *mod,
 				  struct trace_eval_map **start, int len)
 {
 	struct trace_eval_map **map;
@@ -4915,9 +4915,9 @@ static void trace_insert_enum_map(struct module *mod,
 
 	map = start;
 
-	trace_event_enum_update(map, len);
+	trace_event_eval_update(map, len);
 
-	trace_insert_enum_map_file(mod, start, len);
+	trace_insert_eval_map_file(mod, start, len);
 }
 
 static ssize_t
@@ -7740,11 +7740,11 @@ static void __init trace_eval_init(void)
 	int len;
 
 	len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
-	trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len);
+	trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
 }
 
 #ifdef CONFIG_MODULES
-static void trace_module_add_enums(struct module *mod)
+static void trace_module_add_evals(struct module *mod)
 {
 	if (!mod->num_trace_evals)
 		return;
@@ -7756,11 +7756,11 @@ static void trace_module_add_enums(struct module *mod)
 	if (trace_module_has_bad_taint(mod))
 		return;
 
-	trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals);
+	trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
 }
 
 #ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static void trace_module_remove_enums(struct module *mod)
+static void trace_module_remove_evals(struct module *mod)
 {
 	union trace_eval_map_item *map;
 	union trace_eval_map_item **last = &trace_eval_maps;
@@ -7788,7 +7788,7 @@ static void trace_module_remove_enums(struct module *mod)
 	mutex_unlock(&trace_eval_mutex);
 }
 #else
-static inline void trace_module_remove_enums(struct module *mod) { }
+static inline void trace_module_remove_evals(struct module *mod) { }
 #endif /* CONFIG_TRACE_ENUM_MAP_FILE */
 
 static int trace_module_notify(struct notifier_block *self,
@@ -7798,10 +7798,10 @@ static int trace_module_notify(struct notifier_block *self,
 
 	switch (val) {
 	case MODULE_STATE_COMING:
-		trace_module_add_enums(mod);
+		trace_module_add_evals(mod);
 		break;
 	case MODULE_STATE_GOING:
-		trace_module_remove_enums(mod);
+		trace_module_remove_evals(mod);
 		break;
 	}
 
@@ -7841,7 +7841,7 @@ static __init int tracer_init_tracefs(void)
 
 	trace_eval_init();
 
-	trace_create_enum_file(d_tracer);
+	trace_create_eval_file(d_tracer);
 
 #ifdef CONFIG_MODULES
 	register_module_notifier(&trace_module_nb);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a9667297ae49..69a3ab3ee4f5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall)
 
 #ifdef CONFIG_EVENT_TRACING
 void trace_event_init(void);
-void trace_event_enum_update(struct trace_eval_map **map, int len);
+void trace_event_eval_update(struct trace_eval_map **map, int len);
 #else
 static inline void __init trace_event_init(void) { }
-static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { }
+static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
 #endif
 
 extern struct trace_iterator *tracepoint_print_iter;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cf5b9aa4d732..e6897b005947 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call,
 	}
 }
 
-void trace_event_enum_update(struct trace_eval_map **map, int len)
+void trace_event_eval_update(struct trace_eval_map **map, int len)
 {
 	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
-- 
cgit v1.3-14-g43fede


From 67ec0d85955630924b971e04c0954370a74b8706 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:49 -0500
Subject: tracing: Rename enum_replace to eval_replace

The enum_replace stanza works as is for sizeof()
calls as well as enums. Rename it as well.

Link: http://lkml.kernel.org/r/20170531215653.3240-9-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e6897b005947..83dfd0dbbbfe 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2067,12 +2067,12 @@ __register_event(struct trace_event_call *call, struct module *mod)
 	return 0;
 }
 
-static char *enum_replace(char *ptr, struct trace_eval_map *map, int len)
+static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
 {
 	int rlen;
 	int elen;
 
-	/* Find the length of the enum value as a string */
+	/* Find the length of the eval value as a string */
 	elen = snprintf(ptr, 0, "%ld", map->eval_value);
 	/* Make sure there's enough room to replace the string with the value */
 	if (len < elen)
@@ -2127,14 +2127,14 @@ static void update_event_printk(struct trace_event_call *call,
 		if (isalpha(*ptr) || *ptr == '_') {
 			if (strncmp(map->eval_string, ptr, len) == 0 &&
 			    !isalnum(ptr[len]) && ptr[len] != '_') {
-				ptr = enum_replace(ptr, map, len);
-				/* Hmm, enum string smaller than value */
+				ptr = eval_replace(ptr, map, len);
+				/* enum/sizeof string smaller than value */
 				if (WARN_ON_ONCE(!ptr))
 					return;
 				/*
-				 * No need to decrement here, as enum_replace()
+				 * No need to decrement here, as eval_replace()
 				 * returns the pointer to the character passed
-				 * the enum, and two enums can not be placed
+				 * the eval, and two evals can not be placed
 				 * back to back without something in between.
 				 * We can skip that something in between.
 				 */
-- 
cgit v1.3-14-g43fede


From 681bec0367c2606b6154060310a2ffa543175980 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Wed, 31 May 2017 16:56:53 -0500
Subject: tracing: Rename and update the enum_map file

The enum_map file is used to display a list of symbol
to name conversions. As it's now used to resolve sizeof,
let's update the name and description.

Link: http://lkml.kernel.org/r/20170531215653.3240-13-jeremy.linton@arm.com

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 22 +++++++++++-----------
 kernel/trace/trace.c | 20 ++++++++++----------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
 
 	 If unsure, say N
 
-config TRACE_ENUM_MAP_FILE
-       bool "Show enum mappings for trace events"
+config TRACE_EVAL_MAP_FILE
+       bool "Show eval mappings for trace events"
        depends on TRACING
        help
-        The "print fmt" of the trace events will show the enum names instead
-	of their values. This can cause problems for user space tools that
-	use this string to parse the raw data as user space does not know
+	The "print fmt" of the trace events will show the enum/sizeof names
+	instead	of their values. This can cause problems for user space tools
+	that use this string to parse the raw data as user space does not know
 	how to convert the string to its value.
 
 	To fix this, there's a special macro in the kernel that can be used
-	to convert the enum into its value. If this macro is used, then the
-	print fmt strings will have the enums converted to their values.
+	to convert an enum/sizeof into its value. If this macro is used, then
+	the print fmt strings will be converted to their values.
 
 	If something does not get converted properly, this option can be
-	used to show what enums the kernel tried to convert.
+	used to show what enums/sizeof the kernel tried to convert.
 
-	This option is for debugging the enum conversions. A file is created
-	in the tracing directory called "enum_map" that will show the enum
+	This option is for debugging the conversions. A file is created
+	in the tracing directory called "eval_map" that will show the
 	names matched with their values and what trace event system they
 	belong too.
 
 	Normally, the mapping of the strings to values will be freed after
 	boot up or module load. With this option, they will not be freed, as
-	they are needed for the "enum_map" file. Enabling this option will
+	they are needed for the "eval_map" file. Enabling this option will
 	increase the memory footprint of the running kernel.
 
 	If unsure, say N
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d830be7f0ba6..19ac2088d10a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -120,8 +120,8 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
 /* When set, tracing will stop when a WARN*() is hit */
 int __disable_trace_on_warning;
 
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-/* Map of enums to their values, for "enum_map" file */
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+/* Map of enums to their values, for "eval_map" file */
 struct trace_eval_map_head {
 	struct module			*mod;
 	unsigned long			length;
@@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex);
  * one at the beginning, and one at the end. The beginning item contains
  * the count of the saved maps (head.length), and the module they
  * belong to if not built in (head.mod). The ending item contains a
- * pointer to the next array of saved enum_eval/enum_map items.
+ * pointer to the next array of saved eval_map items.
  */
 union trace_eval_map_item {
 	struct trace_eval_map		map;
@@ -154,7 +154,7 @@ union trace_eval_map_item {
 };
 
 static union trace_eval_map_item *trace_eval_maps;
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
 
 static int tracing_set_tracer(struct trace_array *tr, const char *buf);
 
@@ -4744,7 +4744,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
 	.write		= tracing_saved_cmdlines_size_write,
 };
 
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
 static union trace_eval_map_item *
 update_eval_map(union trace_eval_map_item *ptr)
 {
@@ -4895,15 +4895,15 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
 
 static void trace_create_eval_file(struct dentry *d_tracer)
 {
-	trace_create_file("enum_map", 0444, d_tracer,
+	trace_create_file("eval_map", 0444, d_tracer,
 			  NULL, &tracing_eval_map_fops);
 }
 
-#else /* CONFIG_TRACE_ENUM_MAP_FILE */
+#else /* CONFIG_TRACE_EVAL_MAP_FILE */
 static inline void trace_create_eval_file(struct dentry *d_tracer) { }
 static inline void trace_insert_eval_map_file(struct module *mod,
 			      struct trace_eval_map **start, int len) { }
-#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
 
 static void trace_insert_eval_map(struct module *mod,
 				  struct trace_eval_map **start, int len)
@@ -7759,7 +7759,7 @@ static void trace_module_add_evals(struct module *mod)
 	trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
 }
 
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
 static void trace_module_remove_evals(struct module *mod)
 {
 	union trace_eval_map_item *map;
@@ -7789,7 +7789,7 @@ static void trace_module_remove_evals(struct module *mod)
 }
 #else
 static inline void trace_module_remove_evals(struct module *mod) { }
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
 
 static int trace_module_notify(struct notifier_block *self,
 			       unsigned long val, void *data)
-- 
cgit v1.3-14-g43fede


From 86a9c446c13ecd8793ea8599761322aed125d542 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:26 +0100
Subject: posix-cpu-timers: Move copyout of timespec into do_cpu_nanosleep()

The posix-cpu-timer nanosleep() implementation can be simplified by moving
the copy out of the remaining time to do_cpu_nanosleep() which is shared
between the real nanosleep function and the restart function.

The pointer to the timespec64 which is updated has to be stored in the
restart block anyway. Instead of storing it only in the restart case, store
it before calling do_cpu_nanosleep() and copy the remaining time in the
signal exit path.

[ tglx: Added changelog ]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-1-viro@ZenIV.linux.org.uk
---
 kernel/time/posix-cpu-timers.c | 63 +++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index cb4a4eb44279..239fff980fd0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1226,9 +1226,10 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
-			    struct timespec64 *rqtp, struct itimerspec64 *it)
+			    struct timespec64 *rqtp)
 {
 	struct k_itimer timer;
+	struct itimerspec64 it;
 	int error;
 
 	/*
@@ -1242,12 +1243,14 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	timer.it_process = current;
 	if (!error) {
 		static struct itimerspec64 zero_it;
+		struct restart_block *restart = &current->restart_block;
+		struct timespec __user *rmtp;
 
-		memset(it, 0, sizeof *it);
-		it->it_value = *rqtp;
+		memset(&it, 0, sizeof it);
+		it.it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
-		error = posix_cpu_timer_set(&timer, flags, it, NULL);
+		error = posix_cpu_timer_set(&timer, flags, &it, NULL);
 		if (error) {
 			spin_unlock_irq(&timer.it_lock);
 			return error;
@@ -1277,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * We were interrupted by a signal.
 		 */
 		*rqtp = ns_to_timespec64(timer.it.cpu.expires);
-		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
 		if (!error) {
 			/*
 			 * Timer is now unarmed, deletion can not fail.
@@ -1297,7 +1300,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 			spin_unlock_irq(&timer.it_lock);
 		}
 
-		if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
+		if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
 			/*
 			 * It actually did fire already.
 			 */
@@ -1305,6 +1308,18 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		}
 
 		error = -ERESTART_RESTARTBLOCK;
+		/*
+		 * Report back to the user the time still remaining.
+		 */
+		rmtp = restart->nanosleep.rmtp;
+		if (rmtp) {
+			struct timespec ts;
+
+			ts = timespec64_to_timespec(it.it_value);
+			if (copy_to_user(rmtp, &ts, sizeof(*rmtp)))
+				return -EFAULT;
+		}
+		restart->nanosleep.expires = timespec64_to_ns(rqtp);
 	}
 
 	return error;
@@ -1316,10 +1331,13 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 			    struct timespec64 *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block = &current->restart_block;
-	struct itimerspec64 it;
-	struct timespec ts;
 	int error;
 
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+
+	restart_block->nanosleep.rmtp = rmtp;
+
 	/*
 	 * Diagnose required errors first.
 	 */
@@ -1328,23 +1346,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 	     CPUCLOCK_PID(which_clock) == task_pid_vnr(current)))
 		return -EINVAL;
 
-	error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+	error = do_cpu_nanosleep(which_clock, flags, rqtp);
 
 	if (error == -ERESTART_RESTARTBLOCK) {
 
 		if (flags & TIMER_ABSTIME)
 			return -ERESTARTNOHAND;
-		/*
-		 * Report back to the user the time still remaining.
-		 */
-		ts = timespec64_to_timespec(it.it_value);
-		if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
-			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
 		restart_block->nanosleep.clockid = which_clock;
-		restart_block->nanosleep.rmtp = rmtp;
-		restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
 	}
 	return error;
 }
@@ -1352,28 +1362,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->nanosleep.clockid;
-	struct itimerspec64 it;
 	struct timespec64 t;
-	struct timespec tmp;
-	int error;
 
 	t = ns_to_timespec64(restart_block->nanosleep.expires);
 
-	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
-
-	if (error == -ERESTART_RESTARTBLOCK) {
-		struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
-		/*
-		 * Report back to the user the time still remaining.
-		 */
-		 tmp = timespec64_to_timespec(it.it_value);
-		if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
-			return -EFAULT;
-
-		restart_block->nanosleep.expires = timespec64_to_ns(&t);
-	}
-	return error;
-
+	return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
 }
 
 #define PROCESS_CLOCK	MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
-- 
cgit v1.3-14-g43fede


From 15f27ce24cb613e6e01ce27c4094c55e55dde5d4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:27 +0100
Subject: alarmtimer: Move copyout and freeze handling into
 alarmtimer_do_nsleep()

The alarmtimer nanosleep() implementation can be simplified by moving the
copy out of the remaining time to alarmtimer_do_nsleep() which is shared
between the real nanosleep function and the restart function.

The pointer to the timespec64 which is updated has to be stored in the
restart block anyway. Instead of storing it only in the restart case, store
it before calling alarmtimer_do_nsleep() and copy the remaining time in the
signal exit path.

[ tglx: Added changelog ]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-2-viro@ZenIV.linux.org.uk
---
 kernel/time/alarmtimer.c | 102 +++++++++++++++--------------------------------
 1 file changed, 32 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index d8a7a7e214de..ac6e9bc6cc59 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -688,8 +688,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
  *
  * Sets the alarm timer and sleeps until it is fired or interrupted.
  */
-static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
+static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
+				enum alarmtimer_type type)
 {
+	struct timespec __user *rmtp;
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -702,36 +704,26 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
 
 	__set_current_state(TASK_RUNNING);
 
-	return (alarm->data == NULL);
-}
-
-
-/**
- * update_rmtp - Update remaining timespec value
- * @exp: expiration time
- * @type: timer type
- * @rmtp: user pointer to remaining timepsec value
- *
- * Helper function that fills in rmtp value with time between
- * now and the exp value
- */
-static int update_rmtp(ktime_t exp, enum  alarmtimer_type type,
-			struct timespec __user *rmtp)
-{
-	struct timespec rmt;
-	ktime_t rem;
-
-	rem = ktime_sub(exp, alarm_bases[type].gettime());
-
-	if (rem <= 0)
+	if (!alarm->data)
 		return 0;
-	rmt = ktime_to_timespec(rem);
 
-	if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
-		return -EFAULT;
+	if (freezing(current))
+		alarmtimer_freezerset(absexp, type);
+	rmtp = current->restart_block.nanosleep.rmtp;
+	if (rmtp) {
+		struct timespec rmt;
+		ktime_t rem;
 
-	return 1;
+		rem = ktime_sub(absexp, alarm_bases[type].gettime());
 
+		if (rem <= 0)
+			return 0;
+		rmt = ktime_to_timespec(rem);
+
+		if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+			return -EFAULT;
+	}
+	return -ERESTART_RESTARTBLOCK;
 }
 
 /**
@@ -743,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum  alarmtimer_type type,
 static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
 {
 	enum  alarmtimer_type type = restart->nanosleep.clockid;
-	ktime_t exp;
-	struct timespec __user  *rmtp;
+	ktime_t exp = restart->nanosleep.expires;
 	struct alarm alarm;
-	int ret = 0;
 
-	exp = restart->nanosleep.expires;
 	alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
 
-	if (alarmtimer_do_nsleep(&alarm, exp))
-		goto out;
-
-	if (freezing(current))
-		alarmtimer_freezerset(exp, type);
-
-	rmtp = restart->nanosleep.rmtp;
-	if (rmtp) {
-		ret = update_rmtp(exp, type, rmtp);
-		if (ret <= 0)
-			goto out;
-	}
-
-
-	/* The other values in restart are already filled in */
-	ret = -ERESTART_RESTARTBLOCK;
-out:
-	return ret;
+	return alarmtimer_do_nsleep(&alarm, exp, type);
 }
 
 /**
@@ -785,11 +757,16 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 			      struct timespec __user *rmtp)
 {
 	enum  alarmtimer_type type = clock2alarm(which_clock);
-	struct restart_block *restart;
+	struct restart_block *restart = &current->restart_block;
 	struct alarm alarm;
 	ktime_t exp;
 	int ret = 0;
 
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+
+	restart->nanosleep.rmtp = rmtp;
+
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
 
@@ -808,32 +785,17 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 		exp = ktime_add(now, exp);
 	}
 
-	if (alarmtimer_do_nsleep(&alarm, exp))
-		goto out;
-
-	if (freezing(current))
-		alarmtimer_freezerset(exp, type);
+	ret = alarmtimer_do_nsleep(&alarm, exp, type);
+	if (ret != -ERESTART_RESTARTBLOCK)
+		return ret;
 
 	/* abs timers don't set remaining time or restart */
-	if (flags == TIMER_ABSTIME) {
-		ret = -ERESTARTNOHAND;
-		goto out;
-	}
-
-	if (rmtp) {
-		ret = update_rmtp(exp, type, rmtp);
-		if (ret <= 0)
-			goto out;
-	}
+	if (flags == TIMER_ABSTIME)
+		return -ERESTARTNOHAND;
 
-	restart = &current->restart_block;
 	restart->fn = alarm_timer_nsleep_restart;
 	restart->nanosleep.clockid = type;
 	restart->nanosleep.expires = exp;
-	restart->nanosleep.rmtp = rmtp;
-	ret = -ERESTART_RESTARTBLOCK;
-
-out:
 	return ret;
 }
 
-- 
cgit v1.3-14-g43fede


From 192a82f9003fe8fabd6088aa646e829225a94c55 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:28 +0100
Subject: hrtimer_nanosleep(): Pass rmtp in restart_block

Store the pointer to the timespec which gets updated with the remaining
time in the restart block and remove the function argument.

[ tglx: Added changelog ]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-3-viro@ZenIV.linux.org.uk
---
 include/linux/hrtimer.h    |  1 -
 kernel/compat.c            |  6 +++---
 kernel/time/hrtimer.c      | 11 ++++++-----
 kernel/time/posix-stubs.c  |  5 ++++-
 kernel/time/posix-timers.c |  5 ++++-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 8c5b10eb7265..b80c34f6fd4b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -453,7 +453,6 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 
 /* Precise sleep: */
 extern long hrtimer_nanosleep(struct timespec64 *rqtp,
-			      struct timespec __user *rmtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
 extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
diff --git a/kernel/compat.c b/kernel/compat.c
index 933bcb31ae10..cc9ba9d29b47 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	ret = hrtimer_nanosleep(&tu64,
-				rmtp ? (struct timespec __user *)&rmt : NULL,
-				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	current->restart_block.nanosleep.rmtp =
+				rmtp ? (struct timespec __user *)&rmt : NULL;
+	ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 	set_fs(oldfs);
 
 	/*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ac053bb5296e..4ae777f159de 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1503,10 +1503,11 @@ out:
 	return ret;
 }
 
-long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
+long hrtimer_nanosleep(struct timespec64 *rqtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
-	struct restart_block *restart;
+	struct restart_block *restart = &current->restart_block;
+	struct timespec __user *rmtp;
 	struct hrtimer_sleeper t;
 	int ret = 0;
 	u64 slack;
@@ -1526,16 +1527,15 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
 		goto out;
 	}
 
+	rmtp = restart->nanosleep.rmtp;
 	if (rmtp) {
 		ret = update_rmtp(&t.timer, rmtp);
 		if (ret <= 0)
 			goto out;
 	}
 
-	restart = &current->restart_block;
 	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
-	restart->nanosleep.rmtp = rmtp;
 	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
 
 	ret = -ERESTART_RESTARTBLOCK;
@@ -1557,7 +1557,8 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 	if (!timespec64_valid(&tu64))
 		return -EINVAL;
 
-	return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	current->restart_block.nanosleep.rmtp = rmtp;
+	return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
 /*
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index c0cd53eb018a..156a5e6f3bd2 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -115,7 +115,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		t64 = timespec_to_timespec64(t);
 		if (!timespec64_valid(&t64))
 			return -EINVAL;
-		return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
+		if (flags & TIMER_ABSTIME)
+			rmtp = NULL;
+		current->restart_block.nanosleep.rmtp = rmtp;
+		return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
 					 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 					 which_clock);
 	default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 58c0f60b132f..1a9f59f8afc2 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1043,7 +1043,10 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 static int common_nsleep(const clockid_t which_clock, int flags,
 			 struct timespec64 *tsave, struct timespec __user *rmtp)
 {
-	return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+	current->restart_block.nanosleep.rmtp = rmtp;
+	return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 				 which_clock);
 }
-- 
cgit v1.3-14-g43fede


From a7602681fc63f1a3ddd3da336296c9634c2ff974 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:29 +0100
Subject: hrtimer: Move copyout of remaining time to do_nanosleep()

The hrtimer nanosleep() implementation can be simplified by moving the copy
out of the remaining time to do_nanosleep() which is shared between the
real nanosleep function and the restart function.

The pointer to the timespec64 which is updated is already stored in the
restart block at the call site, so the separate handling of nanosleep and
restart function can be avoided.

[ tglx: Added changelog ]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-4-viro@ZenIV.linux.org.uk
---
 kernel/time/hrtimer.c | 62 +++++++++++++++++----------------------------------
 1 file changed, 20 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4ae777f159de..baa7b846b6e3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1441,6 +1441,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
+	struct timespec __user *rmtp;
 	hrtimer_init_sleeper(t, current);
 
 	do {
@@ -1457,48 +1458,33 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 
 	__set_current_state(TASK_RUNNING);
 
-	return t->task == NULL;
-}
-
-static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
-{
-	struct timespec rmt;
-	ktime_t rem;
-
-	rem = hrtimer_expires_remaining(timer);
-	if (rem <= 0)
+	if (!t->task)
 		return 0;
-	rmt = ktime_to_timespec(rem);
 
-	if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
-		return -EFAULT;
-
-	return 1;
+	rmtp = current->restart_block.nanosleep.rmtp;
+	if (rmtp) {
+		struct timespec rmt;
+		ktime_t rem = hrtimer_expires_remaining(&t->timer);
+		if (rem <= 0)
+			return 0;
+		rmt = ktime_to_timespec(rem);
+
+		if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+			return -EFAULT;
+	}
+	return -ERESTART_RESTARTBLOCK;
 }
 
 long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
-	struct timespec __user  *rmtp;
-	int ret = 0;
+	int ret;
 
 	hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
 				HRTIMER_MODE_ABS);
 	hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
 
-	if (do_nanosleep(&t, HRTIMER_MODE_ABS))
-		goto out;
-
-	rmtp = restart->nanosleep.rmtp;
-	if (rmtp) {
-		ret = update_rmtp(&t.timer, rmtp);
-		if (ret <= 0)
-			goto out;
-	}
-
-	/* The other values in restart are already filled in */
-	ret = -ERESTART_RESTARTBLOCK;
-out:
+	ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
 }
@@ -1506,8 +1492,7 @@ out:
 long hrtimer_nanosleep(struct timespec64 *rqtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
-	struct restart_block *restart = &current->restart_block;
-	struct timespec __user *rmtp;
+	struct restart_block *restart;
 	struct hrtimer_sleeper t;
 	int ret = 0;
 	u64 slack;
@@ -1518,7 +1503,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp,
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
 	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
-	if (do_nanosleep(&t, mode))
+	ret = do_nanosleep(&t, mode);
+	if (ret != -ERESTART_RESTARTBLOCK)
 		goto out;
 
 	/* Absolute timers do not update the rmtp value and restart: */
@@ -1527,18 +1513,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp,
 		goto out;
 	}
 
-	rmtp = restart->nanosleep.rmtp;
-	if (rmtp) {
-		ret = update_rmtp(&t.timer, rmtp);
-		if (ret <= 0)
-			goto out;
-	}
-
+	restart = &current->restart_block;
 	restart->fn = hrtimer_nanosleep_restart;
 	restart->nanosleep.clockid = t.timer.base->clockid;
 	restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
-
-	ret = -ERESTART_RESTARTBLOCK;
 out:
 	destroy_hrtimer_on_stack(&t.timer);
 	return ret;
-- 
cgit v1.3-14-g43fede


From 99e6c0e6ec349575886ca7daffc9cb7ec583176f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:30 +0100
Subject: posix-timers: Store rmtp into restart_block in sys_clock_nanosleep()

... instead of doing that in every ->nsleep() instance

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-5-viro@ZenIV.linux.org.uk
---
 kernel/time/alarmtimer.c       |  8 +-------
 kernel/time/posix-cpu-timers.c | 12 +++---------
 kernel/time/posix-timers.c     | 10 +++++-----
 kernel/time/posix-timers.h     |  2 +-
 4 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ac6e9bc6cc59..d859a3601ddd 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -753,8 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
  * Handles clock_nanosleep calls against _ALARM clockids
  */
 static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec64 *tsreq,
-			      struct timespec __user *rmtp)
+			      struct timespec64 *tsreq)
 {
 	enum  alarmtimer_type type = clock2alarm(which_clock);
 	struct restart_block *restart = &current->restart_block;
@@ -762,11 +761,6 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 	ktime_t exp;
 	int ret = 0;
 
-	if (flags & TIMER_ABSTIME)
-		rmtp = NULL;
-
-	restart->nanosleep.rmtp = rmtp;
-
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 239fff980fd0..ec6258c9cde5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1328,16 +1328,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-			    struct timespec64 *rqtp, struct timespec __user *rmtp)
+			    struct timespec64 *rqtp)
 {
 	struct restart_block *restart_block = &current->restart_block;
 	int error;
 
-	if (flags & TIMER_ABSTIME)
-		rmtp = NULL;
-
-	restart_block->nanosleep.rmtp = rmtp;
-
 	/*
 	 * Diagnose required errors first.
 	 */
@@ -1388,10 +1383,9 @@ static int process_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec64 *rqtp,
-			      struct timespec __user *rmtp)
+			      struct timespec64 *rqtp)
 {
-	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
+	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
 }
 static long process_cpu_nsleep_restart(struct restart_block *restart_block)
 {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1a9f59f8afc2..a3e5c01b430e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1041,11 +1041,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
  * nanosleep for monotonic and realtime clocks
  */
 static int common_nsleep(const clockid_t which_clock, int flags,
-			 struct timespec64 *tsave, struct timespec __user *rmtp)
+			 struct timespec64 *tsave)
 {
-	if (flags & TIMER_ABSTIME)
-		rmtp = NULL;
-	current->restart_block.nanosleep.rmtp = rmtp;
 	return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 				 which_clock);
@@ -1070,8 +1067,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	t64 = timespec_to_timespec64(t);
 	if (!timespec64_valid(&t64))
 		return -EINVAL;
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+	current->restart_block.nanosleep.rmtp = rmtp;
 
-	return kc->nsleep(which_clock, flags, &t64, rmtp);
+	return kc->nsleep(which_clock, flags, &t64);
 }
 
 /*
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index b086f5ba2f5b..bfd9e15c6ce0 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -10,7 +10,7 @@ struct k_clock {
 	int	(*clock_adj)(const clockid_t which_clock, struct timex *tx);
 	int	(*timer_create)(struct k_itimer *timer);
 	int	(*nsleep)(const clockid_t which_clock, int flags,
-			  struct timespec64 *, struct timespec __user *);
+			  struct timespec64 *);
 	long	(*nsleep_restart)(struct restart_block *restart_block);
 	int	(*timer_set)(struct k_itimer *timr, int flags,
 			     struct itimerspec64 *new_setting,
-- 
cgit v1.3-14-g43fede


From edbeda46322fbcb15af2d2d0f2daffb0cd349a5a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:31 +0100
Subject: time/posix-timers: Move the compat copyouts to the nanosleep
 implementations

Turn restart_block.nanosleep.{rmtp,compat_rmtp} into a tagged union (kind =
1 -> native, kind = 2 -> compat, kind = 0 -> nothing) and make the places
doing actual copyout handle compat as well as native (that will become a
helper in the next commit).  Result: compat wrappers, messing with
reassignments, etc. are gone.

[ tglx: Folded in a variant of Peter Zijlstra's enum patch ]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-6-viro@ZenIV.linux.org.uk
---
 include/linux/posix-timers.h   |   2 -
 include/linux/restart_block.h  |  15 ++++-
 kernel/compat.c                | 131 -----------------------------------------
 kernel/time/alarmtimer.c       |  16 +++--
 kernel/time/hrtimer.c          |  42 +++++++++++--
 kernel/time/posix-cpu-timers.c |  20 +++++--
 kernel/time/posix-stubs.c      |  55 +++++++++++++----
 kernel/time/posix-timers.c     |  32 +++++++---
 8 files changed, 142 insertions(+), 171 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 667095dbcd37..29f1b7f09ced 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -110,8 +110,6 @@ void posix_cpu_timers_exit_group(struct task_struct *task);
 void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 			   u64 *newval, u64 *oldval);
 
-long clock_nanosleep_restart(struct restart_block *restart_block);
-
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
 void posixtimer_rearm(struct siginfo *info);
diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h
index 0d905d8ec553..19df8422606c 100644
--- a/include/linux/restart_block.h
+++ b/include/linux/restart_block.h
@@ -11,6 +11,14 @@ struct timespec;
 struct compat_timespec;
 struct pollfd;
 
+enum timespec_type {
+	TT_NONE		= 0,
+	TT_NATIVE	= 1,
+#ifdef CONFIG_COMPAT
+	TT_COMPAT	= 2,
+#endif
+};
+
 /*
  * System call restart block.
  */
@@ -29,10 +37,13 @@ struct restart_block {
 		/* For nanosleep */
 		struct {
 			clockid_t clockid;
-			struct timespec __user *rmtp;
+			enum timespec_type type;
+			union {
+				struct timespec __user *rmtp;
 #ifdef CONFIG_COMPAT
-			struct compat_timespec __user *compat_rmtp;
+				struct compat_timespec __user *compat_rmtp;
 #endif
+			};
 			u64 expires;
 		} nanosleep;
 		/* For poll */
diff --git a/kernel/compat.c b/kernel/compat.c
index cc9ba9d29b47..23afa26f574b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -213,82 +213,6 @@ int compat_convert_timespec(struct timespec __user **kts,
 	return 0;
 }
 
-static long compat_nanosleep_restart(struct restart_block *restart)
-{
-	struct compat_timespec __user *rmtp;
-	struct timespec rmt;
-	mm_segment_t oldfs;
-	long ret;
-
-	restart->nanosleep.rmtp = (struct timespec __user *) &rmt;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = hrtimer_nanosleep_restart(restart);
-	set_fs(oldfs);
-
-	if (ret == -ERESTART_RESTARTBLOCK) {
-		rmtp = restart->nanosleep.compat_rmtp;
-
-		if (rmtp && compat_put_timespec(&rmt, rmtp))
-			return -EFAULT;
-	}
-
-	return ret;
-}
-
-COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
-		       struct compat_timespec __user *, rmtp)
-{
-	struct timespec tu, rmt;
-	struct timespec64 tu64;
-	mm_segment_t oldfs;
-	long ret;
-
-	if (compat_get_timespec(&tu, rqtp))
-		return -EFAULT;
-
-	tu64 = timespec_to_timespec64(tu);
-	if (!timespec64_valid(&tu64))
-		return -EINVAL;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	current->restart_block.nanosleep.rmtp =
-				rmtp ? (struct timespec __user *)&rmt : NULL;
-	ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
-	set_fs(oldfs);
-
-	/*
-	 * hrtimer_nanosleep() can only return 0 or
-	 * -ERESTART_RESTARTBLOCK here because:
-	 *
-	 * - we call it with HRTIMER_MODE_REL and therefor exclude the
-	 *   -ERESTARTNOHAND return path.
-	 *
-	 * - we supply the rmtp argument from the task stack (due to
-	 *   the necessary compat conversion. So the update cannot
-	 *   fail, which excludes the -EFAULT return path as well. If
-	 *   it fails nevertheless we have a bigger problem and wont
-	 *   reach this place anymore.
-	 *
-	 * - if the return value is 0, we do not have to update rmtp
-	 *    because there is no remaining time.
-	 *
-	 * We check for -ERESTART_RESTARTBLOCK nevertheless if the
-	 * core implementation decides to return random nonsense.
-	 */
-	if (ret == -ERESTART_RESTARTBLOCK) {
-		struct restart_block *restart = &current->restart_block;
-
-		restart->fn = compat_nanosleep_restart;
-		restart->nanosleep.compat_rmtp = rmtp;
-
-		if (rmtp && compat_put_timespec(&rmt, rmtp))
-			return -EFAULT;
-	}
-	return ret;
-}
-
 static inline long get_compat_itimerval(struct itimerval *o,
 		struct compat_itimerval __user *i)
 {
@@ -821,61 +745,6 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 	return err;
 }
 
-static long compat_clock_nanosleep_restart(struct restart_block *restart)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct timespec tu;
-	struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp;
-
-	restart->nanosleep.rmtp = (struct timespec __user *) &tu;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = clock_nanosleep_restart(restart);
-	set_fs(oldfs);
-
-	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
-	    compat_put_timespec(&tu, rmtp))
-		return -EFAULT;
-
-	if (err == -ERESTART_RESTARTBLOCK) {
-		restart->fn = compat_clock_nanosleep_restart;
-		restart->nanosleep.compat_rmtp = rmtp;
-	}
-	return err;
-}
-
-COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
-		       struct compat_timespec __user *, rqtp,
-		       struct compat_timespec __user *, rmtp)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct timespec in, out;
-	struct restart_block *restart;
-
-	if (compat_get_timespec(&in, rqtp))
-		return -EFAULT;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_clock_nanosleep(which_clock, flags,
-				  (struct timespec __user *) &in,
-				  (struct timespec __user *) &out);
-	set_fs(oldfs);
-
-	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
-	    compat_put_timespec(&out, rmtp))
-		return -EFAULT;
-
-	if (err == -ERESTART_RESTARTBLOCK) {
-		restart = &current->restart_block;
-		restart->fn = compat_clock_nanosleep_restart;
-		restart->nanosleep.compat_rmtp = rmtp;
-	}
-	return err;
-}
-
 /*
  * We currently only need the following fields from the sigevent
  * structure: sigev_value, sigev_signo, sig_notify and (sometimes
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index d859a3601ddd..57bcf94ee132 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -27,6 +27,7 @@
 #include <linux/posix-timers.h>
 #include <linux/workqueue.h>
 #include <linux/freezer.h>
+#include <linux/compat.h>
 
 #include "posix-timers.h"
 
@@ -691,7 +692,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm,
 static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 				enum alarmtimer_type type)
 {
-	struct timespec __user *rmtp;
+	struct restart_block *restart;
 	alarm->data = (void *)current;
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -709,8 +710,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 
 	if (freezing(current))
 		alarmtimer_freezerset(absexp, type);
-	rmtp = current->restart_block.nanosleep.rmtp;
-	if (rmtp) {
+	restart = &current->restart_block;
+	if (restart->nanosleep.type != TT_NONE) {
 		struct timespec rmt;
 		ktime_t rem;
 
@@ -720,7 +721,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 			return 0;
 		rmt = ktime_to_timespec(rem);
 
-		if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+#ifdef CONFIG_COMPAT
+		if (restart->nanosleep.type == TT_COMPAT) {
+			if (compat_put_timespec(&rmt,
+						restart->nanosleep.compat_rmtp))
+				return -EFAULT;
+		} else
+#endif
+		if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt)))
 			return -EFAULT;
 	}
 	return -ERESTART_RESTARTBLOCK;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index baa7b846b6e3..5370da8fc0a4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -51,6 +51,7 @@
 #include <linux/sched/debug.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 
@@ -1441,7 +1442,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
-	struct timespec __user *rmtp;
+	struct restart_block *restart;
+
 	hrtimer_init_sleeper(t, current);
 
 	do {
@@ -1461,15 +1463,23 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	if (!t->task)
 		return 0;
 
-	rmtp = current->restart_block.nanosleep.rmtp;
-	if (rmtp) {
-		struct timespec rmt;
+	restart = &current->restart_block;
+	if (restart->nanosleep.type != TT_NONE) {
 		ktime_t rem = hrtimer_expires_remaining(&t->timer);
+		struct timespec rmt;
+
 		if (rem <= 0)
 			return 0;
 		rmt = ktime_to_timespec(rem);
 
-		if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+#ifdef CONFIG_COMPAT
+		if (restart->nanosleep.type == TT_COMPAT) {
+			if (compat_put_timespec(&rmt,
+						restart->nanosleep.compat_rmtp))
+				return -EFAULT;
+		} else
+#endif
+		if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt)))
 			return -EFAULT;
 	}
 	return -ERESTART_RESTARTBLOCK;
@@ -1535,10 +1545,32 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 	if (!timespec64_valid(&tu64))
 		return -EINVAL;
 
+	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
 	return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
+{
+	struct timespec64 tu64;
+	struct timespec tu;
+
+	if (compat_get_timespec(&tu, rqtp))
+		return -EFAULT;
+
+	tu64 = timespec_to_timespec64(tu);
+	if (!timespec64_valid(&tu64))
+		return -EINVAL;
+
+	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
+	current->restart_block.nanosleep.compat_rmtp = rmtp;
+	return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+}
+#endif
+
 /*
  * Functions related to boot-time initialization:
  */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ec6258c9cde5..1563ca22cf1f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -12,6 +12,7 @@
 #include <trace/events/timer.h>
 #include <linux/tick.h>
 #include <linux/workqueue.h>
+#include <linux/compat.h>
 
 #include "posix-timers.h"
 
@@ -1243,10 +1244,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	timer.it_process = current;
 	if (!error) {
 		static struct itimerspec64 zero_it;
-		struct restart_block *restart = &current->restart_block;
-		struct timespec __user *rmtp;
+		struct restart_block *restart;
 
-		memset(&it, 0, sizeof it);
+		memset(&it, 0, sizeof(it));
 		it.it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
@@ -1311,12 +1311,20 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		/*
 		 * Report back to the user the time still remaining.
 		 */
-		rmtp = restart->nanosleep.rmtp;
-		if (rmtp) {
+		restart = &current->restart_block;
+		if (restart->nanosleep.type != TT_NONE) {
 			struct timespec ts;
 
 			ts = timespec64_to_timespec(it.it_value);
-			if (copy_to_user(rmtp, &ts, sizeof(*rmtp)))
+#ifdef CONFIG_COMPAT
+			if (restart->nanosleep.type == TT_COMPAT) {
+				if (compat_put_timespec(&ts,
+						restart->nanosleep.compat_rmtp))
+					return -EFAULT;
+			} else
+#endif
+			if (copy_to_user(restart->nanosleep.rmtp, &ts,
+					sizeof(ts)))
 				return -EFAULT;
 		}
 		restart->nanosleep.expires = timespec64_to_ns(rqtp);
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 156a5e6f3bd2..749b76f2d757 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -17,6 +17,7 @@
 #include <linux/ktime.h>
 #include <linux/timekeeping.h>
 #include <linux/posix-timers.h>
+#include <linux/compat.h>
 
 asmlinkage long sys_ni_posix_timers(void)
 {
@@ -110,25 +111,53 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	case CLOCK_REALTIME:
 	case CLOCK_MONOTONIC:
 	case CLOCK_BOOTTIME:
-		if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
-			return -EFAULT;
-		t64 = timespec_to_timespec64(t);
-		if (!timespec64_valid(&t64))
-			return -EINVAL;
-		if (flags & TIMER_ABSTIME)
-			rmtp = NULL;
-		current->restart_block.nanosleep.rmtp = rmtp;
-		return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
-					 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
-					 which_clock);
+		break;
 	default:
 		return -EINVAL;
 	}
+
+	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+		return -EFAULT;
+	t64 = timespec_to_timespec64(t);
+	if (!timespec64_valid(&t64))
+		return -EINVAL;
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
+	current->restart_block.nanosleep.rmtp = rmtp;
+	return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+				 which_clock);
 }
 
 #ifdef CONFIG_COMPAT
-long clock_nanosleep_restart(struct restart_block *restart_block)
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+		       struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
-	return hrtimer_nanosleep_restart(restart_block);
+	struct timespec64 t64;
+	struct timespec t;
+
+	switch (which_clock) {
+	case CLOCK_REALTIME:
+	case CLOCK_MONOTONIC:
+	case CLOCK_BOOTTIME:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (compat_get_timespec(&t, rqtp))
+		return -EFAULT;
+	t64 = timespec_to_timespec64(t);
+	if (!timespec64_valid(&t64))
+		return -EINVAL;
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
+	current->restart_block.nanosleep.compat_rmtp = rmtp;
+	return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+				 which_clock);
 }
 #endif
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index a3e5c01b430e..bec86b6b9814 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -49,6 +49,7 @@
 #include <linux/workqueue.h>
 #include <linux/export.h>
 #include <linux/hashtable.h>
+#include <linux/compat.h>
 
 #include "timekeeping.h"
 #include "posix-timers.h"
@@ -1069,25 +1070,40 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		return -EINVAL;
 	if (flags & TIMER_ABSTIME)
 		rmtp = NULL;
+	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
 
 	return kc->nsleep(which_clock, flags, &t64);
 }
 
-/*
- * This will restart clock_nanosleep. This is required only by
- * compat_clock_nanosleep_restart for now.
- */
-long clock_nanosleep_restart(struct restart_block *restart_block)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
+		       struct compat_timespec __user *, rqtp,
+		       struct compat_timespec __user *, rmtp)
 {
-	clockid_t which_clock = restart_block->nanosleep.clockid;
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 t64;
+	struct timespec t;
 
-	if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
+	if (!kc)
 		return -EINVAL;
+	if (!kc->nsleep)
+		return -ENANOSLEEP_NOTSUP;
+
+	if (compat_get_timespec(&t, rqtp))
+		return -EFAULT;
 
-	return kc->nsleep_restart(restart_block);
+	t64 = timespec_to_timespec64(t);
+	if (!timespec64_valid(&t64))
+		return -EINVAL;
+	if (flags & TIMER_ABSTIME)
+		rmtp = NULL;
+	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
+	current->restart_block.nanosleep.compat_rmtp = rmtp;
+
+	return kc->nsleep(which_clock, flags, &t64);
 }
+#endif
 
 static const struct k_clock clock_realtime = {
 	.clock_getres		= posix_get_hrtimer_res,
-- 
cgit v1.3-14-g43fede


From ce41aaf47af3d28c4c958e07675a3e0a51f09bd3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:32 +0100
Subject: hrtimers/posix-timers: Merge nanosleep timespec copyout logics into a
 new helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-7-viro@ZenIV.linux.org.uk
---
 include/linux/hrtimer.h        |  2 ++
 kernel/time/alarmtimer.c       | 10 +---------
 kernel/time/hrtimer.c          | 29 ++++++++++++++++++++---------
 kernel/time/posix-cpu-timers.c | 13 ++-----------
 4 files changed, 25 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index b80c34f6fd4b..38b968f3df4e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -452,6 +452,8 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 }
 
 /* Precise sleep: */
+
+extern int nanosleep_copyout(struct restart_block *, struct timespec *);
 extern long hrtimer_nanosleep(struct timespec64 *rqtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 57bcf94ee132..7bed4e44f9bd 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -721,15 +721,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 			return 0;
 		rmt = ktime_to_timespec(rem);
 
-#ifdef CONFIG_COMPAT
-		if (restart->nanosleep.type == TT_COMPAT) {
-			if (compat_put_timespec(&rmt,
-						restart->nanosleep.compat_rmtp))
-				return -EFAULT;
-		} else
-#endif
-		if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt)))
-			return -EFAULT;
+		return nanosleep_copyout(restart, &rmt);
 	}
 	return -ERESTART_RESTARTBLOCK;
 }
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5370da8fc0a4..db2f5f7b4ba5 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,6 +1440,25 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 }
 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
+int nanosleep_copyout(struct restart_block *restart, struct timespec *ts)
+{
+	switch(restart->nanosleep.type) {
+#ifdef CONFIG_COMPAT
+	case TT_COMPAT:
+		if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp))
+			return -EFAULT;
+		break;
+#endif
+	case TT_NATIVE:
+		if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec)))
+			return -EFAULT;
+		break;
+	default:
+		BUG();
+	}
+	return -ERESTART_RESTARTBLOCK;
+}
+
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
 	struct restart_block *restart;
@@ -1472,15 +1491,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 			return 0;
 		rmt = ktime_to_timespec(rem);
 
-#ifdef CONFIG_COMPAT
-		if (restart->nanosleep.type == TT_COMPAT) {
-			if (compat_put_timespec(&rmt,
-						restart->nanosleep.compat_rmtp))
-				return -EFAULT;
-		} else
-#endif
-		if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt)))
-			return -EFAULT;
+		return nanosleep_copyout(restart, &rmt);
 	}
 	return -ERESTART_RESTARTBLOCK;
 }
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1563ca22cf1f..993a924d1399 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1312,22 +1312,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * Report back to the user the time still remaining.
 		 */
 		restart = &current->restart_block;
+		restart->nanosleep.expires = timespec64_to_ns(rqtp);
 		if (restart->nanosleep.type != TT_NONE) {
 			struct timespec ts;
 
 			ts = timespec64_to_timespec(it.it_value);
-#ifdef CONFIG_COMPAT
-			if (restart->nanosleep.type == TT_COMPAT) {
-				if (compat_put_timespec(&ts,
-						restart->nanosleep.compat_rmtp))
-					return -EFAULT;
-			} else
-#endif
-			if (copy_to_user(restart->nanosleep.rmtp, &ts,
-					sizeof(ts)))
-				return -EFAULT;
+			error = nanosleep_copyout(restart, &ts);
 		}
-		restart->nanosleep.expires = timespec64_to_ns(rqtp);
 	}
 
 	return error;
-- 
cgit v1.3-14-g43fede


From fb923c4a3c2ee735755d4a93522150fc35d0ecbd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:33 +0100
Subject: posix-timers: Kill ->nsleep_restart()

No more users.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-8-viro@ZenIV.linux.org.uk
---
 include/linux/hrtimer.h        | 1 -
 kernel/time/hrtimer.c          | 2 +-
 kernel/time/posix-cpu-timers.c | 6 ------
 kernel/time/posix-timers.c     | 4 ----
 kernel/time/posix-timers.h     | 1 -
 5 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 38b968f3df4e..d83b7ed1cb0e 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -457,7 +457,6 @@ extern int nanosleep_copyout(struct restart_block *, struct timespec *);
 extern long hrtimer_nanosleep(struct timespec64 *rqtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
-extern long hrtimer_nanosleep_restart(struct restart_block *restart_block);
 
 extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 				 struct task_struct *tsk);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index db2f5f7b4ba5..45f83cc7c0c7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1496,7 +1496,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	return -ERESTART_RESTARTBLOCK;
 }
 
-long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
+static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	int ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 993a924d1399..515148d4eeb1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1386,10 +1386,6 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
 }
-static long process_cpu_nsleep_restart(struct restart_block *restart_block)
-{
-	return -EINVAL;
-}
 static int thread_cpu_clock_getres(const clockid_t which_clock,
 				   struct timespec64 *tp)
 {
@@ -1412,7 +1408,6 @@ const struct k_clock clock_posix_cpu = {
 	.clock_get	= posix_cpu_clock_get,
 	.timer_create	= posix_cpu_timer_create,
 	.nsleep		= posix_cpu_nsleep,
-	.nsleep_restart	= posix_cpu_nsleep_restart,
 	.timer_set	= posix_cpu_timer_set,
 	.timer_del	= posix_cpu_timer_del,
 	.timer_get	= posix_cpu_timer_get,
@@ -1424,7 +1419,6 @@ const struct k_clock clock_process = {
 	.clock_get	= process_cpu_clock_get,
 	.timer_create	= process_cpu_timer_create,
 	.nsleep		= process_cpu_nsleep,
-	.nsleep_restart	= process_cpu_nsleep_restart,
 };
 
 const struct k_clock clock_thread = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index bec86b6b9814..ea4a463436bf 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1111,7 +1111,6 @@ static const struct k_clock clock_realtime = {
 	.clock_set		= posix_clock_realtime_set,
 	.clock_adj		= posix_clock_realtime_adj,
 	.nsleep			= common_nsleep,
-	.nsleep_restart		= hrtimer_nanosleep_restart,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
@@ -1127,7 +1126,6 @@ static const struct k_clock clock_monotonic = {
 	.clock_getres		= posix_get_hrtimer_res,
 	.clock_get		= posix_ktime_get_ts,
 	.nsleep			= common_nsleep,
-	.nsleep_restart		= hrtimer_nanosleep_restart,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
@@ -1158,7 +1156,6 @@ static const struct k_clock clock_tai = {
 	.clock_getres		= posix_get_hrtimer_res,
 	.clock_get		= posix_get_tai,
 	.nsleep			= common_nsleep,
-	.nsleep_restart		= hrtimer_nanosleep_restart,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
@@ -1174,7 +1171,6 @@ static const struct k_clock clock_boottime = {
 	.clock_getres		= posix_get_hrtimer_res,
 	.clock_get		= posix_get_boottime,
 	.nsleep			= common_nsleep,
-	.nsleep_restart		= hrtimer_nanosleep_restart,
 	.timer_create		= common_timer_create,
 	.timer_set		= common_timer_set,
 	.timer_get		= common_timer_get,
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index bfd9e15c6ce0..5e69bb85629f 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -11,7 +11,6 @@ struct k_clock {
 	int	(*timer_create)(struct k_itimer *timer);
 	int	(*nsleep)(const clockid_t which_clock, int flags,
 			  struct timespec64 *);
-	long	(*nsleep_restart)(struct restart_block *restart_block);
 	int	(*timer_set)(struct k_itimer *timr, int flags,
 			     struct itimerspec64 *new_setting,
 			     struct itimerspec64 *old_setting);
-- 
cgit v1.3-14-g43fede


From 3a4d44b6162555070194e486ff6b3799a8d323a2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:34 +0100
Subject: ntp: Move adjtimex related compat syscalls to native counterparts

Get rid of set_fs() mess and sanitize compat_{get,put}_timex(),
while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-9-viro@ZenIV.linux.org.uk
---
 include/linux/compat.h     |   4 ++
 kernel/compat.c            | 141 +++++++++++++++++----------------------------
 kernel/time/posix-stubs.c  |   2 +
 kernel/time/posix-timers.c |  27 +++++++++
 kernel/time/time.c         |  24 +++++++-
 5 files changed, 108 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 1c5f3152cbb5..ecb8dd261d36 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -128,6 +128,10 @@ struct compat_timex {
 	compat_int_t:32; compat_int_t:32; compat_int_t:32;
 };
 
+struct timex;
+int compat_get_timex(struct timex *, const struct compat_timex __user *);
+int compat_put_timex(struct compat_timex __user *, const struct timex *);
+
 #define _COMPAT_NSIG_WORDS	(_COMPAT_NSIG / _COMPAT_NSIG_BPW)
 
 typedef struct {
diff --git a/kernel/compat.c b/kernel/compat.c
index 23afa26f574b..97087b333543 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -30,60 +30,64 @@
 
 #include <linux/uaccess.h>
 
-static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp)
 {
-	memset(txc, 0, sizeof(struct timex));
-
-	if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
-			__get_user(txc->modes, &utp->modes) ||
-			__get_user(txc->offset, &utp->offset) ||
-			__get_user(txc->freq, &utp->freq) ||
-			__get_user(txc->maxerror, &utp->maxerror) ||
-			__get_user(txc->esterror, &utp->esterror) ||
-			__get_user(txc->status, &utp->status) ||
-			__get_user(txc->constant, &utp->constant) ||
-			__get_user(txc->precision, &utp->precision) ||
-			__get_user(txc->tolerance, &utp->tolerance) ||
-			__get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
-			__get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
-			__get_user(txc->tick, &utp->tick) ||
-			__get_user(txc->ppsfreq, &utp->ppsfreq) ||
-			__get_user(txc->jitter, &utp->jitter) ||
-			__get_user(txc->shift, &utp->shift) ||
-			__get_user(txc->stabil, &utp->stabil) ||
-			__get_user(txc->jitcnt, &utp->jitcnt) ||
-			__get_user(txc->calcnt, &utp->calcnt) ||
-			__get_user(txc->errcnt, &utp->errcnt) ||
-			__get_user(txc->stbcnt, &utp->stbcnt))
+	struct compat_timex tx32;
+
+	if (copy_from_user(&tx32, utp, sizeof(struct compat_timex)))
 		return -EFAULT;
 
+	txc->modes = tx32.modes;
+	txc->offset = tx32.offset;
+	txc->freq = tx32.freq;
+	txc->maxerror = tx32.maxerror;
+	txc->esterror = tx32.esterror;
+	txc->status = tx32.status;
+	txc->constant = tx32.constant;
+	txc->precision = tx32.precision;
+	txc->tolerance = tx32.tolerance;
+	txc->time.tv_sec = tx32.time.tv_sec;
+	txc->time.tv_usec = tx32.time.tv_usec;
+	txc->tick = tx32.tick;
+	txc->ppsfreq = tx32.ppsfreq;
+	txc->jitter = tx32.jitter;
+	txc->shift = tx32.shift;
+	txc->stabil = tx32.stabil;
+	txc->jitcnt = tx32.jitcnt;
+	txc->calcnt = tx32.calcnt;
+	txc->errcnt = tx32.errcnt;
+	txc->stbcnt = tx32.stbcnt;
+
 	return 0;
 }
 
-static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
-{
-	if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
-			__put_user(txc->modes, &utp->modes) ||
-			__put_user(txc->offset, &utp->offset) ||
-			__put_user(txc->freq, &utp->freq) ||
-			__put_user(txc->maxerror, &utp->maxerror) ||
-			__put_user(txc->esterror, &utp->esterror) ||
-			__put_user(txc->status, &utp->status) ||
-			__put_user(txc->constant, &utp->constant) ||
-			__put_user(txc->precision, &utp->precision) ||
-			__put_user(txc->tolerance, &utp->tolerance) ||
-			__put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
-			__put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
-			__put_user(txc->tick, &utp->tick) ||
-			__put_user(txc->ppsfreq, &utp->ppsfreq) ||
-			__put_user(txc->jitter, &utp->jitter) ||
-			__put_user(txc->shift, &utp->shift) ||
-			__put_user(txc->stabil, &utp->stabil) ||
-			__put_user(txc->jitcnt, &utp->jitcnt) ||
-			__put_user(txc->calcnt, &utp->calcnt) ||
-			__put_user(txc->errcnt, &utp->errcnt) ||
-			__put_user(txc->stbcnt, &utp->stbcnt) ||
-			__put_user(txc->tai, &utp->tai))
+int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
+{
+	struct compat_timex tx32;
+
+	memset(&tx32, 0, sizeof(struct compat_timex));
+	tx32.modes = txc->modes;
+	tx32.offset = txc->offset;
+	tx32.freq = txc->freq;
+	tx32.maxerror = txc->maxerror;
+	tx32.esterror = txc->esterror;
+	tx32.status = txc->status;
+	tx32.constant = txc->constant;
+	tx32.precision = txc->precision;
+	tx32.tolerance = txc->tolerance;
+	tx32.time.tv_sec = txc->time.tv_sec;
+	tx32.time.tv_usec = txc->time.tv_usec;
+	tx32.tick = txc->tick;
+	tx32.ppsfreq = txc->ppsfreq;
+	tx32.jitter = txc->jitter;
+	tx32.shift = txc->shift;
+	tx32.stabil = txc->stabil;
+	tx32.jitcnt = txc->jitcnt;
+	tx32.calcnt = txc->calcnt;
+	tx32.errcnt = txc->errcnt;
+	tx32.stbcnt = txc->stbcnt;
+	tx32.tai = txc->tai;
+	if (copy_to_user(utp, &tx32, sizeof(struct compat_timex)))
 		return -EFAULT;
 	return 0;
 }
@@ -705,29 +709,6 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
 	return err;
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
-		       struct compat_timex __user *, utp)
-{
-	struct timex txc;
-	mm_segment_t oldfs;
-	int err, ret;
-
-	err = compat_get_timex(&txc, utp);
-	if (err)
-		return err;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
-	set_fs(oldfs);
-
-	err = compat_put_timex(utp, &txc);
-	if (err)
-		return err;
-
-	return ret;
-}
-
 COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
@@ -944,24 +925,6 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
 
 #endif /* __ARCH_WANT_COMPAT_SYS_TIME */
 
-COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
-{
-	struct timex txc;
-	int err, ret;
-
-	err = compat_get_timex(&txc, utp);
-	if (err)
-		return err;
-
-	ret = do_adjtimex(&txc);
-
-	err = compat_put_timex(utp, &txc);
-	if (err)
-		return err;
-
-	return ret;
-}
-
 #ifdef CONFIG_NUMA
 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
 		       compat_uptr_t __user *, pages32,
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 749b76f2d757..954d1d8ff9e6 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -28,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void)
 }
 
 #define SYS_NI(name)  SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers)
+#define COMPAT_SYS_NI(name)  SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers)
 
 SYS_NI(timer_create);
 SYS_NI(timer_gettime);
@@ -40,6 +41,7 @@ SYS_NI(setitimer);
 #ifdef __ARCH_WANT_SYS_ALARM
 SYS_NI(alarm);
 #endif
+COMPAT_SYS_NI(clock_adjtime);
 
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index ea4a463436bf..b1b6d52d6425 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1018,6 +1018,33 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
 	return err;
 }
 
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
+		       struct compat_timex __user *, utp)
+{
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timex ktx;
+	int err;
+
+	if (!kc)
+		return -EINVAL;
+	if (!kc->clock_adj)
+		return -EOPNOTSUPP;
+
+	err = compat_get_timex(&ktx, utp);
+	if (err)
+		return err;
+
+	err = kc->clock_adj(which_clock, &ktx);
+
+	if (err >= 0)
+		err = compat_put_timex(utp, &ktx);
+
+	return err;
+}
+#endif
+
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 49c73c6ed648..400662f16c5a 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -39,6 +39,7 @@
 #include <linux/ptrace.h>
 
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 #include <asm/unistd.h>
 
 #include <generated/timeconst.h>
@@ -224,12 +225,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
 	 * structure. But bear in mind that the structures
 	 * may change
 	 */
-	if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
+	if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
 		return -EFAULT;
 	ret = do_adjtimex(&txc);
 	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
 }
 
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp)
+{
+	struct timex txc;
+	int err, ret;
+
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
+
+	ret = do_adjtimex(&txc);
+
+	err = compat_put_timex(utp, &txc);
+	if (err)
+		return err;
+
+	return ret;
+}
+#endif
+
 /*
  * Convert jiffies to milliseconds and back.
  *
-- 
cgit v1.3-14-g43fede


From 1acbe7708b0313b33287bb4ffcbf26462ea3c588 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:35 +0100
Subject: posix-timers: Take compat timer_settime(2) to native one

Move the compat timer_settime(2) syscall next to the native implementation
and get rid of set_fs() in there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-10-viro@ZenIV.linux.org.uk
---
 kernel/compat.c            | 23 ------------
 kernel/time/posix-stubs.c  |  1 +
 kernel/time/posix-timers.c | 87 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 64 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 97087b333543..df39e2e00c47 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -635,29 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
 	return sys_timer_create(which_clock, event, created_timer_id);
 }
 
-COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
-		       struct compat_itimerspec __user *, new,
-		       struct compat_itimerspec __user *, old)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct itimerspec newts, oldts;
-
-	if (!new)
-		return -EINVAL;
-	if (get_compat_itimerspec(&newts, new))
-		return -EFAULT;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_timer_settime(timer_id, flags,
-				(struct itimerspec __user *) &newts,
-				(struct itimerspec __user *) &oldts);
-	set_fs(oldfs);
-	if (!err && old && put_compat_itimerspec(old, &oldts))
-		return -EFAULT;
-	return err;
-}
-
 COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		       struct compat_itimerspec __user *, setting)
 {
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 954d1d8ff9e6..ad263df132d6 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -42,6 +42,7 @@ SYS_NI(setitimer);
 SYS_NI(alarm);
 #endif
 COMPAT_SYS_NI(clock_adjtime);
+COMPAT_SYS_NI(timer_settime);
 
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index b1b6d52d6425..a73feac191f9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -819,31 +819,21 @@ int common_timer_set(struct k_itimer *timr, int flags,
 	return 0;
 }
 
-/* Set a POSIX.1b interval timer */
-SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
-		const struct itimerspec __user *, new_setting,
-		struct itimerspec __user *, old_setting)
+static int do_timer_settime(timer_t timer_id, int flags,
+			    struct itimerspec64 *new_spec64,
+			    struct itimerspec64 *old_spec64)
 {
-	struct itimerspec64 new_spec64, old_spec64;
-	struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
-	struct itimerspec new_spec, old_spec;
+	const struct k_clock *kc;
 	struct k_itimer *timr;
 	unsigned long flag;
-	const struct k_clock *kc;
 	int error = 0;
 
-	if (!new_setting)
+	if (!timespec64_valid(&new_spec64->it_interval) ||
+	    !timespec64_valid(&new_spec64->it_value))
 		return -EINVAL;
 
-	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
-		return -EFAULT;
-	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
-
-	if (!timespec64_valid(&new_spec64.it_interval) ||
-	    !timespec64_valid(&new_spec64.it_value))
-		return -EINVAL;
-	if (rtn)
-		memset(rtn, 0, sizeof(*rtn));
+	if (old_spec64)
+		memset(old_spec64, 0, sizeof(*old_spec64));
 retry:
 	timr = lock_timer(timer_id, &flag);
 	if (!timr)
@@ -853,22 +843,71 @@ retry:
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
 	else
-		error = kc->timer_set(timr, flags, &new_spec64, rtn);
+		error = kc->timer_set(timr, flags, new_spec64, old_spec64);
 
 	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
-		rtn = NULL;	// We already got the old time...
+		old_spec64 = NULL;	// We already got the old time...
 		goto retry;
 	}
 
-	old_spec = itimerspec64_to_itimerspec(&old_spec64);
-	if (old_setting && !error &&
-	    copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
-		error = -EFAULT;
+	return error;
+}
+
+/* Set a POSIX.1b interval timer */
+SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+		const struct itimerspec __user *, new_setting,
+		struct itimerspec __user *, old_setting)
+{
+	struct itimerspec64 new_spec64, old_spec64;
+	struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
+	struct itimerspec new_spec;
+	int error = 0;
+
+	if (!new_setting)
+		return -EINVAL;
 
+	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+		return -EFAULT;
+	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
+
+	error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+	if (!error && old_setting) {
+		struct itimerspec old_spec;
+		old_spec = itimerspec64_to_itimerspec(&old_spec64);
+		if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+			error = -EFAULT;
+	}
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
+		       struct compat_itimerspec __user *, new,
+		       struct compat_itimerspec __user *, old)
+{
+	struct itimerspec64 new_spec64, old_spec64;
+	struct itimerspec64 *rtn = old ? &old_spec64 : NULL;
+	struct itimerspec new_spec;
+	int error = 0;
+
+	if (!new)
+		return -EINVAL;
+	if (get_compat_itimerspec(&new_spec, new))
+		return -EFAULT;
+
+	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
+	error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+	if (!error && old) {
+		struct itimerspec old_spec;
+		old_spec = itimerspec64_to_itimerspec(&old_spec64);
+		if (put_compat_itimerspec(old, &old_spec))
+			error = -EFAULT;
+	}
+	return error;
+}
+#endif
+
 int common_timer_del(struct k_itimer *timer)
 {
 	const struct k_clock *kc = timer->kclock;
-- 
cgit v1.3-14-g43fede


From b0dc12426ec404de99d7e75a12a22d9201d90914 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:36 +0100
Subject: posix-timers: Take compat timer_gettime(2) to native one

Move the compat timer_gettime(2) syscall next to the native implementation
and get rid of set_fs() in there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-11-viro@ZenIV.linux.org.uk
---
 kernel/compat.c            | 17 -----------------
 kernel/time/posix-stubs.c  |  1 +
 kernel/time/posix-timers.c | 43 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index df39e2e00c47..1fb8cf7e691e 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -635,23 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
 	return sys_timer_create(which_clock, event, created_timer_id);
 }
 
-COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
-		       struct compat_itimerspec __user *, setting)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct itimerspec ts;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_timer_gettime(timer_id,
-				(struct itimerspec __user *) &ts);
-	set_fs(oldfs);
-	if (!err && put_compat_itimerspec(setting, &ts))
-		return -EFAULT;
-	return err;
-}
-
 COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index ad263df132d6..f4a1962d1729 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -43,6 +43,7 @@ SYS_NI(alarm);
 #endif
 COMPAT_SYS_NI(clock_adjtime);
 COMPAT_SYS_NI(timer_settime);
+COMPAT_SYS_NI(timer_gettime);
 
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index a73feac191f9..e82bb1fd614e 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -690,11 +690,8 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
-SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
-		struct itimerspec __user *, setting)
+static int do_timer_gettime(timer_t timer_id,  struct itimerspec64 *setting)
 {
-	struct itimerspec64 cur_setting64;
-	struct itimerspec cur_setting;
 	struct k_itimer *timr;
 	const struct k_clock *kc;
 	unsigned long flags;
@@ -704,21 +701,49 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	if (!timr)
 		return -EINVAL;
 
-	memset(&cur_setting64, 0, sizeof(cur_setting64));
+	memset(setting, 0, sizeof(*setting));
 	kc = timr->kclock;
 	if (WARN_ON_ONCE(!kc || !kc->timer_get))
 		ret = -EINVAL;
 	else
-		kc->timer_get(timr, &cur_setting64);
+		kc->timer_get(timr, setting);
 
 	unlock_timer(timr, flags);
+	return ret;
+}
 
-	cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
-	if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
-		return -EFAULT;
+/* Get the time remaining on a POSIX.1b interval timer. */
+SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+		struct itimerspec __user *, setting)
+{
+	struct itimerspec64 cur_setting64;
 
+	int ret = do_timer_gettime(timer_id, &cur_setting64);
+	if (!ret) {
+		struct itimerspec cur_setting;
+		cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
+		if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+			ret = -EFAULT;
+	}
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
+		       struct compat_itimerspec __user *, setting)
+{
+	struct itimerspec64 cur_setting64;
+
+	int ret = do_timer_gettime(timer_id, &cur_setting64);
+	if (!ret) {
+		struct itimerspec cur_setting;
+		cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
+		if (put_compat_itimerspec(setting, &cur_setting))
+			ret = -EFAULT;
+	}
 	return ret;
 }
+#endif
 
 /*
  * Get the number of overruns of a POSIX.1b interval timer.  This is to
-- 
cgit v1.3-14-g43fede


From 54ad9c46c262ce4a603dc7887e37956896a0211d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:37 +0100
Subject: itimers: Move compat itimer syscalls to native ones

Get rid of set_fs(), sanitize compat copyin/copyout.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-12-viro@ZenIV.linux.org.uk
---
 include/linux/compat.h    |  4 +++
 kernel/compat.c           | 69 +++++++++++------------------------------------
 kernel/time/itimer.c      | 38 ++++++++++++++++++++++++++
 kernel/time/posix-stubs.c |  2 ++
 4 files changed, 60 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index ecb8dd261d36..425563c7647b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -94,6 +94,10 @@ struct compat_itimerval {
 	struct compat_timeval	it_value;
 };
 
+struct itimerval;
+int get_compat_itimerval(struct itimerval *, const struct compat_itimerval __user *);
+int put_compat_itimerval(struct compat_itimerval __user *, const struct itimerval *);
+
 struct compat_tms {
 	compat_clock_t		tms_utime;
 	compat_clock_t		tms_stime;
diff --git a/kernel/compat.c b/kernel/compat.c
index 1fb8cf7e691e..c349417d2c40 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -217,65 +217,28 @@ int compat_convert_timespec(struct timespec __user **kts,
 	return 0;
 }
 
-static inline long get_compat_itimerval(struct itimerval *o,
-		struct compat_itimerval __user *i)
+int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i)
 {
-	return (!access_ok(VERIFY_READ, i, sizeof(*i)) ||
-		(__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) |
-		 __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) |
-		 __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) |
-		 __get_user(o->it_value.tv_usec, &i->it_value.tv_usec)));
-}
-
-static inline long put_compat_itimerval(struct compat_itimerval __user *o,
-		struct itimerval *i)
-{
-	return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) ||
-		(__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) |
-		 __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) |
-		 __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) |
-		 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
-}
-
-asmlinkage long sys_ni_posix_timers(void);
-
-COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
-		struct compat_itimerval __user *, it)
-{
-	struct itimerval kit;
-	int error;
-
-	if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
-		return sys_ni_posix_timers();
+	struct compat_itimerval v32;
 
-	error = do_getitimer(which, &kit);
-	if (!error && put_compat_itimerval(it, &kit))
-		error = -EFAULT;
-	return error;
+	if (copy_from_user(&v32, i, sizeof(struct compat_itimerval)))
+		return -EFAULT;
+	o->it_interval.tv_sec = v32.it_interval.tv_sec;
+	o->it_interval.tv_usec = v32.it_interval.tv_usec;
+	o->it_value.tv_sec = v32.it_value.tv_sec;
+	o->it_value.tv_usec = v32.it_value.tv_usec;
+	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
-		struct compat_itimerval __user *, in,
-		struct compat_itimerval __user *, out)
+int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i)
 {
-	struct itimerval kin, kout;
-	int error;
+	struct compat_itimerval v32;
 
-	if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
-		return sys_ni_posix_timers();
-
-	if (in) {
-		if (get_compat_itimerval(&kin, in))
-			return -EFAULT;
-	} else
-		memset(&kin, 0, sizeof(kin));
-
-	error = do_setitimer(which, &kin, out ? &kout : NULL);
-	if (error || !out)
-		return error;
-	if (put_compat_itimerval(out, &kout))
-		return -EFAULT;
-	return 0;
+	v32.it_interval.tv_sec = i->it_interval.tv_sec;
+	v32.it_interval.tv_usec = i->it_interval.tv_usec;
+	v32.it_value.tv_sec = i->it_value.tv_sec;
+	v32.it_value.tv_usec = i->it_value.tv_usec;
+	return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
 }
 
 static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 087d6a1279b8..9dd7ff5e445a 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -15,6 +15,7 @@
 #include <linux/posix-timers.h>
 #include <linux/hrtimer.h>
 #include <trace/events/timer.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 
@@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
 	return error;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
+		       struct compat_itimerval __user *, it)
+{
+	struct itimerval kit;
+	int error = do_getitimer(which, &kit);
+
+	if (!error && put_compat_itimerval(it, &kit))
+		error = -EFAULT;
+	return error;
+}
+#endif
+
 
 /*
  * The timer is automagically restarted, when interval != 0
@@ -294,3 +308,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
 		return -EFAULT;
 	return 0;
 }
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
+		       struct compat_itimerval __user *, in,
+		       struct compat_itimerval __user *, out)
+{
+	struct itimerval kin, kout;
+	int error;
+
+	if (in) {
+		if (get_compat_itimerval(&kin, in))
+			return -EFAULT;
+	} else {
+		memset(&kin, 0, sizeof(kin));
+	}
+
+	error = do_setitimer(which, &kin, out ? &kout : NULL);
+	if (error || !out)
+		return error;
+	if (put_compat_itimerval(out, &kout))
+		return -EFAULT;
+	return 0;
+}
+#endif
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index f4a1962d1729..7f88517461e8 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -44,6 +44,8 @@ SYS_NI(alarm);
 COMPAT_SYS_NI(clock_adjtime);
 COMPAT_SYS_NI(timer_settime);
 COMPAT_SYS_NI(timer_gettime);
+COMPAT_SYS_NI(getitimer);
+COMPAT_SYS_NI(setitimer);
 
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
-- 
cgit v1.3-14-g43fede


From d822cdcce43f9d4dcddbf9c68f9537d542ccc3c3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:38 +0100
Subject: posix-timers: Move compat versions of clock_gettime/settime/getres

Move them to the native implementations and get rid of the set_fs() hackery.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-13-viro@ZenIV.linux.org.uk
---
 kernel/compat.c            | 51 -----------------------------------
 kernel/time/posix-stubs.c  | 53 +++++++++++++++++++++++++++++++++++++
 kernel/time/posix-timers.c | 66 +++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 115 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index c349417d2c40..582c38bfdf60 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -598,57 +598,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
 	return sys_timer_create(which_clock, event, created_timer_id);
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
-		       struct compat_timespec __user *, tp)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct timespec ts;
-
-	if (compat_get_timespec(&ts, tp))
-		return -EFAULT;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_clock_settime(which_clock,
-				(struct timespec __user *) &ts);
-	set_fs(oldfs);
-	return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
-		       struct compat_timespec __user *, tp)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct timespec ts;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_clock_gettime(which_clock,
-				(struct timespec __user *) &ts);
-	set_fs(oldfs);
-	if (!err && compat_put_timespec(&ts, tp))
-		return -EFAULT;
-	return err;
-}
-
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
-		       struct compat_timespec __user *, tp)
-{
-	long err;
-	mm_segment_t oldfs;
-	struct timespec ts;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = sys_clock_getres(which_clock,
-			       (struct timespec __user *) &ts);
-	set_fs(oldfs);
-	if (!err && tp && compat_put_timespec(&ts, tp))
-		return -EFAULT;
-	return err;
-}
-
 /*
  * We currently only need the following fields from the sigevent
  * structure: sigev_value, sigev_signo, sig_notify and (sometimes
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 7f88517461e8..a375c31cb352 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -137,6 +137,59 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 }
 
 #ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
+{
+	struct timespec64 new_tp64;
+	struct timespec new_tp;
+
+	if (which_clock != CLOCK_REALTIME)
+		return -EINVAL;
+	if (compat_get_timespec(&new_tp, tp))
+		return -EFAULT;
+
+	new_tp64 = timespec_to_timespec64(new_tp);
+	return do_sys_settimeofday64(&new_tp64, NULL);
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+		       struct compat_timespec __user *,tp)
+{
+	struct timespec64 kernel_tp64;
+	struct timespec kernel_tp;
+
+	switch (which_clock) {
+	case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
+	case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
+	case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
+	default: return -EINVAL;
+	}
+
+	kernel_tp = timespec64_to_timespec(kernel_tp64);
+	if (compat_put_timespec(&kernel_tp, tp))
+		return -EFAULT;
+	return 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
+{
+	struct timespec rtn_tp = {
+		.tv_sec = 0,
+		.tv_nsec = hrtimer_resolution,
+	};
+
+	switch (which_clock) {
+	case CLOCK_REALTIME:
+	case CLOCK_MONOTONIC:
+	case CLOCK_BOOTTIME:
+		if (compat_put_timespec(&rtn_tp, tp))
+			return -EFAULT;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
 COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		       struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index e82bb1fd614e..61a5fb91a3c7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1082,8 +1082,66 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
 	return err;
 }
 
+SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+		struct timespec __user *, tp)
+{
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 rtn_tp64;
+	struct timespec rtn_tp;
+	int error;
+
+	if (!kc)
+		return -EINVAL;
+
+	error = kc->clock_getres(which_clock, &rtn_tp64);
+	rtn_tp = timespec64_to_timespec(rtn_tp64);
+
+	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+		error = -EFAULT;
+
+	return error;
+}
+
 #ifdef CONFIG_COMPAT
 
+COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
+{
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 new_tp64;
+	struct timespec new_tp;
+
+	if (!kc || !kc->clock_set)
+		return -EINVAL;
+
+	if (compat_get_timespec(&new_tp, tp))
+		return -EFAULT;
+
+	new_tp64 = timespec_to_timespec64(new_tp);
+
+	return kc->clock_set(which_clock, &new_tp64);
+}
+
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
+{
+	const struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 kernel_tp64;
+	struct timespec kernel_tp;
+	int error;
+
+	if (!kc)
+		return -EINVAL;
+
+	error = kc->clock_get(which_clock, &kernel_tp64);
+	kernel_tp = timespec64_to_timespec(kernel_tp64);
+
+	if (!error && compat_put_timespec(&kernel_tp, tp))
+		error = -EFAULT;
+
+	return error;
+}
+
 COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
 		       struct compat_timex __user *, utp)
 {
@@ -1107,10 +1165,9 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
 
 	return err;
 }
-#endif
 
-SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
-		struct timespec __user *, tp)
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct timespec64 rtn_tp64;
@@ -1123,11 +1180,12 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 	error = kc->clock_getres(which_clock, &rtn_tp64);
 	rtn_tp = timespec64_to_timespec(rtn_tp64);
 
-	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+	if (!error && tp && compat_put_timespec(&rtn_tp, tp))
 		error = -EFAULT;
 
 	return error;
 }
+#endif
 
 /*
  * nanosleep for monotonic and realtime clocks
-- 
cgit v1.3-14-g43fede


From 2482097c6c0f01ad74c9b2cff120a519ac59846e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:39 +0100
Subject: posix-timers: Move compat_timer_create() to native, get rid of
 set_fs()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-14-viro@ZenIV.linux.org.uk
---
 kernel/compat.c            | 18 --------------
 kernel/time/posix-stubs.c  |  1 +
 kernel/time/posix-timers.c | 59 ++++++++++++++++++++++++++++++++--------------
 3 files changed, 42 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 582c38bfdf60..4544eb63edfa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -580,24 +580,6 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
-		       struct compat_sigevent __user *, timer_event_spec,
-		       timer_t __user *, created_timer_id)
-{
-	struct sigevent __user *event = NULL;
-
-	if (timer_event_spec) {
-		struct sigevent kevent;
-
-		event = compat_alloc_user_space(sizeof(*event));
-		if (get_compat_sigevent(&kevent, timer_event_spec) ||
-		    copy_to_user(event, &kevent, sizeof(*event)))
-			return -EFAULT;
-	}
-
-	return sys_timer_create(which_clock, event, created_timer_id);
-}
-
 /*
  * We currently only need the following fields from the sigevent
  * structure: sigev_value, sigev_signo, sig_notify and (sometimes
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index a375c31cb352..38f3b20efa29 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,6 +41,7 @@ SYS_NI(setitimer);
 #ifdef __ARCH_WANT_SYS_ALARM
 SYS_NI(alarm);
 #endif
+COMPAT_SYS_NI(timer_create);
 COMPAT_SYS_NI(clock_adjtime);
 COMPAT_SYS_NI(timer_settime);
 COMPAT_SYS_NI(timer_gettime);
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 61a5fb91a3c7..c9f45a84fb8b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -490,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer)
 }
 
 /* Create a POSIX.1b interval timer. */
-
-SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
-		struct sigevent __user *, timer_event_spec,
-		timer_t __user *, created_timer_id)
+static int do_timer_create(clockid_t which_clock, struct sigevent *event,
+			   timer_t __user *created_timer_id)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
 	struct k_itimer *new_timer;
 	int error, new_timer_id;
-	sigevent_t event;
 	int it_id_set = IT_ID_NOT_SET;
 
 	if (!kc)
@@ -523,29 +520,25 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
 	new_timer->kclock = kc;
 	new_timer->it_overrun = -1;
 
-	if (timer_event_spec) {
-		if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
-			error = -EFAULT;
-			goto out;
-		}
+	if (event) {
 		rcu_read_lock();
-		new_timer->it_pid = get_pid(good_sigevent(&event));
+		new_timer->it_pid = get_pid(good_sigevent(event));
 		rcu_read_unlock();
 		if (!new_timer->it_pid) {
 			error = -EINVAL;
 			goto out;
 		}
+		new_timer->it_sigev_notify     = event->sigev_notify;
+		new_timer->sigq->info.si_signo = event->sigev_signo;
+		new_timer->sigq->info.si_value = event->sigev_value;
 	} else {
-		memset(&event.sigev_value, 0, sizeof(event.sigev_value));
-		event.sigev_notify = SIGEV_SIGNAL;
-		event.sigev_signo = SIGALRM;
-		event.sigev_value.sival_int = new_timer->it_id;
+		new_timer->it_sigev_notify     = SIGEV_SIGNAL;
+		new_timer->sigq->info.si_signo = SIGALRM;
+		memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t));
+		new_timer->sigq->info.si_value.sival_int = new_timer->it_id;
 		new_timer->it_pid = get_pid(task_tgid(current));
 	}
 
-	new_timer->it_sigev_notify     = event.sigev_notify;
-	new_timer->sigq->info.si_signo = event.sigev_signo;
-	new_timer->sigq->info.si_value = event.sigev_value;
 	new_timer->sigq->info.si_tid   = new_timer->it_id;
 	new_timer->sigq->info.si_code  = SI_TIMER;
 
@@ -576,6 +569,36 @@ out:
 	return error;
 }
 
+SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
+		struct sigevent __user *, timer_event_spec,
+		timer_t __user *, created_timer_id)
+{
+	if (timer_event_spec) {
+		sigevent_t event;
+
+		if (copy_from_user(&event, timer_event_spec, sizeof (event)))
+			return -EFAULT;
+		return do_timer_create(which_clock, &event, created_timer_id);
+	}
+	return do_timer_create(which_clock, NULL, created_timer_id);
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
+		       struct compat_sigevent __user *, timer_event_spec,
+		       timer_t __user *, created_timer_id)
+{
+	if (timer_event_spec) {
+		sigevent_t event;
+
+		if (get_compat_sigevent(&event, timer_event_spec))
+			return -EFAULT;
+		return do_timer_create(which_clock, &event, created_timer_id);
+	}
+	return do_timer_create(which_clock, NULL, created_timer_id);
+}
+#endif
+
 /*
  * Locking issues: We need to protect the result of the id look up until
  * we get the timer locked down so it is not deleted under us.  The
-- 
cgit v1.3-14-g43fede


From b180db2c8ca6692a10b79631cadc18d03303d494 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:40 +0100
Subject: time: Move compat_time()/stime() to native

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-15-viro@ZenIV.linux.org.uk
---
 kernel/compat.c    | 40 ----------------------------------------
 kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 4544eb63edfa..aa7b9a27f9e7 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -739,46 +739,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 	return ret;
 }
 
-#ifdef __ARCH_WANT_COMPAT_SYS_TIME
-
-/* compat_time_t is a 32 bit "long" and needs to get converted. */
-
-COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
-{
-	compat_time_t i;
-	struct timeval tv;
-
-	do_gettimeofday(&tv);
-	i = tv.tv_sec;
-
-	if (tloc) {
-		if (put_user(i,tloc))
-			return -EFAULT;
-	}
-	force_successful_syscall_return();
-	return i;
-}
-
-COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
-{
-	struct timespec tv;
-	int err;
-
-	if (get_user(tv.tv_sec, tptr))
-		return -EFAULT;
-
-	tv.tv_nsec = 0;
-
-	err = security_settime(&tv, NULL);
-	if (err)
-		return err;
-
-	do_settimeofday(&tv);
-	return 0;
-}
-
-#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
-
 #ifdef CONFIG_NUMA
 COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
 		       compat_uptr_t __user *, pages32,
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 400662f16c5a..e5d44999ff78 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -100,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
 
 #endif /* __ARCH_WANT_SYS_TIME */
 
+#ifdef CONFIG_COMPAT
+#ifdef __ARCH_WANT_COMPAT_SYS_TIME
+
+/* compat_time_t is a 32 bit "long" and needs to get converted. */
+COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
+{
+	struct timeval tv;
+	compat_time_t i;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
+
+	if (tloc) {
+		if (put_user(i,tloc))
+			return -EFAULT;
+	}
+	force_successful_syscall_return();
+	return i;
+}
+
+COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
+{
+	struct timespec tv;
+	int err;
+
+	if (get_user(tv.tv_sec, tptr))
+		return -EFAULT;
+
+	tv.tv_nsec = 0;
+
+	err = security_settime(&tv, NULL);
+	if (err)
+		return err;
+
+	do_settimeofday(&tv);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+#endif
+
 SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
 		struct timezone __user *, tz)
 {
-- 
cgit v1.3-14-g43fede


From 2b2d02856b3176701c91d723356f766d6ee27853 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 7 Jun 2017 09:42:41 +0100
Subject: time: Move compat_gettimeofday()/settimeofday() to native

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170607084241.28657-16-viro@ZenIV.linux.org.uk
---
 kernel/compat.c    | 38 --------------------------------------
 kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index aa7b9a27f9e7..ebd8bdc3fd68 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -92,44 +92,6 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc)
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
-		       struct timezone __user *, tz)
-{
-	if (tv) {
-		struct timeval ktv;
-		do_gettimeofday(&ktv);
-		if (compat_put_timeval(&ktv, tv))
-			return -EFAULT;
-	}
-	if (tz) {
-		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
-			return -EFAULT;
-	}
-
-	return 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
-		       struct timezone __user *, tz)
-{
-	struct timespec64 new_ts;
-	struct timeval user_tv;
-	struct timezone new_tz;
-
-	if (tv) {
-		if (compat_get_timeval(&user_tv, tv))
-			return -EFAULT;
-		new_ts.tv_sec = user_tv.tv_sec;
-		new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
-	}
-	if (tz) {
-		if (copy_from_user(&new_tz, tz, sizeof(*tz)))
-			return -EFAULT;
-	}
-
-	return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
-}
-
 static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
 {
 	return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
diff --git a/kernel/time/time.c b/kernel/time/time.c
index e5d44999ff78..7c89e437c4d7 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -257,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
 	return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
+{
+	if (tv) {
+		struct timeval ktv;
+
+		do_gettimeofday(&ktv);
+		if (compat_put_timeval(&ktv, tv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
+		       struct timezone __user *, tz)
+{
+	struct timespec64 new_ts;
+	struct timeval user_tv;
+	struct timezone new_tz;
+
+	if (tv) {
+		if (compat_get_timeval(&user_tv, tv))
+			return -EFAULT;
+		new_ts.tv_sec = user_tv.tv_sec;
+		new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
+	}
+	if (tz) {
+		if (copy_from_user(&new_tz, tz, sizeof(*tz)))
+			return -EFAULT;
+	}
+
+	return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+}
+#endif
+
 SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
 {
 	struct timex txc;		/* Local copy of parameter */
-- 
cgit v1.3-14-g43fede


From 343d8fc208e43e50525ae6e0fc4845b9966b7318 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Jun 2017 23:29:14 +0200
Subject: posix-cpu-timers: Avoid timespec conversion in do_cpu_nanosleep()

No point in converting the expiry time back and forth.

No point either in updating the value in the caller supplied variable. Mark
the rqtp argument const.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/posix-cpu-timers.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 515148d4eeb1..3adfa42ca24c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1227,10 +1227,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
-			    struct timespec64 *rqtp)
+			    const struct timespec64 *rqtp)
 {
-	struct k_itimer timer;
 	struct itimerspec64 it;
+	struct k_itimer timer;
+	u64 expires;
 	int error;
 
 	/*
@@ -1279,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		/*
 		 * We were interrupted by a signal.
 		 */
-		*rqtp = ns_to_timespec64(timer.it.cpu.expires);
+		expires = timer.it.cpu.expires;
 		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
 		if (!error) {
 			/*
@@ -1312,7 +1313,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * Report back to the user the time still remaining.
 		 */
 		restart = &current->restart_block;
-		restart->nanosleep.expires = timespec64_to_ns(rqtp);
+		restart->nanosleep.expires = expires;
 		if (restart->nanosleep.type != TT_NONE) {
 			struct timespec ts;
 
-- 
cgit v1.3-14-g43fede


From 938e7cf2d569833a5acf689a8926faf507826253 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Jun 2017 23:34:33 +0200
Subject: posix-timers: Make nanosleep timespec argument const

No nanosleep implementation modifies the rqtp argument. Mark it const.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/hrtimer.h        | 2 +-
 kernel/time/alarmtimer.c       | 2 +-
 kernel/time/hrtimer.c          | 2 +-
 kernel/time/posix-cpu-timers.c | 4 ++--
 kernel/time/posix-timers.c     | 4 ++--
 kernel/time/posix-timers.h     | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d83b7ed1cb0e..255edd5e7a74 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -454,7 +454,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 /* Precise sleep: */
 
 extern int nanosleep_copyout(struct restart_block *, struct timespec *);
-extern long hrtimer_nanosleep(struct timespec64 *rqtp,
+extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7bed4e44f9bd..c991cf212c6d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -753,7 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
  * Handles clock_nanosleep calls against _ALARM clockids
  */
 static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec64 *tsreq)
+			      const struct timespec64 *tsreq)
 {
 	enum  alarmtimer_type type = clock2alarm(which_clock);
 	struct restart_block *restart = &current->restart_block;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 45f83cc7c0c7..81da124f1115 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1510,7 +1510,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 	return ret;
 }
 
-long hrtimer_nanosleep(struct timespec64 *rqtp,
+long hrtimer_nanosleep(const struct timespec64 *rqtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3adfa42ca24c..9df618ee64cf 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1328,7 +1328,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-			    struct timespec64 *rqtp)
+			    const struct timespec64 *rqtp)
 {
 	struct restart_block *restart_block = &current->restart_block;
 	int error;
@@ -1383,7 +1383,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec64 *rqtp)
+			      const struct timespec64 *rqtp)
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
 }
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index c9f45a84fb8b..82d67be7d9d1 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1214,9 +1214,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
  * nanosleep for monotonic and realtime clocks
  */
 static int common_nsleep(const clockid_t which_clock, int flags,
-			 struct timespec64 *tsave)
+			 const struct timespec64 *rqtp)
 {
-	return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ?
+	return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 				 which_clock);
 }
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 5e69bb85629f..fb303c3be4d3 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -10,7 +10,7 @@ struct k_clock {
 	int	(*clock_adj)(const clockid_t which_clock, struct timex *tx);
 	int	(*timer_create)(struct k_itimer *timer);
 	int	(*nsleep)(const clockid_t which_clock, int flags,
-			  struct timespec64 *);
+			  const struct timespec64 *);
 	int	(*timer_set)(struct k_itimer *timr, int flags,
 			     struct itimerspec64 *new_setting,
 			     struct itimerspec64 *old_setting);
-- 
cgit v1.3-14-g43fede


From 31fd85816dbe3a714bcc3f67c17c3dd87011f79e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 13 Jun 2017 15:52:13 -0700
Subject: bpf: permits narrower load from bpf program context fields

Currently, the verifier will reject a program if it contains a
narrower load from the bpf context structure. For example,
        __u8 h = __sk_buff->hash, or
        __u16 p = __sk_buff->protocol
        __u32 sample_period = bpf_perf_event_data->sample_period
which are narrower loads of 4-byte or 8-byte fields.

This patch solves the issue by:
  . Introduce a new parameter ctx_field_size to carry the
    field size of narrower load from prog type
    specific *__is_valid_access validator back to verifier.
  . The non-zero ctx_field_size for a memory access indicates
    (1). underlying prog type specific convert_ctx_accesses
         supporting non-whole-field access
    (2). the current insn is a narrower or whole field access.
  . In verifier, for such loads where load memory size is
    less than ctx_field_size, verifier transforms it
    to a full field load followed by proper masking.
  . Currently, __sk_buff and bpf_perf_event_data->sample_period
    are supporting narrowing loads.
  . Narrower stores are still not allowed as typical ctx stores
    are just normal stores.

Because of this change, some tests in verifier will fail and
these tests are removed. As a bonus, rename some out of bound
__sk_buff->cb access to proper field name and remove two
redundant "skb cb oob" tests.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h                         |  2 +-
 include/linux/bpf_verifier.h                |  1 +
 kernel/bpf/verifier.c                       | 71 ++++++++++++++++------
 kernel/trace/bpf_trace.c                    | 21 +++++--
 net/core/filter.c                           | 56 +++++++++++++-----
 tools/testing/selftests/bpf/test_verifier.c | 92 ++++-------------------------
 6 files changed, 124 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c32bace66d3d..1bcbf0a71f75 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -157,7 +157,7 @@ struct bpf_verifier_ops {
 	 * with 'type' (read or write) is allowed
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
-				enum bpf_reg_type *reg_type);
+				enum bpf_reg_type *reg_type, int *ctx_field_size);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index d5093b52b485..189741c0da85 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -73,6 +73,7 @@ struct bpf_insn_aux_data {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
 		struct bpf_map *map_ptr;	/* pointer for call insn into lookup_elem */
 	};
+	int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 519a6144d3d3..44b97d958fb7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -758,15 +758,26 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 }
 
 /* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
+	int ctx_field_size = 0;
+
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
 		return 0;
 
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) {
+		/* a non zero ctx_field_size indicates:
+		 * . For this field, the prog type specific ctx conversion algorithm
+		 *   only supports whole field access.
+		 * . This ctx access is a candidate for later verifier transformation
+		 *   to load the whole field and then apply a mask to get correct result.
+		 */
+		if (ctx_field_size)
+			env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size;
+
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -868,7 +879,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
  * if t==write && value_regno==-1, some unknown value is stored into memory
  * if t==read && value_regno==-1, don't care what we read from memory
  */
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
@@ -911,7 +922,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			verbose("R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
-		err = check_ctx_access(env, off, size, t, &reg_type);
+		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value_and_range(state->regs,
 							 value_regno);
@@ -972,7 +983,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
 	struct bpf_reg_state *regs = env->cur_state.regs;
 	int err;
@@ -994,13 +1005,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	/* check whether atomic_add can read the memory */
-	err = check_mem_access(env, insn->dst_reg, insn->off,
+	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1);
 	if (err)
 		return err;
 
 	/* check whether atomic_add can write into the same memory */
-	return check_mem_access(env, insn->dst_reg, insn->off,
+	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 				BPF_SIZE(insn->code), BPF_WRITE, -1);
 }
 
@@ -1416,7 +1427,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	 * is inferred from register state.
 	 */
 	for (i = 0; i < meta.access_size; i++) {
-		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
 		if (err)
 			return err;
 	}
@@ -2993,18 +3004,12 @@ static int do_check(struct bpf_verifier_env *env)
 			/* check that memory (src_reg + off) is readable,
 			 * the state of dst_reg will be updated by this func
 			 */
-			err = check_mem_access(env, insn->src_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_READ,
 					       insn->dst_reg);
 			if (err)
 				return err;
 
-			if (BPF_SIZE(insn->code) != BPF_W &&
-			    BPF_SIZE(insn->code) != BPF_DW) {
-				insn_idx++;
-				continue;
-			}
-
 			prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
 
 			if (*prev_src_type == NOT_INIT) {
@@ -3032,7 +3037,7 @@ static int do_check(struct bpf_verifier_env *env)
 			enum bpf_reg_type *prev_dst_type, dst_reg_type;
 
 			if (BPF_MODE(insn->code) == BPF_XADD) {
-				err = check_xadd(env, insn);
+				err = check_xadd(env, insn_idx, insn);
 				if (err)
 					return err;
 				insn_idx++;
@@ -3051,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env)
 			dst_reg_type = regs[insn->dst_reg].type;
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn->dst_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
 					       insn->src_reg);
 			if (err)
@@ -3080,7 +3085,7 @@ static int do_check(struct bpf_verifier_env *env)
 				return err;
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn->dst_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
 					       -1);
 			if (err)
@@ -3383,7 +3388,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, delta = 0;
+	int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3423,11 +3428,39 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 
+		off = insn->off;
+		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+		is_narrower_load = (type == BPF_READ && size < ctx_field_size);
+
+		/* If the read access is a narrower load of the field,
+		 * convert to a 4/8-byte load, to minimum program type specific
+		 * convert to a 4/8-byte load, to minimize program type specific
+		 * we will apply proper mask to the result.
+		 */
+		if (is_narrower_load) {
+			int size_code = BPF_H;
+
+			if (ctx_field_size == 4)
+				size_code = BPF_W;
+			else if (ctx_field_size == 8)
+				size_code = BPF_DW;
+			insn->off = off & ~(ctx_field_size - 1);
+			insn->code = BPF_LDX | BPF_MEM | size_code;
+		}
 		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
+		if (is_narrower_load) {
+			if (ctx_field_size <= 4)
+				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+							(1 << size * 8) - 1);
+			else
+				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+							(1 << size * 8) - 1);
+		}
 
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 		if (!new_prog)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 051d7fca0c09..9d3ec8253131 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 
 /* bpf+kprobe programs can access fields of 'struct pt_regs' */
 static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-					enum bpf_reg_type *reg_type)
+					enum bpf_reg_type *reg_type, int *ctx_field_size)
 {
 	if (off < 0 || off >= sizeof(struct pt_regs))
 		return false;
@@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 }
 
 static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-				    enum bpf_reg_type *reg_type)
+				    enum bpf_reg_type *reg_type, int *ctx_field_size)
 {
 	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
 		return false;
@@ -581,17 +581,26 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
 };
 
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-				    enum bpf_reg_type *reg_type)
+				    enum bpf_reg_type *reg_type, int *ctx_field_size)
 {
+	int sample_period_off;
+
 	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
 		return false;
 	if (type != BPF_READ)
 		return false;
 	if (off % size != 0)
 		return false;
-	if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
-		if (size != sizeof(u64))
-			return false;
+
+	/* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */
+	sample_period_off = offsetof(struct bpf_perf_event_data, sample_period);
+	if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) {
+		*ctx_field_size = 8;
+#ifdef __LITTLE_ENDIAN
+		return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0;
+#else
+		return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0;
+#endif
 	} else {
 		if (size != sizeof(long))
 			return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index a65a3b25e104..60ed6f343a63 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2856,7 +2856,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-static bool __is_valid_access(int off, int size)
+static bool __is_valid_access(int off, int size, enum bpf_access_type type,
+			      int *ctx_field_size)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
@@ -2872,9 +2873,27 @@ static bool __is_valid_access(int off, int size)
 		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
 			return false;
 		break;
-	default:
+	case offsetof(struct __sk_buff, data) ...
+	     offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, data_end) ...
+	     offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
 		if (size != sizeof(__u32))
 			return false;
+		break;
+	default:
+		/* permit narrower load for not cb/data/data_end fields */
+		*ctx_field_size = 4;
+		if (type == BPF_WRITE) {
+			if (size != sizeof(__u32))
+				return false;
+		} else {
+			if (size != sizeof(__u32))
+#ifdef __LITTLE_ENDIAN
+				return (off & 0x3) == 0 && (size == 1 || size == 2);
+#else
+				return (off & 0x3) + size == 4 && (size == 1 || size == 2);
+#endif
+		}
 	}
 
 	return true;
@@ -2882,12 +2901,16 @@ static bool __is_valid_access(int off, int size)
 
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
-				      enum bpf_reg_type *reg_type)
+				      enum bpf_reg_type *reg_type,
+				      int *ctx_field_size)
 {
 	switch (off) {
-	case offsetof(struct __sk_buff, tc_classid):
-	case offsetof(struct __sk_buff, data):
-	case offsetof(struct __sk_buff, data_end):
+	case offsetof(struct __sk_buff, tc_classid) ...
+	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, data) ...
+	     offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, data_end) ...
+	     offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
 		return false;
 	}
 
@@ -2901,15 +2924,17 @@ static bool sk_filter_is_valid_access(int off, int size,
 		}
 	}
 
-	return __is_valid_access(off, size);
+	return __is_valid_access(off, size, type, ctx_field_size);
 }
 
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
-				enum bpf_reg_type *reg_type)
+				enum bpf_reg_type *reg_type,
+				int *ctx_field_size)
 {
 	switch (off) {
-	case offsetof(struct __sk_buff, tc_classid):
+	case offsetof(struct __sk_buff, tc_classid) ...
+	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
 		return false;
 	}
 
@@ -2934,12 +2959,13 @@ static bool lwt_is_valid_access(int off, int size,
 		break;
 	}
 
-	return __is_valid_access(off, size);
+	return __is_valid_access(off, size, type, ctx_field_size);
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
 					enum bpf_access_type type,
-					enum bpf_reg_type *reg_type)
+					enum bpf_reg_type *reg_type,
+					int *ctx_field_size)
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -3002,7 +3028,8 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
-				       enum bpf_reg_type *reg_type)
+				       enum bpf_reg_type *reg_type,
+				       int *ctx_field_size)
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -3027,7 +3054,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		break;
 	}
 
-	return __is_valid_access(off, size);
+	return __is_valid_access(off, size, type, ctx_field_size);
 }
 
 static bool __is_valid_xdp_access(int off, int size)
@@ -3044,7 +3071,8 @@ static bool __is_valid_xdp_access(int off, int size)
 
 static bool xdp_is_valid_access(int off, int size,
 				enum bpf_access_type type,
-				enum bpf_reg_type *reg_type)
+				enum bpf_reg_type *reg_type,
+				int *ctx_field_size)
 {
 	if (type == BPF_WRITE)
 		return false;
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 4ee4708b0d60..13341700930c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1073,44 +1073,22 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 	},
 	{
-		"check cb access: byte, oob 1",
+		"__sk_buff->hash, offset 0, byte store not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[4]) + 4),
+				    offsetof(struct __sk_buff, hash)),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
 		.result = REJECT,
 	},
 	{
-		"check cb access: byte, oob 2",
+		"__sk_buff->tc_index, offset 3, byte store not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[0]) - 1),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: byte, oob 3",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[4]) + 4),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: byte, oob 4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[0]) - 1),
+				    offsetof(struct __sk_buff, tc_index) + 3),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
@@ -1188,44 +1166,22 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
-		"check cb access: half, oob 1",
+		"check __sk_buff->hash, offset 0, half store not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[4]) + 4),
+				    offsetof(struct __sk_buff, hash)),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
 		.result = REJECT,
 	},
 	{
-		"check cb access: half, oob 2",
+		"check __sk_buff->tc_index, offset 2, half store not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[0]) - 2),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: half, oob 3",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[4]) + 4),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: half, oob 4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[0]) - 2),
+				    offsetof(struct __sk_buff, tc_index) + 2),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
@@ -1366,28 +1322,6 @@ static struct bpf_test tests[] = {
 	},
 	{
 		"check cb access: double, oob 2",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[4]) + 8),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: double, oob 3",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
-				    offsetof(struct __sk_buff, cb[0]) - 8),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "invalid bpf_context access",
-		.result = REJECT,
-	},
-	{
-		"check cb access: double, oob 4",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
@@ -1398,22 +1332,22 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
-		"check cb access: double, oob 5",
+		"check __sk_buff->ifindex dw store not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
-			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[4]) + 8),
+			BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+				    offsetof(struct __sk_buff, ifindex)),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
 		.result = REJECT,
 	},
 	{
-		"check cb access: double, oob 6",
+		"check __sk_buff->ifindex dw load not permitted",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_0, 0),
 			BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
-				    offsetof(struct __sk_buff, cb[0]) - 8),
+				    offsetof(struct __sk_buff, ifindex)),
 			BPF_EXIT_INSN(),
 		},
 		.errstr = "invalid bpf_context access",
-- 
cgit v1.3-14-g43fede


From 73a7242a06ff995d771fbe243e72b516feaa6e3d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:01 -0400
Subject: cgroup: Keep accurate count of tasks in each css_set

The reference count in the css_set data structure was used as a
proxy of the number of tasks attached to that css_set. However, that
count is actually not an accurate measure especially with thread mode
support. So a new variable nr_tasks is added to the css_set to keep
track of the actual task count. This new variable is protected by
the css_set_lock. Functions that require the actual task count are
updated to use the new variable.

tj: s/task_count/nr_tasks/ for consistency with cgroup_root->nr_cgrps.
    Refreshed on top of cgroup/for-v4.13, which dropped the
    css_set_populated() -> nr_tasks conversion.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup/cgroup-v1.c   |  6 +-----
 kernel/cgroup/cgroup.c      | 10 ++++++++++
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ec47101cb1bf..3bc4196bf217 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -166,6 +166,9 @@ struct css_set {
 	/* the default cgroup associated with this css_set */
 	struct cgroup *dfl_cgrp;
 
+	/* internal task count, protected by css_set_lock */
+	int nr_tasks;
+
 	/*
 	 * Lists running through all tasks using this cgroup group.
 	 * mg_tasks lists tasks which belong to this cset but are in the
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..e9ea5f201fac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,10 +334,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
@@ -346,7 +342,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += refcount_read(&link->cset->refcount);
+		count += link->cset->nr_tasks;
 	spin_unlock_irq(&css_set_lock);
 	return count;
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..dbfd7028b1c6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count	= 1;	/* 1 for init_css_set */
 /**
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
  */
 static bool css_set_populated(struct css_set *cset)
 {
@@ -1598,6 +1603,7 @@ static void cgroup_enable_task_cg_lists(void)
 				css_set_update_populated(cset, true);
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
+			cset->nr_tasks++;
 		}
 		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
@@ -2064,8 +2070,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 			struct css_set *to_cset = cset->mg_dst_cset;
 
 			get_css_set(to_cset);
+			to_cset->nr_tasks++;
 			css_set_move_task(task, from_cset, to_cset, true);
 			put_css_set_locked(from_cset);
+			from_cset->nr_tasks--;
 		}
 	}
 	spin_unlock_irq(&css_set_lock);
@@ -4789,6 +4797,7 @@ void cgroup_post_fork(struct task_struct *child)
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
+			cset->nr_tasks++;
 			css_set_move_task(child, NULL, cset, false);
 		}
 		spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4847,7 @@ void cgroup_exit(struct task_struct *tsk)
 	if (!list_empty(&tsk->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
+		cset->nr_tasks--;
 		spin_unlock_irq(&css_set_lock);
 	} else {
 		get_css_set(cset);
-- 
cgit v1.3-14-g43fede


From a28f8f5e995fe5964ae304444913536058f26e37 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:02 -0400
Subject: cgroup: Move debug cgroup to its own file

The debug cgroup currently resides within cgroup-v1.c and is enabled
only for v1 cgroup. To enable the debug cgroup also for v2, it makes
sense to put the code into its own file as it will no longer be v1
specific. There is no change to the debug cgroup specific code.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/Makefile          |   1 +
 kernel/cgroup/cgroup-internal.h |   2 +
 kernel/cgroup/cgroup-v1.c       | 149 +-------------------------------------
 kernel/cgroup/debug.c           | 153 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 148 deletions(-)
 create mode 100644 kernel/cgroup/debug.c

(limited to 'kernel')

diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..793565c05742 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 		     struct kernfs_root *kf_root);
 
+int cgroup_task_count(const struct cgroup *cgrp);
+
 /*
  * namespace.c
  */
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e9ea5f201fac..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -335,7 +335,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  */
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
 	struct cgrp_cset_link *link;
@@ -1259,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
 	return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = refcount_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-
-		seq_printf(seq, "css_set %pK\n", cset);
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
-	}
-	spin_unlock_irq(&css_set_lock);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..1c209aa43733
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,153 @@
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);
+}
+
+static u64 current_css_set_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = refcount_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
+
+		seq_printf(seq, "css_set %pK\n", cset);
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
+				goto overflow;
+			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+		}
+		continue;
+	overflow:
+		seq_puts(seq, "  ...\n");
+	}
+	spin_unlock_irq(&css_set_lock);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.read_u64 = current_css_set_read,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc = debug_css_alloc,
+	.css_free = debug_css_free,
+	.legacy_cftypes = debug_files,
+};
-- 
cgit v1.3-14-g43fede


From 23b0be480f341db26ce0dee7d3f6e67f8e0e166f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:03 -0400
Subject: cgroup: Make Kconfig prompt of debug cgroup more accurate

Make the Kconfig prompt and description of the debug cgroup controller
more accurate by saying that it is for debugging purposes only and that
its interfaces are unstable.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 init/Kconfig          | 7 +++++--
 kernel/cgroup/debug.c | 6 ++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..5f9d13929ae0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1205,11 +1205,14 @@ config CGROUP_BPF
 	  inet sockets.
 
 config CGROUP_DEBUG
-	bool "Example controller"
+	bool "Debug controller"
 	default n
+	depends on DEBUG_KERNEL
 	help
 	  This option enables a simple controller that exports
-	  debugging information about the cgroups framework.
+	  debugging information about the cgroups framework. This
+	  controller is for control cgroup debugging only. Its
+	  interfaces are not stable.
 
 	  Say N.
 
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 1c209aa43733..cbe77a2087c5 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -1,3 +1,9 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
 #include <linux/ctype.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-- 
cgit v1.3-14-g43fede


From 575313f40ff33d0c2aff2701dfb2ccfcd6211d55 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 13 Jun 2017 17:18:04 -0400
Subject: cgroup: Make debug cgroup support v2 and thread mode

Besides supporting cgroup v2 and thread mode, the following changes
are also made:
 1) current_* cgroup files now reside only at the root as we don't
    need duplicated files of the same function all over the cgroup
    hierarchy.
 2) The cgroup_css_links_read() function is modified to report
    the number of tasks that are skipped because of overflow.
 3) The number of extra unaccounted references is displayed.
 4) The current_css_set_read() function now prints out the addresses of
    the css'es associated with the current css_set.
 5) A new cgroup_subsys_states file is added to display the css objects
    associated with a cgroup.
 6) A new cgroup_masks file is added to display the various controller
    bit masks in the cgroup.

tj: Dropped thread mode related information for now so that debug
    controller changes aren't blocked on the thread mode.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/debug.c | 170 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 153 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index cbe77a2087c5..057d9b07f461 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -36,10 +36,37 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
 	return cgroup_task_count(css->cgroup);
 }
 
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
+static int current_css_set_read(struct seq_file *seq, void *v)
 {
-	return (u64)(unsigned long)current->cgroups;
+	struct css_set *cset;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	int i, refcnt;
+
+	mutex_lock(&cgroup_mutex);
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	refcnt = refcount_read(&cset->refcount);
+	seq_printf(seq, "css_set %pK %d", cset, refcnt);
+	if (refcnt > cset->nr_tasks)
+		seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+	seq_puts(seq, "\n");
+
+	/*
+	 * Print the css'es stored in the current css_set.
+	 */
+	for_each_subsys(ss, i) {
+		css = cset->subsys[ss->id];
+		if (!css)
+			continue;
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+			  (unsigned long)css, css->id);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	mutex_unlock(&cgroup_mutex);
+	return 0;
 }
 
 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
@@ -84,31 +111,126 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
+	int dead_cnt = 0, extra_refs = 0;
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
+		int refcnt = refcount_read(&cset->refcount);
 
-		seq_printf(seq, "css_set %pK\n", cset);
+		seq_printf(seq, " %d", refcnt);
+		if (refcnt - cset->nr_tasks > 0) {
+			int extra = refcnt - cset->nr_tasks;
+
+			seq_printf(seq, " +%d", extra);
+			/*
+			 * Take out the one additional reference in
+			 * init_css_set.
+			 */
+			if (cset == &init_css_set)
+				extra--;
+			extra_refs += extra;
+		}
+		seq_puts(seq, "\n");
 
 		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
 		}
 
 		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
 		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
+		/* show # of overflowed tasks */
+		if (count > MAX_TASKS_SHOWN_PER_CSS)
+			seq_printf(seq, "  ... (%d)\n",
+				   count - MAX_TASKS_SHOWN_PER_CSS);
+
+		if (cset->dead) {
+			seq_puts(seq, "    [dead]\n");
+			dead_cnt++;
+		}
+
+		WARN_ON(count != cset->nr_tasks);
 	}
 	spin_unlock_irq(&css_set_lock);
+
+	if (!dead_cnt && !extra_refs)
+		return 0;
+
+	seq_puts(seq, "\n");
+	if (extra_refs)
+		seq_printf(seq, "extra references = %d\n", extra_refs);
+	if (dead_cnt)
+		seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+	return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	char pbuf[16];
+	int i;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(ss, i) {
+		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+		if (!css)
+			continue;
+
+		pbuf[0] = '\0';
+
+		/* Show the parent CSS if applicable*/
+		if (css->parent)
+			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+				 css->parent->id);
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+			  (unsigned long)css, css->id,
+			  atomic_read(&css->online_cnt), pbuf);
+	}
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup_subsys *ss;
+	int i, j;
+	struct {
+		u16  *mask;
+		char *name;
+	} mask_list[] = {
+		{ &cgrp->subtree_control,  "subtree_control"  },
+		{ &cgrp->subtree_ss_mask,  "subtree_ss_mask"  },
+	};
+
+	mutex_lock(&cgroup_mutex);
+	for (i = 0; i < ARRAY_SIZE(mask_list); i++) {
+		u16 mask = *mask_list[i].mask;
+		bool first = true;
+
+		seq_printf(seq, "%-17s: ", mask_list[i].name);
+		for_each_subsys(ss, j) {
+			if (!(mask & (1 << ss->id)))
+				continue;
+			if (!first)
+				seq_puts(seq, ", ");
+			seq_puts(seq, ss->name);
+			first = false;
+		}
+		seq_putc(seq, '\n');
+	}
+
+	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 
@@ -126,17 +248,20 @@ static struct cftype debug_files[] =  {
 
 	{
 		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
 		.name = "current_css_set_refcount",
 		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
 		.name = "current_css_set_cg_links",
 		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
 	},
 
 	{
@@ -144,6 +269,16 @@ static struct cftype debug_files[] =  {
 		.seq_show = cgroup_css_links_read,
 	},
 
+	{
+		.name = "cgroup_subsys_states",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "cgroup_masks",
+		.seq_show = cgroup_masks_read,
+	},
+
 	{
 		.name = "releasable",
 		.read_u64 = releasable_read,
@@ -153,7 +288,8 @@ static struct cftype debug_files[] =  {
 };
 
 struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
+	.css_alloc	= debug_css_alloc,
+	.css_free	= debug_css_free,
+	.legacy_cftypes	= debug_files,
+	.dfl_cftypes	= debug_files,
 };
-- 
cgit v1.3-14-g43fede


From 8cc38fa7fa317d44710f24475576b1f9ee205da9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:32 -0400
Subject: cgroup: make debug an implicit controller on cgroup2

Make debug an implicit controller on cgroup2 which is enabled by
"cgroup_debug" boot param.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 56 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 057d9b07f461..d61e692a5338 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -240,7 +240,7 @@ static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 		!css_has_online_children(&css->cgroup->self));
 }
 
-static struct cftype debug_files[] =  {
+static struct cftype debug_legacy_files[] =  {
 	{
 		.name = "taskcount",
 		.read_u64 = debug_taskcount_read,
@@ -287,9 +287,62 @@ static struct cftype debug_files[] =  {
 	{ }	/* terminate */
 };
 
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "csses",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "masks",
+		.seq_show = cgroup_masks_read,
+	},
+
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys debug_cgrp_subsys = {
 	.css_alloc	= debug_css_alloc,
 	.css_free	= debug_css_free,
-	.legacy_cftypes	= debug_files,
-	.dfl_cftypes	= debug_files,
+	.legacy_cftypes	= debug_legacy_files,
 };
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+	debug_cgrp_subsys.dfl_cftypes = debug_files;
+	debug_cgrp_subsys.implicit_on_dfl = true;
+	return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
-- 
cgit v1.3-14-g43fede


From 2866c0b4cf25274040f0b4cc045ad1f44b885dd8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:36 -0400
Subject: cgroup: refactor cgroup_masks_read() in the debug controller

Factor cgroup_masks_read_one() out of cgroup_masks_read() for
simplicity.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index d61e692a5338..163fdbd7adf6 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -200,36 +200,32 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int cgroup_masks_read(struct seq_file *seq, void *v)
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+				  u16 mask)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	struct cgroup_subsys *ss;
-	int i, j;
-	struct {
-		u16  *mask;
-		char *name;
-	} mask_list[] = {
-		{ &cgrp->subtree_control,  "subtree_control"  },
-		{ &cgrp->subtree_ss_mask,  "subtree_ss_mask"  },
-	};
+	int ssid;
+	bool first = true;
 
-	mutex_lock(&cgroup_mutex);
-	for (i = 0; i < ARRAY_SIZE(mask_list); i++) {
-		u16 mask = *mask_list[i].mask;
-		bool first = true;
-
-		seq_printf(seq, "%-17s: ", mask_list[i].name);
-		for_each_subsys(ss, j) {
-			if (!(mask & (1 << ss->id)))
-				continue;
-			if (!first)
-				seq_puts(seq, ", ");
-			seq_puts(seq, ss->name);
-			first = false;
-		}
-		seq_putc(seq, '\n');
+	seq_printf(seq, "%-17s: ", name);
+	for_each_subsys(ss, ssid) {
+		if (!(mask & (1 << ssid)))
+			continue;
+		if (!first)
+			seq_puts(seq, ", ");
+		seq_puts(seq, ss->name);
+		first = false;
 	}
+	seq_putc(seq, '\n');
+}
 
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+	mutex_lock(&cgroup_mutex);
+	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From b6053d40e3387c916d56d319ba6bfdb2c42a67c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 14 Jun 2017 16:01:41 -0400
Subject: cgroup: fix lockdep warning in debug controller

The debug controller grabs cgroup_mutex from interface file show
functions, which can deadlock and trigger lockdep warnings.  Fix it by
using cgroup_kn_lock_live()/cgroup_kn_unlock() instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
---
 kernel/cgroup/debug.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 163fdbd7adf6..dac46af22782 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -38,12 +38,15 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
 
 static int current_css_set_read(struct seq_file *seq, void *v)
 {
+	struct kernfs_open_file *of = seq->private;
 	struct css_set *cset;
 	struct cgroup_subsys *ss;
 	struct cgroup_subsys_state *css;
 	int i, refcnt;
 
-	mutex_lock(&cgroup_mutex);
+	if (!cgroup_kn_lock_live(of->kn, false))
+		return -ENODEV;
+
 	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
@@ -65,7 +68,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
 	}
 	rcu_read_unlock();
 	spin_unlock_irq(&css_set_lock);
-	mutex_unlock(&cgroup_mutex);
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 
@@ -174,13 +177,17 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 
 static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
 	struct cgroup_subsys *ss;
 	struct cgroup_subsys_state *css;
 	char pbuf[16];
 	int i;
 
-	mutex_lock(&cgroup_mutex);
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
 	for_each_subsys(ss, i) {
 		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
 		if (!css)
@@ -196,7 +203,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
 			  (unsigned long)css, css->id,
 			  atomic_read(&css->online_cnt), pbuf);
 	}
-	mutex_unlock(&cgroup_mutex);
+
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 
@@ -221,12 +229,17 @@ static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
 
 static int cgroup_masks_read(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
 
-	mutex_lock(&cgroup_mutex);
 	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
 	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
-	mutex_unlock(&cgroup_mutex);
+
+	cgroup_kn_unlock(of->kn);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 33e4f80ee69b5168badf37edbfed796eb48434b9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 12 Jun 2017 22:56:34 +0200
Subject: ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle

The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ
during suspend-to-idle transitions and, consequently, any events
signaled through it wake up the system from that state.  However,
on some systems some of the events signaled via the ACPI SCI while
suspended to idle should not cause the system to wake up.  In fact,
quite often they should just be discarded.

Arguably, systems should not resume entirely on such events, but in
order to decide which events really should cause the system to resume
and which are spurious, it is necessary to resume up to the point
when ACPI SCIs are actually handled and processed, which is after
executing dpm_resume_noirq() in the system resume path.

For this reason, add a loop around freeze_enter() in which the
platforms can process events signaled via multiplexed IRQ lines
like the ACPI SCI and add suspend-to-idle hooks that can be
used for this purpose to struct platform_freeze_ops.

In the ACPI case, the ->wake hook is used for checking if the SCI
has triggered while suspended and deferring the interrupt-induced
system wakeup until the events signaled through it are actually
processed sufficiently to decide whether or not the system should
resume.  In turn, the ->sync hook allows all of the relevant event
queues to be flushed so as to prevent events from being missed due
to race conditions.

In addition to that, some ACPI code processing wakeup events needs
to be modified to use the "hard" version of wakeup triggers, so that
it will cause a system resume to happen on device-induced wakeup
events even if the "soft" mechanism to prevent the system from
suspending is not enabled.  However, to preserve the existing
behavior with respect to suspend-to-RAM, this is only done in
the suspend-to-idle case and only if an SCI has occurred while
suspended.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/battery.c      |  2 +-
 drivers/acpi/button.c       |  5 +++--
 drivers/acpi/device_pm.c    |  9 ++++++++-
 drivers/acpi/internal.h     |  2 ++
 drivers/acpi/sleep.c        | 37 +++++++++++++++++++++++++++++++++++++
 drivers/base/power/main.c   |  5 -----
 drivers/base/power/wakeup.c | 18 ++++++++++++------
 include/acpi/acpi_bus.h     |  6 +++++-
 include/linux/suspend.h     |  7 +++++--
 kernel/power/process.c      |  2 +-
 kernel/power/suspend.c      | 35 +++++++++++++++++++++++++++++------
 11 files changed, 103 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index d42eeef9d928..1cbb88d938e5 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -782,7 +782,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume)
 	if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) ||
 	    (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) &&
             (battery->capacity_now <= battery->alarm)))
-		pm_wakeup_event(&battery->device->dev, 0);
+		acpi_pm_wakeup_event(&battery->device->dev);
 
 	return result;
 }
diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c
index e19f530f1083..91cfdf377df7 100644
--- a/drivers/acpi/button.c
+++ b/drivers/acpi/button.c
@@ -217,7 +217,7 @@ static int acpi_lid_notify_state(struct acpi_device *device, int state)
 	}
 
 	if (state)
-		pm_wakeup_event(&device->dev, 0);
+		acpi_pm_wakeup_event(&device->dev);
 
 	ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device);
 	if (ret == NOTIFY_DONE)
@@ -402,7 +402,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event)
 		} else {
 			int keycode;
 
-			pm_wakeup_event(&device->dev, 0);
+			acpi_pm_wakeup_event(&device->dev);
 			if (button->suspended)
 				break;
 
@@ -534,6 +534,7 @@ static int acpi_button_add(struct acpi_device *device)
 		lid_device = device;
 	}
 
+	device_init_wakeup(&device->dev, true);
 	printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device));
 	return 0;
 
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index f13c62c4b117..ca0210213773 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -24,6 +24,7 @@
 #include <linux/pm_qos.h>
 #include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
+#include <linux/suspend.h>
 
 #include "internal.h"
 
@@ -385,6 +386,12 @@ EXPORT_SYMBOL(acpi_bus_power_manageable);
 #ifdef CONFIG_PM
 static DEFINE_MUTEX(acpi_pm_notifier_lock);
 
+void acpi_pm_wakeup_event(struct device *dev)
+{
+	pm_wakeup_dev_event(dev, 0, acpi_s2idle_wakeup());
+}
+EXPORT_SYMBOL_GPL(acpi_pm_wakeup_event);
+
 static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used)
 {
 	struct acpi_device *adev;
@@ -399,7 +406,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used)
 	mutex_lock(&acpi_pm_notifier_lock);
 
 	if (adev->wakeup.flags.notifier_present) {
-		__pm_wakeup_event(adev->wakeup.ws, 0);
+		pm_wakeup_ws_event(adev->wakeup.ws, 0, acpi_s2idle_wakeup());
 		if (adev->wakeup.context.func)
 			adev->wakeup.context.func(&adev->wakeup.context);
 	}
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 66229ffa909b..75924ea69071 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -198,8 +198,10 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit);
                                   Suspend/Resume
   -------------------------------------------------------------------------- */
 #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT
+extern bool acpi_s2idle_wakeup(void);
 extern int acpi_sleep_init(void);
 #else
+static inline bool acpi_s2idle_wakeup(void) { return false; }
 static inline int acpi_sleep_init(void) { return -ENXIO; }
 #endif
 
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index a4782c75ebdd..555de11a56b6 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -650,6 +650,8 @@ static const struct platform_suspend_ops acpi_suspend_ops_old = {
 	.recover = acpi_pm_finish,
 };
 
+static bool s2idle_wakeup;
+
 static int acpi_freeze_begin(void)
 {
 	acpi_scan_lock_acquire();
@@ -666,6 +668,33 @@ static int acpi_freeze_prepare(void)
 	return 0;
 }
 
+static void acpi_freeze_wake(void)
+{
+	/*
+	 * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means
+	 * that the SCI has triggered while suspended, so cancel the wakeup in
+	 * case it has not been a wakeup event (the GPEs will be checked later).
+	 */
+	if (acpi_sci_irq_valid() &&
+	    !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) {
+		pm_system_cancel_wakeup();
+		s2idle_wakeup = true;
+	}
+}
+
+static void acpi_freeze_sync(void)
+{
+	/*
+	 * Process all pending events in case there are any wakeup ones.
+	 *
+	 * The EC driver uses the system workqueue, so that one needs to be
+	 * flushed too.
+	 */
+	acpi_os_wait_events_complete();
+	flush_scheduled_work();
+	s2idle_wakeup = false;
+}
+
 static void acpi_freeze_restore(void)
 {
 	if (acpi_sci_irq_valid())
@@ -682,6 +711,8 @@ static void acpi_freeze_end(void)
 static const struct platform_freeze_ops acpi_freeze_ops = {
 	.begin = acpi_freeze_begin,
 	.prepare = acpi_freeze_prepare,
+	.wake = acpi_freeze_wake,
+	.sync = acpi_freeze_sync,
 	.restore = acpi_freeze_restore,
 	.end = acpi_freeze_end,
 };
@@ -700,9 +731,15 @@ static void acpi_sleep_suspend_setup(void)
 }
 
 #else /* !CONFIG_SUSPEND */
+#define s2idle_wakeup	(false)
 static inline void acpi_sleep_suspend_setup(void) {}
 #endif /* !CONFIG_SUSPEND */
 
+bool acpi_s2idle_wakeup(void)
+{
+	return s2idle_wakeup;
+}
+
 #ifdef CONFIG_PM_SLEEP
 static u32 saved_bm_rld;
 
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 253f860e8981..ef5b6a6e5045 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1095,11 +1095,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 	if (async_error)
 		goto Complete;
 
-	if (pm_wakeup_pending()) {
-		async_error = -EBUSY;
-		goto Complete;
-	}
-
 	if (dev->power.syscore || dev->power.direct_complete)
 		goto Complete;
 
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index c313b600d356..9c36b27996fc 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -28,8 +28,8 @@ bool events_check_enabled __read_mostly;
 /* First wakeup IRQ seen by the kernel in the last cycle. */
 unsigned int pm_wakeup_irq __read_mostly;
 
-/* If set and the system is suspending, terminate the suspend. */
-static bool pm_abort_suspend __read_mostly;
+/* If greater than 0 and the system is suspending, terminate the suspend. */
+static atomic_t pm_abort_suspend __read_mostly;
 
 /*
  * Combined counters of registered wakeup events and wakeup events in progress.
@@ -855,20 +855,26 @@ bool pm_wakeup_pending(void)
 		pm_print_active_wakeup_sources();
 	}
 
-	return ret || pm_abort_suspend;
+	return ret || atomic_read(&pm_abort_suspend) > 0;
 }
 
 void pm_system_wakeup(void)
 {
-	pm_abort_suspend = true;
+	atomic_inc(&pm_abort_suspend);
 	freeze_wake();
 }
 EXPORT_SYMBOL_GPL(pm_system_wakeup);
 
-void pm_wakeup_clear(void)
+void pm_system_cancel_wakeup(void)
+{
+	atomic_dec(&pm_abort_suspend);
+}
+
+void pm_wakeup_clear(bool reset)
 {
-	pm_abort_suspend = false;
 	pm_wakeup_irq = 0;
+	if (reset)
+		atomic_set(&pm_abort_suspend, 0);
 }
 
 void pm_system_irq_wakeup(unsigned int irq_number)
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 79c0af419300..63a90a624a0f 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -598,15 +598,19 @@ static inline bool acpi_device_always_present(struct acpi_device *adev)
 #endif
 
 #ifdef CONFIG_PM
+void acpi_pm_wakeup_event(struct device *dev);
 acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev,
 			void (*func)(struct acpi_device_wakeup_context *context));
 acpi_status acpi_remove_pm_notifier(struct acpi_device *adev);
 int acpi_pm_device_sleep_state(struct device *, int *, int);
 int acpi_pm_device_run_wake(struct device *, bool);
 #else
+static inline void acpi_pm_wakeup_event(struct device *dev)
+{
+}
 static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev,
 					       struct device *dev,
-				               void (*work_func)(struct work_struct *work))
+					       void (*func)(struct acpi_device_wakeup_context *context))
 {
 	return AE_SUPPORT;
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d9718378a8be..0b1cf32edfd7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -189,6 +189,8 @@ struct platform_suspend_ops {
 struct platform_freeze_ops {
 	int (*begin)(void);
 	int (*prepare)(void);
+	void (*wake)(void);
+	void (*sync)(void);
 	void (*restore)(void);
 	void (*end)(void);
 };
@@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq;
 
 extern bool pm_wakeup_pending(void);
 extern void pm_system_wakeup(void);
-extern void pm_wakeup_clear(void);
+extern void pm_system_cancel_wakeup(void);
+extern void pm_wakeup_clear(bool reset);
 extern void pm_system_irq_wakeup(unsigned int irq_number);
 extern bool pm_get_wakeup_count(unsigned int *count, bool block);
 extern bool pm_save_wakeup_count(unsigned int count);
@@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb)
 
 static inline bool pm_wakeup_pending(void) { return false; }
 static inline void pm_system_wakeup(void) {}
-static inline void pm_wakeup_clear(void) {}
+static inline void pm_wakeup_clear(bool reset) {}
 static inline void pm_system_irq_wakeup(unsigned int irq_number) {}
 
 static inline void lock_system_sleep(void) {}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c7209f060eeb..78672d324a6e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
 	if (!pm_freezing)
 		atomic_inc(&system_freezing_cnt);
 
-	pm_wakeup_clear();
+	pm_wakeup_clear(true);
 	pr_info("Freezing user space processes ... ");
 	pm_freezing = true;
 	error = try_to_freeze_tasks(true);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..3ecf275d7e44 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,6 +72,8 @@ static void freeze_begin(void)
 
 static void freeze_enter(void)
 {
+	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+
 	spin_lock_irq(&suspend_freeze_lock);
 	if (pm_wakeup_pending())
 		goto out;
@@ -84,11 +86,9 @@ static void freeze_enter(void)
 
 	/* Push all the CPUs into the idle loop. */
 	wake_up_all_idle_cpus();
-	pr_debug("PM: suspend-to-idle\n");
 	/* Make the current CPU wait so it can enter the idle loop too. */
 	wait_event(suspend_freeze_wait_head,
 		   suspend_freeze_state == FREEZE_STATE_WAKE);
-	pr_debug("PM: resume from suspend-to-idle\n");
 
 	cpuidle_pause();
 	put_online_cpus();
@@ -98,6 +98,31 @@ static void freeze_enter(void)
  out:
 	suspend_freeze_state = FREEZE_STATE_NONE;
 	spin_unlock_irq(&suspend_freeze_lock);
+
+	trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+}
+
+static void s2idle_loop(void)
+{
+	pr_debug("PM: suspend-to-idle\n");
+
+	do {
+		freeze_enter();
+
+		if (freeze_ops && freeze_ops->wake)
+			freeze_ops->wake();
+
+		dpm_resume_noirq(PMSG_RESUME);
+		if (freeze_ops && freeze_ops->sync)
+			freeze_ops->sync();
+
+		if (pm_wakeup_pending())
+			break;
+
+		pm_wakeup_clear(false);
+	} while (!dpm_suspend_noirq(PMSG_SUSPEND));
+
+	pr_debug("PM: resume from suspend-to-idle\n");
 }
 
 void freeze_wake(void)
@@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 	 * all the devices are suspended.
 	 */
 	if (state == PM_SUSPEND_FREEZE) {
-		trace_suspend_resume(TPS("machine_suspend"), state, true);
-		freeze_enter();
-		trace_suspend_resume(TPS("machine_suspend"), state, false);
-		goto Platform_wake;
+		s2idle_loop();
+		goto Platform_early_resume;
 	}
 
 	error = disable_nonboot_cpus();
-- 
cgit v1.3-14-g43fede


From cd33f5f2cbfaadc21270f3ddac7c3c33e0a1a28c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 12 Jun 2017 11:53:09 -0400
Subject: audit: make sure we never skip the multicast broadcast

When the auditd connection is reset, either intentionally or due to
a failure, any records that were in the main backlog queue would not
be sent in a multicast broadcast.  This patch fixes this problem by
not flushing the main backlog queue on a connection reset; the main
kauditd_thread() will take care of that normally.

Resolves: https://github.com/linux-audit/audit-kernel/issues/41
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e1e2b3abfb93..7cad70214b81 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -605,11 +605,10 @@ static void auditd_reset(const struct auditd_connection *ac)
 	if (ac_old)
 		call_rcu(&ac_old->rcu, auditd_conn_free);
 
-	/* flush all of the main and retry queues to the hold queue */
+	/* flush the retry queue to the hold queue, but don't touch the main
+	 * queue since we need to process that normally for multicast */
 	while ((skb = skb_dequeue(&audit_retry_queue)))
 		kauditd_hold_skb(skb);
-	while ((skb = skb_dequeue(&audit_queue)))
-		kauditd_hold_skb(skb);
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 204a2be30a7a8a8d12642f23f3fbdc8b9923b500 Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@linux-m68k.org>
Date: Wed, 7 Jun 2017 00:11:44 +0200
Subject: m68k: Remove ptrace_signal_deliver

This fixes debugger syscall restart interactions.  A debugger that
modifies the tracee's program counter is expected to set the orig_d0
pseudo register to -1, to disable a possible syscall restart.

This removes the last user of the ptrace_signal_deliver hook in the ptrace
signal handling, so remove that as well.

Signed-off-by: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/include/asm/signal.h |  5 -----
 arch/m68k/kernel/signal.c      | 16 ----------------
 include/linux/ptrace.h         |  4 ----
 kernel/signal.c                |  1 -
 4 files changed, 26 deletions(-)

(limited to 'kernel')

diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h
index 8c8ce5e1ee0e..3bc64d02ba5f 100644
--- a/arch/m68k/include/asm/signal.h
+++ b/arch/m68k/include/asm/signal.h
@@ -62,9 +62,4 @@ static inline int __gen_sigismember(sigset_t *set, int _sig)
 
 #endif /* !CONFIG_CPU_HAS_NO_BITFIELDS */
 
-#ifndef __uClinux__
-extern void ptrace_signal_deliver(void);
-#define ptrace_signal_deliver ptrace_signal_deliver
-#endif /* __uClinux__ */
-
 #endif /* _M68K_SIGNAL_H */
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 6f945bb5ffbd..e79421f5b9cd 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -109,22 +109,6 @@ int fixup_exception(struct pt_regs *regs)
 	return 1;
 }
 
-void ptrace_signal_deliver(void)
-{
-	struct pt_regs *regs = signal_pt_regs();
-	if (regs->orig_d0 < 0)
-		return;
-	switch (regs->d0) {
-	case -ERESTARTNOHAND:
-	case -ERESTARTSYS:
-	case -ERESTARTNOINTR:
-		regs->d0 = regs->orig_d0;
-		regs->orig_d0 = -1;
-		regs->pc -= 2;
-		break;
-	}
-}
-
 static inline void push_cache (unsigned long vaddr)
 {
 	/*
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 422bc2e4cb6a..9a2e04be0657 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -388,10 +388,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
 #define current_pt_regs() task_pt_regs(current)
 #endif
 
-#ifndef ptrace_signal_deliver
-#define ptrace_signal_deliver() ((void)0)
-#endif
-
 /*
  * unlike current_pt_regs(), this one is equal to task_pt_regs(current)
  * on *all* architectures; the only reason to have a per-arch definition
diff --git a/kernel/signal.c b/kernel/signal.c
index ca92bcfeb322..f0a48e5b1f0b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2092,7 +2092,6 @@ static void do_jobctl_trap(void)
 
 static int ptrace_signal(int signr, siginfo_t *info)
 {
-	ptrace_signal_deliver();
 	/*
 	 * We do not check sig_kernel_stop(signr) but set this marker
 	 * unconditionally because we do not know whether debugger will
-- 
cgit v1.3-14-g43fede


From cde50a67397c0da7d11795d4b4418384022ab8e6 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Sun, 18 Jun 2017 14:06:01 +0000
Subject: locking/rtmutex: Don't initialize lockdep when not required

pi_mutex isn't supposed to be tracked by lockdep, but just
passing NULLs for name and key will cause lockdep to spew a
warning and die, which is not what we want it to do.

Skip lockdep initialization if the caller passed NULLs for
name and key, suggesting such initialization isn't desired.

Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f5694788ad8d ("rt_mutex: Add lockdep annotations")
Link: http://lkml.kernel.org/r/20170618140548.4763-1-alexander.levin@verizon.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 43123533e9b1..78069895032a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1661,7 +1661,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name,
 	lock->waiters = RB_ROOT;
 	lock->waiters_leftmost = NULL;
 
-	debug_rt_mutex_init(lock, name, key);
+	if (name && key)
+		debug_rt_mutex_init(lock, name, key);
 }
 EXPORT_SYMBOL_GPL(__rt_mutex_init);
 
-- 
cgit v1.3-14-g43fede


From ac6424b981bce1c4bc55675c6ce11bfe1bbfa64f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 20 Jun 2017 12:06:13 +0200
Subject: sched/wait: Rename wait_queue_t => wait_queue_entry_t

Rename:

	wait_queue_t		=>	wait_queue_entry_t

'wait_queue_t' was always a slight misnomer: its name implies that it's a "queue",
but in reality it's a queue *entry*. The 'real' queue is the wait queue head,
which had to carry the name.

Start sorting this out by renaming it to 'wait_queue_entry_t'.

This also allows the real structure name 'struct __wait_queue' to
lose its double underscore and become 'struct wait_queue_entry',
which is the more canonical nomenclature for such data types.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/DocBook/kernel-hacking.tmpl          |  2 +-
 Documentation/filesystems/autofs4.txt              | 12 ++--
 block/blk-mq.c                                     |  2 +-
 block/blk-wbt.c                                    |  2 +-
 block/kyber-iosched.c                              |  8 +--
 drivers/bluetooth/btmrvl_main.c                    |  2 +-
 drivers/char/ipmi/ipmi_watchdog.c                  |  2 +-
 drivers/gpu/drm/i915/i915_gem_request.h            |  2 +-
 drivers/gpu/drm/i915/i915_sw_fence.c               | 14 ++---
 drivers/gpu/drm/i915/i915_sw_fence.h               |  2 +-
 drivers/gpu/drm/radeon/radeon.h                    |  2 +-
 drivers/gpu/drm/radeon/radeon_fence.c              |  2 +-
 drivers/gpu/vga/vgaarb.c                           |  2 +-
 drivers/infiniband/hw/i40iw/i40iw_main.c           |  2 +-
 drivers/md/bcache/btree.h                          |  2 +-
 drivers/net/ethernet/cavium/liquidio/octeon_main.h |  4 +-
 drivers/net/wireless/cisco/airo.c                  |  2 +-
 .../net/wireless/intersil/hostap/hostap_ioctl.c    |  2 +-
 drivers/net/wireless/marvell/libertas/main.c       |  2 +-
 drivers/scsi/dpt/dpti_i2o.h                        |  2 +-
 drivers/scsi/ips.c                                 | 12 ++--
 drivers/scsi/ips.h                                 |  4 +-
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c |  6 +-
 .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c |  4 +-
 drivers/staging/lustre/lnet/libcfs/debug.c         |  2 +-
 drivers/staging/lustre/lnet/libcfs/tracefile.c     |  2 +-
 drivers/staging/lustre/lnet/lnet/lib-eq.c          |  2 +-
 drivers/staging/lustre/lnet/lnet/lib-socket.c      |  2 +-
 drivers/staging/lustre/lustre/fid/fid_request.c    |  6 +-
 drivers/staging/lustre/lustre/include/lustre_lib.h |  4 +-
 drivers/staging/lustre/lustre/llite/lcommon_cl.c   |  2 +-
 .../staging/lustre/lustre/lov/lov_cl_internal.h    |  2 +-
 drivers/staging/lustre/lustre/lov/lov_object.c     |  2 +-
 drivers/staging/lustre/lustre/obdclass/lu_object.c |  6 +-
 drivers/tty/synclink_gt.c                          |  2 +-
 drivers/vfio/virqfd.c                              |  2 +-
 drivers/vhost/vhost.c                              |  2 +-
 drivers/vhost/vhost.h                              |  2 +-
 fs/autofs4/autofs_i.h                              |  2 +-
 fs/autofs4/waitq.c                                 | 18 +++---
 fs/cachefiles/internal.h                           |  2 +-
 fs/cachefiles/namei.c                              |  2 +-
 fs/cachefiles/rdwr.c                               |  2 +-
 fs/dax.c                                           |  4 +-
 fs/eventfd.c                                       |  2 +-
 fs/eventpoll.c                                     | 10 ++--
 fs/fs_pin.c                                        |  2 +-
 fs/nfs/nfs4proc.c                                  |  4 +-
 fs/nilfs2/segment.c                                |  2 +-
 fs/orangefs/orangefs-bufmap.c                      |  4 +-
 fs/reiserfs/journal.c                              |  2 +-
 fs/select.c                                        |  4 +-
 fs/signalfd.c                                      |  2 +-
 fs/userfaultfd.c                                   |  8 +--
 include/linux/blk-mq.h                             |  2 +-
 include/linux/eventfd.h                            |  4 +-
 include/linux/kvm_irqfd.h                          |  2 +-
 include/linux/pagemap.h                            |  2 +-
 include/linux/poll.h                               |  2 +-
 include/linux/vfio.h                               |  2 +-
 include/linux/wait.h                               | 67 +++++++++++-----------
 include/net/af_unix.h                              |  2 +-
 include/uapi/linux/auto_fs.h                       |  4 +-
 include/uapi/linux/auto_fs4.h                      |  4 +-
 kernel/exit.c                                      |  4 +-
 kernel/futex.c                                     |  2 +-
 kernel/sched/completion.c                          |  2 +-
 kernel/sched/core.c                                |  2 +-
 kernel/sched/wait.c                                | 42 +++++++-------
 kernel/workqueue.c                                 |  4 +-
 mm/filemap.c                                       | 10 ++--
 mm/memcontrol.c                                    |  8 +--
 mm/mempool.c                                       |  2 +-
 mm/shmem.c                                         |  2 +-
 net/9p/trans_fd.c                                  |  4 +-
 net/bluetooth/bnep/core.c                          |  2 +-
 net/bluetooth/cmtp/core.c                          |  2 +-
 net/bluetooth/hidp/core.c                          |  2 +-
 net/core/datagram.c                                |  2 +-
 net/unix/af_unix.c                                 |  4 +-
 sound/core/control.c                               |  2 +-
 sound/core/hwdep.c                                 |  2 +-
 sound/core/init.c                                  |  2 +-
 sound/core/oss/pcm_oss.c                           |  4 +-
 sound/core/pcm_lib.c                               |  2 +-
 sound/core/pcm_native.c                            |  4 +-
 sound/core/rawmidi.c                               |  8 +--
 sound/core/seq/seq_fifo.c                          |  2 +-
 sound/core/seq/seq_memory.c                        |  2 +-
 sound/core/timer.c                                 |  2 +-
 sound/isa/wavefront/wavefront_synth.c              |  2 +-
 sound/pci/mixart/mixart_core.c                     |  4 +-
 sound/pci/ymfpci/ymfpci_main.c                     |  2 +-
 virt/kvm/eventfd.c                                 |  2 +-
 94 files changed, 216 insertions(+), 213 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index da5c087462b1..c3c705591532 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &amp;ipaddress);
    certain condition is true.  They must be used carefully to ensure
    there is no race condition.  You declare a
    <type>wait_queue_head_t</type>, and then processes which want to
-   wait for that condition declare a <type>wait_queue_t</type>
+   wait for that condition declare a <type>wait_queue_entry_t</type>
    referring to themselves, and place that in the queue.
   </para>
 
diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt
index f10dd590f69f..8444dc3d57e8 100644
--- a/Documentation/filesystems/autofs4.txt
+++ b/Documentation/filesystems/autofs4.txt
@@ -316,7 +316,7 @@ For version 5, the format of the message is:
         struct autofs_v5_packet {
                 int proto_version;                /* Protocol version */
                 int type;                        /* Type of packet */
-                autofs_wqt_t wait_queue_token;
+                autofs_wqt_t wait_queue_entry_token;
                 __u32 dev;
                 __u64 ino;
                 __u32 uid;
@@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing
 `O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at
 most one packet, and any unread portion of a packet will be discarded.
 
-The `wait_queue_token` is a unique number which can identify a
+The `wait_queue_entry_token` is a unique number which can identify a
 particular request to be acknowledged.  When a message is sent over
 the pipe the affected dentry is marked as either "active" or
 "expiring" and other accesses to it block until the message is
 acknowledged using one of the ioctls below and the relevant
-`wait_queue_token`.
+`wait_queue_entry_token`.
 
 Communicating with autofs: root directory ioctls
 ------------------------------------------------
@@ -358,7 +358,7 @@ capability, or must be the automount daemon.
 The available ioctl commands are:
 
 - **AUTOFS_IOC_READY**: a notification has been handled.  The argument
-    to the ioctl command is the "wait_queue_token" number
+    to the ioctl command is the "wait_queue_entry_token" number
     corresponding to the notification being acknowledged.
 - **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with
     the error code `ENOENT`.
@@ -382,14 +382,14 @@ The available ioctl commands are:
         struct autofs_packet_expire_multi {
                 int proto_version;              /* Protocol version */
                 int type;                       /* Type of packet */
-                autofs_wqt_t wait_queue_token;
+                autofs_wqt_t wait_queue_entry_token;
                 int len;
                 char name[NAME_MAX+1];
         };
 
      is required.  This is filled in with the name of something
      that can be unmounted or removed.  If nothing can be expired,
-     `errno` is set to `EAGAIN`.  Even though a `wait_queue_token`
+     `errno` is set to `EAGAIN`.  Even though a `wait_queue_entry_token`
      is present in the structure, no "wait queue" is established
      and no acknowledgment is needed.
 - **AUTOFS_IOC_EXPIRE_MULTI**:  This is similar to
diff --git a/block/blk-mq.c b/block/blk-mq.c
index bb66c96850b1..a083f95e04b1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -926,7 +926,7 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
-static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 				void *key)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 17676f4d7fd1..5f3a37c2784c 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 }
 
 static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
-			     wait_queue_t *wait, unsigned long rw)
+			     wait_queue_entry_t *wait, unsigned long rw)
 {
 	/*
 	 * inc it here even if disabled, since we'll dec it at completion.
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b9faabc75fdb..b95d6bd714c0 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -99,7 +99,7 @@ struct kyber_hctx_data {
 	struct list_head rqs[KYBER_NUM_DOMAINS];
 	unsigned int cur_domain;
 	unsigned int batching;
-	wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
 	atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
 
@@ -507,7 +507,7 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
 	}
 }
 
-static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 			     void *key)
 {
 	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
@@ -523,7 +523,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 {
 	unsigned int sched_domain = khd->cur_domain;
 	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
-	wait_queue_t *wait = &khd->domain_wait[sched_domain];
+	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 	struct sbq_wait_state *ws;
 	int nr;
 
@@ -734,7 +734,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
 {									\
 	struct blk_mq_hw_ctx *hctx = data;				\
 	struct kyber_hctx_data *khd = hctx->sched_data;			\
-	wait_queue_t *wait = &khd->domain_wait[domain];			\
+	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
 									\
 	seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));	\
 	return 0;							\
diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index c38cb5b91291..fe850f0567cb 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c
@@ -602,7 +602,7 @@ static int btmrvl_service_main_thread(void *data)
 	struct btmrvl_thread *thread = data;
 	struct btmrvl_private *priv = thread->priv;
 	struct btmrvl_adapter *adapter = priv->adapter;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct sk_buff *skb;
 	ulong flags;
 
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index d165af8abe36..a5c6cfe71a8e 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -821,7 +821,7 @@ static ssize_t ipmi_read(struct file *file,
 			 loff_t      *ppos)
 {
 	int          rv = 0;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (count <= 0)
 		return 0;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 129c58bb4805..a4a920c4c454 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -123,7 +123,7 @@ struct drm_i915_gem_request {
 	 * It is used by the driver to then queue the request for execution.
 	 */
 	struct i915_sw_fence submit;
-	wait_queue_t submitq;
+	wait_queue_entry_t submitq;
 	wait_queue_head_t execute;
 
 	/* A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index a277f8eb7beb..8669bfa33064 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -152,7 +152,7 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
 					struct list_head *continuation)
 {
 	wait_queue_head_t *x = &fence->wait;
-	wait_queue_t *pos, *next;
+	wait_queue_entry_t *pos, *next;
 	unsigned long flags;
 
 	debug_fence_deactivate(fence);
@@ -254,7 +254,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence)
 	__i915_sw_fence_commit(fence);
 }
 
-static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key)
+static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key)
 {
 	list_del(&wq->task_list);
 	__i915_sw_fence_complete(wq->private, key);
@@ -267,7 +267,7 @@ static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *
 static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
 				    const struct i915_sw_fence * const signaler)
 {
-	wait_queue_t *wq;
+	wait_queue_entry_t *wq;
 
 	if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
 		return false;
@@ -288,7 +288,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
 
 static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
 {
-	wait_queue_t *wq;
+	wait_queue_entry_t *wq;
 
 	if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
 		return;
@@ -320,7 +320,7 @@ static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
 
 static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 					  struct i915_sw_fence *signaler,
-					  wait_queue_t *wq, gfp_t gfp)
+					  wait_queue_entry_t *wq, gfp_t gfp)
 {
 	unsigned long flags;
 	int pending;
@@ -359,7 +359,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 
 	spin_lock_irqsave(&signaler->wait.lock, flags);
 	if (likely(!i915_sw_fence_done(signaler))) {
-		__add_wait_queue_tail(&signaler->wait, wq);
+		__add_wait_queue_entry_tail(&signaler->wait, wq);
 		pending = 1;
 	} else {
 		i915_sw_fence_wake(wq, 0, 0, NULL);
@@ -372,7 +372,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 
 int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 				 struct i915_sw_fence *signaler,
-				 wait_queue_t *wq)
+				 wait_queue_entry_t *wq)
 {
 	return __i915_sw_fence_await_sw_fence(fence, signaler, wq, 0);
 }
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index d31cefbbcc04..fd3c3bf6c8b7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -66,7 +66,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence);
 
 int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 				 struct i915_sw_fence *after,
-				 wait_queue_t *wq);
+				 wait_queue_entry_t *wq);
 int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 				     struct i915_sw_fence *after,
 				     gfp_t gfp);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index c1c8e2208a21..e562a78510ff 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -375,7 +375,7 @@ struct radeon_fence {
 	unsigned		ring;
 	bool			is_vm_update;
 
-	wait_queue_t		fence_wake;
+	wait_queue_entry_t		fence_wake;
 };
 
 int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index ef09f0a63754..e86f2bd38410 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -158,7 +158,7 @@ int radeon_fence_emit(struct radeon_device *rdev,
  * for the fence locking itself, so unlocked variants are used for
  * fence_signal, and remove_wait_queue.
  */
-static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key)
+static int radeon_fence_check_signaled(wait_queue_entry_t *wait, unsigned mode, int flags, void *key)
 {
 	struct radeon_fence *fence;
 	u64 seq;
diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c
index 92f1452dad57..76875f6299b8 100644
--- a/drivers/gpu/vga/vgaarb.c
+++ b/drivers/gpu/vga/vgaarb.c
@@ -417,7 +417,7 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible)
 {
 	struct vga_device *vgadev, *conflict;
 	unsigned long flags;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int rc = 0;
 
 	vga_check_first_use();
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index a3f18a22f5ed..e0f47cc2effc 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -1939,7 +1939,7 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev,
 bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev)
 {
 	struct i40iw_device *iwdev;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	iwdev = dev->back_dev;
 
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 9b80417cd547..73da1f5626cb 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k);
 
 struct btree_op {
 	/* for waiting on btree reserve in btree_split() */
-	wait_queue_t		wait;
+	wait_queue_entry_t		wait;
 
 	/* Btree level at which we start taking write locks */
 	short			lock;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
index bed9ef17bc26..7ccffbb0019e 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h
@@ -144,7 +144,7 @@ static inline int
 sleep_cond(wait_queue_head_t *wait_queue, int *condition)
 {
 	int errno = 0;
-	wait_queue_t we;
+	wait_queue_entry_t we;
 
 	init_waitqueue_entry(&we, current);
 	add_wait_queue(wait_queue, &we);
@@ -171,7 +171,7 @@ sleep_timeout_cond(wait_queue_head_t *wait_queue,
 		   int *condition,
 		   int timeout)
 {
-	wait_queue_t we;
+	wait_queue_entry_t we;
 
 	init_waitqueue_entry(&we, current);
 	add_wait_queue(wait_queue, &we);
diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c
index 1b7e125a28e2..6a13303af2b7 100644
--- a/drivers/net/wireless/cisco/airo.c
+++ b/drivers/net/wireless/cisco/airo.c
@@ -3066,7 +3066,7 @@ static int airo_thread(void *data) {
 		if (ai->jobs) {
 			locked = down_interruptible(&ai->sem);
 		} else {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 
 			init_waitqueue_entry(&wait, current);
 			add_wait_queue(&ai->thr_wait, &wait);
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
index b2c6b065b542..ff153ce29539 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c
@@ -2544,7 +2544,7 @@ static int prism2_ioctl_priv_prism2_param(struct net_device *dev,
 			ret = -EINVAL;
 		}
 		if (local->iw_mode == IW_MODE_MASTER) {
-			wait_queue_t __wait;
+			wait_queue_entry_t __wait;
 			init_waitqueue_entry(&__wait, current);
 			add_wait_queue(&local->hostscan_wq, &__wait);
 			set_current_state(TASK_INTERRUPTIBLE);
diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c
index e3500203715c..dde065d0d5c1 100644
--- a/drivers/net/wireless/marvell/libertas/main.c
+++ b/drivers/net/wireless/marvell/libertas/main.c
@@ -453,7 +453,7 @@ static int lbs_thread(void *data)
 {
 	struct net_device *dev = data;
 	struct lbs_private *priv = dev->ml_priv;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	lbs_deb_enter(LBS_DEB_THREAD);
 
diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h
index bd9e31e16249..16fc380b5512 100644
--- a/drivers/scsi/dpt/dpti_i2o.h
+++ b/drivers/scsi/dpt/dpti_i2o.h
@@ -48,7 +48,7 @@
 #include <linux/wait.h>
 typedef wait_queue_head_t adpt_wait_queue_head_t;
 #define ADPT_DECLARE_WAIT_QUEUE_HEAD(wait) DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait)
-typedef wait_queue_t adpt_wait_queue_t;
+typedef wait_queue_entry_t adpt_wait_queue_entry_t;
 
 /*
  * message structures
diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c
index 3419e1bcdff6..67621308eb9c 100644
--- a/drivers/scsi/ips.c
+++ b/drivers/scsi/ips.c
@@ -301,13 +301,13 @@ static uint32_t ips_statupd_copperhead_memio(ips_ha_t *);
 static uint32_t ips_statupd_morpheus(ips_ha_t *);
 static ips_scb_t *ips_getscb(ips_ha_t *);
 static void ips_putq_scb_head(ips_scb_queue_t *, ips_scb_t *);
-static void ips_putq_wait_tail(ips_wait_queue_t *, struct scsi_cmnd *);
+static void ips_putq_wait_tail(ips_wait_queue_entry_t *, struct scsi_cmnd *);
 static void ips_putq_copp_tail(ips_copp_queue_t *,
 				      ips_copp_wait_item_t *);
 static ips_scb_t *ips_removeq_scb_head(ips_scb_queue_t *);
 static ips_scb_t *ips_removeq_scb(ips_scb_queue_t *, ips_scb_t *);
-static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *);
-static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *,
+static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *);
+static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *,
 					  struct scsi_cmnd *);
 static ips_copp_wait_item_t *ips_removeq_copp(ips_copp_queue_t *,
 						     ips_copp_wait_item_t *);
@@ -2871,7 +2871,7 @@ ips_removeq_scb(ips_scb_queue_t * queue, ips_scb_t * item)
 /* ASSUMED to be called from within the HA lock                             */
 /*                                                                          */
 /****************************************************************************/
-static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item)
+static void ips_putq_wait_tail(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item)
 {
 	METHOD_TRACE("ips_putq_wait_tail", 1);
 
@@ -2902,7 +2902,7 @@ static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item)
 /* ASSUMED to be called from within the HA lock                             */
 /*                                                                          */
 /****************************************************************************/
-static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue)
+static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *queue)
 {
 	struct scsi_cmnd *item;
 
@@ -2936,7 +2936,7 @@ static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue)
 /* ASSUMED to be called from within the HA lock                             */
 /*                                                                          */
 /****************************************************************************/
-static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *queue,
+static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *queue,
 					  struct scsi_cmnd *item)
 {
 	struct scsi_cmnd *p;
diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h
index b782bb60baf0..366be3b2f9b4 100644
--- a/drivers/scsi/ips.h
+++ b/drivers/scsi/ips.h
@@ -989,7 +989,7 @@ typedef struct ips_wait_queue {
 	struct scsi_cmnd *head;
 	struct scsi_cmnd *tail;
 	int count;
-} ips_wait_queue_t;
+} ips_wait_queue_entry_t;
 
 typedef struct ips_copp_wait_item {
 	struct scsi_cmnd *scsi_cmd;
@@ -1035,7 +1035,7 @@ typedef struct ips_ha {
    ips_stat_t         sp;                 /* Status packer pointer      */
    struct ips_scb    *scbs;               /* Array of all CCBS          */
    struct ips_scb    *scb_freelist;       /* SCB free list              */
-   ips_wait_queue_t   scb_waitlist;       /* Pending SCB list           */
+   ips_wait_queue_entry_t   scb_waitlist;       /* Pending SCB list           */
    ips_copp_queue_t   copp_waitlist;      /* Pending PT list            */
    ips_scb_queue_t    scb_activelist;     /* Active SCB list            */
    IPS_IO_CMD        *dummy;              /* dummy command              */
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 0db662d6abdd..85b242ec5f9b 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -3267,7 +3267,7 @@ int
 kiblnd_connd(void *arg)
 {
 	spinlock_t *lock = &kiblnd_data.kib_connd_lock;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	unsigned long flags;
 	struct kib_conn *conn;
 	int timeout;
@@ -3521,7 +3521,7 @@ kiblnd_scheduler(void *arg)
 	long id = (long)arg;
 	struct kib_sched_info *sched;
 	struct kib_conn *conn;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	unsigned long flags;
 	struct ib_wc wc;
 	int did_something;
@@ -3656,7 +3656,7 @@ kiblnd_failover_thread(void *arg)
 {
 	rwlock_t *glock = &kiblnd_data.kib_global_lock;
 	struct kib_dev *dev;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	unsigned long flags;
 	int rc;
 
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
index 3ed3b08c122c..6b38d5a8fe92 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c
@@ -2166,7 +2166,7 @@ ksocknal_connd(void *arg)
 {
 	spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock;
 	struct ksock_connreq *cr;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int nloops = 0;
 	int cons_retry = 0;
 
@@ -2554,7 +2554,7 @@ ksocknal_check_peer_timeouts(int idx)
 int
 ksocknal_reaper(void *arg)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct ksock_conn *conn;
 	struct ksock_sched *sched;
 	struct list_head enomem_conns;
diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c
index c56e9922cd5b..49deb448b044 100644
--- a/drivers/staging/lustre/lnet/libcfs/debug.c
+++ b/drivers/staging/lustre/lnet/libcfs/debug.c
@@ -361,7 +361,7 @@ static int libcfs_debug_dumplog_thread(void *arg)
 
 void libcfs_debug_dumplog(void)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct task_struct *dumper;
 
 	/* we're being careful to ensure that the kernel thread is
diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c
index 9599b7441feb..27082d2f7938 100644
--- a/drivers/staging/lustre/lnet/libcfs/tracefile.c
+++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c
@@ -990,7 +990,7 @@ static int tracefiled(void *arg)
 	complete(&tctl->tctl_start);
 
 	while (1) {
-		wait_queue_t __wait;
+		wait_queue_entry_t __wait;
 
 		pc.pc_want_daemon_pages = 0;
 		collect_pages(&pc);
diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c
index ce4b83584e17..9ebba4ef5f90 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-eq.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c
@@ -312,7 +312,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock)
 {
 	int tms = *timeout_ms;
 	int wait;
-	wait_queue_t wl;
+	wait_queue_entry_t wl;
 	unsigned long now;
 
 	if (!tms)
diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index 9fca8d225ee0..f075706bba6d 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -516,7 +516,7 @@ lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port,
 int
 lnet_sock_accept(struct socket **newsockp, struct socket *sock)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct socket *newsock;
 	int rc;
 
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
index 999f250ceed0..bf31bc200d27 100644
--- a/drivers/staging/lustre/lustre/fid/fid_request.c
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c
@@ -192,7 +192,7 @@ static int seq_client_alloc_seq(const struct lu_env *env,
 }
 
 static int seq_fid_alloc_prep(struct lu_client_seq *seq,
-			      wait_queue_t *link)
+			      wait_queue_entry_t *link)
 {
 	if (seq->lcs_update) {
 		add_wait_queue(&seq->lcs_waitq, link);
@@ -223,7 +223,7 @@ static void seq_fid_alloc_fini(struct lu_client_seq *seq)
 int seq_client_alloc_fid(const struct lu_env *env,
 			 struct lu_client_seq *seq, struct lu_fid *fid)
 {
-	wait_queue_t link;
+	wait_queue_entry_t link;
 	int rc;
 
 	LASSERT(seq);
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(seq_client_alloc_fid);
  */
 void seq_client_flush(struct lu_client_seq *seq)
 {
-	wait_queue_t link;
+	wait_queue_entry_t link;
 
 	LASSERT(seq);
 	init_waitqueue_entry(&link, current);
diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h
index b04d613846ee..f24970da8323 100644
--- a/drivers/staging/lustre/lustre/include/lustre_lib.h
+++ b/drivers/staging/lustre/lustre/include/lustre_lib.h
@@ -201,7 +201,7 @@ struct l_wait_info {
 			   sigmask(SIGALRM))
 
 /**
- * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively
+ * wait_queue_entry_t of Linux (version < 2.6.34) is a FIFO list for exclusively
  * waiting threads, which is not always desirable because all threads will
  * be waken up again and again, even user only needs a few of them to be
  * active most time. This is not good for performance because cache can
@@ -228,7 +228,7 @@ struct l_wait_info {
  */
 #define __l_wait_event(wq, condition, info, ret, l_add_wait)		   \
 do {									   \
-	wait_queue_t __wait;						 \
+	wait_queue_entry_t __wait;						 \
 	long __timeout = info->lwi_timeout;			  \
 	sigset_t   __blocked;					      \
 	int   __allow_intr = info->lwi_allow_intr;			     \
diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
index 8af611033e12..96515b839436 100644
--- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c
+++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c
@@ -207,7 +207,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
 static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
 {
 	struct lu_object_header *header = obj->co_lu.lo_header;
-	wait_queue_t	   waiter;
+	wait_queue_entry_t	   waiter;
 
 	if (unlikely(atomic_read(&header->loh_ref) != 1)) {
 		struct lu_site *site = obj->co_lu.lo_dev->ld_site;
diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
index 391c632365ae..e889d3a7de9c 100644
--- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
+++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h
@@ -370,7 +370,7 @@ struct lov_thread_info {
 	struct ost_lvb	  lti_lvb;
 	struct cl_2queue	lti_cl2q;
 	struct cl_page_list     lti_plist;
-	wait_queue_t	  lti_waiter;
+	wait_queue_entry_t	  lti_waiter;
 	struct cl_attr          lti_attr;
 };
 
diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c
index ab3ecfeeadc8..eddabbe31e5c 100644
--- a/drivers/staging/lustre/lustre/lov/lov_object.c
+++ b/drivers/staging/lustre/lustre/lov/lov_object.c
@@ -371,7 +371,7 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
 	struct lov_layout_raid0 *r0;
 	struct lu_site	  *site;
 	struct lu_site_bkt_data *bkt;
-	wait_queue_t	  *waiter;
+	wait_queue_entry_t	  *waiter;
 
 	r0  = &lov->u.raid0;
 	LASSERT(r0->lo_sub[idx] == los);
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
index abcf951208d2..76ae600ae2c8 100644
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -556,7 +556,7 @@ EXPORT_SYMBOL(lu_object_print);
 static struct lu_object *htable_lookup(struct lu_site *s,
 				       struct cfs_hash_bd *bd,
 				       const struct lu_fid *f,
-				       wait_queue_t *waiter,
+				       wait_queue_entry_t *waiter,
 				       __u64 *version)
 {
 	struct lu_site_bkt_data *bkt;
@@ -670,7 +670,7 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env,
 					    struct lu_device *dev,
 					    const struct lu_fid *f,
 					    const struct lu_object_conf *conf,
-					    wait_queue_t *waiter)
+					    wait_queue_entry_t *waiter)
 {
 	struct lu_object      *o;
 	struct lu_object      *shadow;
@@ -750,7 +750,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env,
 {
 	struct lu_site_bkt_data *bkt;
 	struct lu_object	*obj;
-	wait_queue_t	   wait;
+	wait_queue_entry_t	   wait;
 
 	while (1) {
 		obj = lu_object_find_try(env, dev, f, conf, &wait);
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 31885f20fc15..cc047de72e2a 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -184,7 +184,7 @@ static void hdlcdev_exit(struct slgt_info *info);
 struct cond_wait {
 	struct cond_wait *next;
 	wait_queue_head_t q;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	unsigned int data;
 };
 static void init_cond_wait(struct cond_wait *w, unsigned int data);
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 27c89cd5d70b..4797217e5e72 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -43,7 +43,7 @@ static void virqfd_deactivate(struct virqfd *virqfd)
 	queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
 }
 
-static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
 	unsigned long flags = (unsigned long)key;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 042030e5a035..e4613a3c362d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -165,7 +165,7 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 	add_wait_queue(wqh, &poll->wait);
 }
 
-static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
+static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync,
 			     void *key)
 {
 	struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f55671d53f28..f72095868b93 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -31,7 +31,7 @@ struct vhost_work {
 struct vhost_poll {
 	poll_table                table;
 	wait_queue_head_t        *wqh;
-	wait_queue_t              wait;
+	wait_queue_entry_t              wait;
 	struct vhost_work	  work;
 	unsigned long		  mask;
 	struct vhost_dev	 *dev;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index beef981aa54f..974f5346458a 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -83,7 +83,7 @@ struct autofs_info {
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
 	struct autofs_wait_queue *next;
-	autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_entry_token;
 	/* We use the following to see what we are waiting for */
 	struct qstr name;
 	u32 dev;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 24a58bf9ca72..7071895b0678 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n",
-		 (unsigned long) wq->wait_queue_token,
+		 (unsigned long) wq->wait_queue_entry_token,
 		 wq->name.len, wq->name.name, type);
 
 	memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
@@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 		pktsz = sizeof(*mp);
 
-		mp->wait_queue_token = wq->wait_queue_token;
+		mp->wait_queue_entry_token = wq->wait_queue_entry_token;
 		mp->len = wq->name.len;
 		memcpy(mp->name, wq->name.name, wq->name.len);
 		mp->name[wq->name.len] = '\0';
@@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 		pktsz = sizeof(*ep);
 
-		ep->wait_queue_token = wq->wait_queue_token;
+		ep->wait_queue_entry_token = wq->wait_queue_entry_token;
 		ep->len = wq->name.len;
 		memcpy(ep->name, wq->name.name, wq->name.len);
 		ep->name[wq->name.len] = '\0';
@@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 		pktsz = sizeof(*packet);
 
-		packet->wait_queue_token = wq->wait_queue_token;
+		packet->wait_queue_entry_token = wq->wait_queue_entry_token;
 		packet->len = wq->name.len;
 		memcpy(packet->name, wq->name.name, wq->name.len);
 		packet->name[wq->name.len] = '\0';
@@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 			return -ENOMEM;
 		}
 
-		wq->wait_queue_token = autofs4_next_wait_queue;
+		wq->wait_queue_entry_token = autofs4_next_wait_queue;
 		if (++autofs4_next_wait_queue == 0)
 			autofs4_next_wait_queue = 1;
 		wq->next = sbi->queues;
@@ -461,7 +461,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		}
 
 		pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
-			 (unsigned long) wq->wait_queue_token, wq->name.len,
+			 (unsigned long) wq->wait_queue_entry_token, wq->name.len,
 			 wq->name.name, notify);
 
 		/*
@@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 	} else {
 		wq->wait_ctr++;
 		pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
-			 (unsigned long) wq->wait_queue_token, wq->name.len,
+			 (unsigned long) wq->wait_queue_entry_token, wq->name.len,
 			 wq->name.name, notify);
 		mutex_unlock(&sbi->wq_mutex);
 		kfree(qstr.name);
@@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 }
 
 
-int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
+int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status)
 {
 	struct autofs_wait_queue *wq, **wql;
 
 	mutex_lock(&sbi->wq_mutex);
 	for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) {
-		if (wq->wait_queue_token == wait_queue_token)
+		if (wq->wait_queue_entry_token == wait_queue_entry_token)
 			break;
 	}
 
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 9bf90bcc56ac..54a4fcd679ed 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -97,7 +97,7 @@ struct cachefiles_cache {
  * backing file read tracking
  */
 struct cachefiles_one_read {
-	wait_queue_t			monitor;	/* link into monitored waitqueue */
+	wait_queue_entry_t			monitor;	/* link into monitored waitqueue */
 	struct page			*back_page;	/* backing file page we're waiting for */
 	struct page			*netfs_page;	/* netfs page we're going to fill */
 	struct fscache_retrieval	*op;		/* retrieval op covering this */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 41df8a27d7eb..3978b324cbca 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -204,7 +204,7 @@ wait_for_old_object:
 		wait_queue_head_t *wq;
 
 		signed long timeout = 60 * HZ;
-		wait_queue_t wait;
+		wait_queue_entry_t wait;
 		bool requeue;
 
 		/* if the object we're waiting for is queued for processing,
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index afbdc418966d..8be33b33b981 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -21,7 +21,7 @@
  * - we use this to detect read completion of backing pages
  * - the caller holds the waitqueue lock
  */
-static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
+static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 				  int sync, void *_key)
 {
 	struct cachefiles_one_read *monitor =
diff --git a/fs/dax.c b/fs/dax.c
index 2a6889b3585f..323ea481d4a8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -84,7 +84,7 @@ struct exceptional_entry_key {
 };
 
 struct wait_exceptional_entry_queue {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct exceptional_entry_key key;
 };
 
@@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
 	return wait_table + hash;
 }
 
-static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode,
 				       int sync, void *keyp)
 {
 	struct exceptional_entry_key *key = keyp;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 68b9fffcb2c8..9736df2ce89d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  * This is used to atomically remove a wait queue entry from the eventfd wait
  * queue head, and read/reset the counter value.
  */
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt)
 {
 	unsigned long flags;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5420767c9b68..5ac1cba5ef72 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -244,7 +244,7 @@ struct eppoll_entry {
 	 * Wait queue item that will be linked to the target file wait
 	 * queue head.
 	 */
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	/* The wait queue head that linked the "wait" wait queue item */
 	wait_queue_head_t *whead;
@@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p)
 	return !list_empty(p);
 }
 
-static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
 {
 	return container_of(p, struct eppoll_entry, wait);
 }
 
 /* Get the "struct epitem" from a wait queue pointer */
-static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
+static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
 {
 	return container_of(p, struct eppoll_entry, wait)->base;
 }
@@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
  * mechanism. It is called by the stored file descriptors when they
  * have events to report.
  */
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	int pwake = 0;
 	unsigned long flags;
@@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
 	u64 slack = 0;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 611b5408f6ec..7b447a245760 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 
 void pin_kill(struct fs_pin *p)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (!p) {
 		rcu_read_unlock();
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c08c46a3b8cd..be5a8f84e5bb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6372,7 +6372,7 @@ struct nfs4_lock_waiter {
 };
 
 static int
-nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key)
 {
 	int ret;
 	struct cb_notify_lock_args *cbnl = key;
@@ -6415,7 +6415,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 					   .inode = state->inode,
 					   .owner = &owner,
 					   .notified = false };
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	/* Don't bother with waitqueue if we don't expect a callback */
 	if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index febed1217b3f..775304e7f96f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 }
 
 struct nilfs_segctor_wait_request {
-	wait_queue_t	wq;
+	wait_queue_entry_t	wq;
 	__u32		seq;
 	int		err;
 	atomic_t	done;
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 83b506020718..9e37b7028ea4 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -47,7 +47,7 @@ static void run_down(struct slot_map *m)
 	if (m->c != -1) {
 		for (;;) {
 			if (likely(list_empty(&wait.task_list)))
-				__add_wait_queue_tail(&m->q, &wait);
+				__add_wait_queue_entry_tail(&m->q, &wait);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 
 			if (m->c == -1)
@@ -85,7 +85,7 @@ static int wait_for_free(struct slot_map *m)
 	do {
 		long n = left, t;
 		if (likely(list_empty(&wait.task_list)))
-			__add_wait_queue_tail_exclusive(&m->q, &wait);
+			__add_wait_queue_entry_tail_exclusive(&m->q, &wait);
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		if (m->c > 0)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 39bb1e838d8d..a11d773e5ff3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s)
 
 static void queue_log_writer(struct super_block *s)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	set_bit(J_WRITERS_QUEUED, &journal->j_state);
 
diff --git a/fs/select.c b/fs/select.c
index d6c652a31e99..5b524a977d91 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 	return table->entry++;
 }
 
-static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	struct poll_wqueues *pwq = wait->private;
 	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
@@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
 	return default_wake_function(&dummy_wait, mode, sync, key);
 }
 
-static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	struct poll_table_entry *entry;
 
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 7e3d71109f51..593b022ac11b 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand)
 	if (likely(!waitqueue_active(wqh)))
 		return;
 
-	/* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */
+	/* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */
 	wake_up_poll(wqh, POLLHUP | POLLFREE);
 }
 
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1d622f276e3a..bda64fcd8a0c 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx {
 
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
-	wait_queue_t wq;
+	wait_queue_entry_t wq;
 	struct userfaultfd_ctx *ctx;
 	bool waken;
 };
@@ -91,7 +91,7 @@ struct userfaultfd_wake_range {
 	unsigned long len;
 };
 
-static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 				     int wake_flags, void *key)
 {
 	struct userfaultfd_wake_range *range = key;
@@ -860,7 +860,7 @@ wakeup:
 static inline struct userfaultfd_wait_queue *find_userfault_in(
 		wait_queue_head_t *wqh)
 {
-	wait_queue_t *wq;
+	wait_queue_entry_t *wq;
 	struct userfaultfd_wait_queue *uwq;
 
 	VM_BUG_ON(!spin_is_locked(&wqh->lock));
@@ -1747,7 +1747,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
 static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct userfaultfd_ctx *ctx = f->private_data;
-	wait_queue_t *wq;
+	wait_queue_entry_t *wq;
 	struct userfaultfd_wait_queue *uwq;
 	unsigned long pending = 0, total = 0;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index fcd641032f8d..95ba83806c5d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,7 +33,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
-	wait_queue_t		dispatch_wait;
+	wait_queue_entry_t		dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index ff0b981f078e..9e4befd95bc7 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -37,7 +37,7 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
 ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 
 #else /* CONFIG_EVENTFD */
@@ -73,7 +73,7 @@ static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
 }
 
 static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
-						wait_queue_t *wait, __u64 *cnt)
+						wait_queue_entry_t *wait, __u64 *cnt)
 {
 	return -ENOSYS;
 }
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 0c1de05098c8..76c2fbc59f35 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -46,7 +46,7 @@ struct kvm_kernel_irqfd_resampler {
 struct kvm_kernel_irqfd {
 	/* Used for MSI fast-path */
 	struct kvm *kvm;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	/* Update side is protected by irqfds.lock */
 	struct kvm_kernel_irq_routing_entry irq_entry;
 	seqcount_t irq_entry_sc;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 316a19f6b635..e7bbd9d4dc6c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -524,7 +524,7 @@ void page_endio(struct page *page, bool is_write, int err);
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter);
+extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
 
 /*
  * Fault everything in given userspace address range in.
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 75ffc5729e4c..2889f09a1c60 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -75,7 +75,7 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 struct poll_table_entry {
 	struct file *filp;
 	unsigned long key;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	wait_queue_head_t *wait_address;
 };
 
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index edf9b2cad277..f57076b958b7 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -183,7 +183,7 @@ struct virqfd {
 	void			(*thread)(void *, void *);
 	void			*data;
 	struct work_struct	inject;
-	wait_queue_t		wait;
+	wait_queue_entry_t		wait;
 	poll_table		pt;
 	struct work_struct	shutdown;
 	struct virqfd		**pvirqfd;
diff --git a/include/linux/wait.h b/include/linux/wait.h
index db076ca7f11d..5889f0c86ff7 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -10,15 +10,18 @@
 #include <asm/current.h>
 #include <uapi/linux/wait.h>
 
-typedef struct __wait_queue wait_queue_t;
-typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key);
-int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
+typedef struct wait_queue_entry wait_queue_entry_t;
+typedef int (*wait_queue_func_t)(wait_queue_entry_t *wait, unsigned mode, int flags, void *key);
+int default_wake_function(wait_queue_entry_t *wait, unsigned mode, int flags, void *key);
 
-/* __wait_queue::flags */
+/* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
 
-struct __wait_queue {
+/*
+ * A single wait-queue entry structure:
+ */
+struct wait_queue_entry {
 	unsigned int		flags;
 	void			*private;
 	wait_queue_func_t	func;
@@ -34,7 +37,7 @@ struct wait_bit_key {
 
 struct wait_bit_queue {
 	struct wait_bit_key	key;
-	wait_queue_t		wait;
+	wait_queue_entry_t	wait;
 };
 
 struct __wait_queue_head {
@@ -55,7 +58,7 @@ struct task_struct;
 	.task_list	= { NULL, NULL } }
 
 #define DECLARE_WAITQUEUE(name, tsk)					\
-	wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+	wait_queue_entry_t name = __WAITQUEUE_INITIALIZER(name, tsk)
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {				\
 	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),		\
@@ -88,7 +91,7 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
 # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
 #endif
 
-static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
+static inline void init_waitqueue_entry(wait_queue_entry_t *q, struct task_struct *p)
 {
 	q->flags	= 0;
 	q->private	= p;
@@ -96,7 +99,7 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
 }
 
 static inline void
-init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func)
+init_waitqueue_func_entry(wait_queue_entry_t *q, wait_queue_func_t func)
 {
 	q->flags	= 0;
 	q->private	= NULL;
@@ -159,11 +162,11 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq)
 	return waitqueue_active(wq);
 }
 
-extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
-extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait);
-extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
+extern void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait);
+extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait);
+extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait);
 
-static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
+static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *new)
 {
 	list_add(&new->task_list, &head->task_list);
 }
@@ -172,27 +175,27 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
  * Used for wake-one threads:
  */
 static inline void
-__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	wait->flags |= WQ_FLAG_EXCLUSIVE;
 	__add_wait_queue(q, wait);
 }
 
-static inline void __add_wait_queue_tail(wait_queue_head_t *head,
-					 wait_queue_t *new)
+static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head,
+					       wait_queue_entry_t *new)
 {
 	list_add_tail(&new->task_list, &head->task_list);
 }
 
 static inline void
-__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	wait->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue_tail(q, wait);
+	__add_wait_queue_entry_tail(q, wait);
 }
 
 static inline void
-__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
+__remove_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *old)
 {
 	list_del(&old->task_list);
 }
@@ -249,7 +252,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	(!__builtin_constant_p(state) ||				\
 		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\
 
-extern void init_wait_entry(wait_queue_t *__wait, int flags);
+extern void init_wait_entry(wait_queue_entry_t *__wait, int flags);
 
 /*
  * The below macro ___wait_event() has an explicit shadow of the __ret
@@ -266,7 +269,7 @@ extern void init_wait_entry(wait_queue_t *__wait, int flags);
 #define ___wait_event(wq, condition, state, exclusive, ret, cmd)	\
 ({									\
 	__label__ __out;						\
-	wait_queue_t __wait;						\
+	wait_queue_entry_t __wait;						\
 	long __ret = ret;	/* explicit shadow */			\
 									\
 	init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0);	\
@@ -620,8 +623,8 @@ do {									\
 	__ret;								\
 })
 
-extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *);
-extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *);
+extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
+extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 
 #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \
 ({									\
@@ -967,17 +970,17 @@ do {									\
 /*
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
-void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
-void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
-void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
-int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
-int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
+void prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
+void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
+void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait);
+long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout);
+int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT_FUNC(name, function)				\
-	wait_queue_t name = {						\
+	wait_queue_entry_t name = {					\
 		.private	= current,				\
 		.func		= function,				\
 		.task_list	= LIST_HEAD_INIT((name).task_list),	\
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index fd60eccb59a6..75e612a45824 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -62,7 +62,7 @@ struct unix_sock {
 #define UNIX_GC_CANDIDATE	0
 #define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
-	wait_queue_t		peer_wake;
+	wait_queue_entry_t		peer_wake;
 };
 
 static inline struct unix_sock *unix_sk(const struct sock *sk)
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index aa63451ef20a..1953f8d6063b 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -26,7 +26,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	AUTOFS_PROTO_VERSION
 
 /*
- * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed
+ * The wait_queue_entry_token (autofs_wqt_t) is part of a structure which is passed
  * back to the kernel via ioctl from userspace. On architectures where 32- and
  * 64-bit userspace binaries can be executed it's important that the size of
  * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we
@@ -49,7 +49,7 @@ struct autofs_packet_hdr {
 
 struct autofs_packet_missing {
 	struct autofs_packet_hdr hdr;
-	autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_entry_token;
 	int len;
 	char name[NAME_MAX+1];
 };	
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index 7c6da423d54e..65b72d0222e7 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -108,7 +108,7 @@ enum autofs_notify {
 /* v4 multi expire (via pipe) */
 struct autofs_packet_expire_multi {
 	struct autofs_packet_hdr hdr;
-	autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_entry_token;
 	int len;
 	char name[NAME_MAX+1];
 };
@@ -123,7 +123,7 @@ union autofs_packet_union {
 /* autofs v5 common packet struct */
 struct autofs_v5_packet {
 	struct autofs_packet_hdr hdr;
-	autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_entry_token;
 	__u32 dev;
 	__u64 ino;
 	__u32 uid;
diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb0e0ec..7d694437ab44 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1004,7 +1004,7 @@ struct wait_opts {
 	int __user		*wo_stat;
 	struct rusage __user	*wo_rusage;
 
-	wait_queue_t		child_wait;
+	wait_queue_entry_t		child_wait;
 	int			notask_error;
 };
 
@@ -1541,7 +1541,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 	return 0;
 }
 
-static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
 				int sync, void *key)
 {
 	struct wait_opts *wo = container_of(wait, struct wait_opts,
diff --git a/kernel/futex.c b/kernel/futex.c
index 357348a6cf6b..d6cf71d08f21 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -225,7 +225,7 @@ struct futex_pi_state {
  * @requeue_pi_key:	the requeue_pi target futex key
  * @bitset:		bitset for the optional bitmasked wakeup
  *
- * We use this hashed waitqueue, instead of a normal wait_queue_t, so
+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 53f9558fa925..13fc5ae9bf2f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x,
 	if (!x->done) {
 		DECLARE_WAITQUEUE(wait, current);
 
-		__add_wait_queue_tail_exclusive(&x->wait, &wait);
+		__add_wait_queue_entry_tail_exclusive(&x->wait, &wait);
 		do {
 			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 326d4f88e2b1..5b36644536ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3687,7 +3687,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	exception_exit(prev_state);
 }
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
 	return try_to_wake_up(curr->private, mode, wake_flags);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index b8c84c6dee64..301ea02dede0 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -21,7 +21,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c
 
 EXPORT_SYMBOL(__init_waitqueue_head);
 
-void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	unsigned long flags;
 
@@ -32,18 +32,18 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
 }
 EXPORT_SYMBOL(add_wait_queue);
 
-void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	unsigned long flags;
 
 	wait->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue_tail(q, wait);
+	__add_wait_queue_entry_tail(q, wait);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue_exclusive);
 
-void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	unsigned long flags;
 
@@ -66,7 +66,7 @@ EXPORT_SYMBOL(remove_wait_queue);
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int wake_flags, void *key)
 {
-	wait_queue_t *curr, *next;
+	wait_queue_entry_t *curr, *next;
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
@@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * loads to move into the critical region).
  */
 void
-prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
+prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
 {
 	unsigned long flags;
 
@@ -184,20 +184,20 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
 EXPORT_SYMBOL(prepare_to_wait);
 
 void
-prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
+prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
 {
 	unsigned long flags;
 
 	wait->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
 	if (list_empty(&wait->task_list))
-		__add_wait_queue_tail(q, wait);
+		__add_wait_queue_entry_tail(q, wait);
 	set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
-void init_wait_entry(wait_queue_t *wait, int flags)
+void init_wait_entry(wait_queue_entry_t *wait, int flags)
 {
 	wait->flags = flags;
 	wait->private = current;
@@ -206,7 +206,7 @@ void init_wait_entry(wait_queue_t *wait, int flags)
 }
 EXPORT_SYMBOL(init_wait_entry);
 
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
 {
 	unsigned long flags;
 	long ret = 0;
@@ -230,7 +230,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
 	} else {
 		if (list_empty(&wait->task_list)) {
 			if (wait->flags & WQ_FLAG_EXCLUSIVE)
-				__add_wait_queue_tail(q, wait);
+				__add_wait_queue_entry_tail(q, wait);
 			else
 				__add_wait_queue(q, wait);
 		}
@@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event);
  * condition in the caller before they add the wait
  * entry to the wake queue.
  */
-int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
+int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 {
 	if (likely(list_empty(&wait->task_list)))
-		__add_wait_queue_tail(wq, wait);
+		__add_wait_queue_entry_tail(wq, wait);
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	if (signal_pending(current))
@@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait)
 }
 EXPORT_SYMBOL(do_wait_intr);
 
-int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait)
+int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 {
 	if (likely(list_empty(&wait->task_list)))
-		__add_wait_queue_tail(wq, wait);
+		__add_wait_queue_entry_tail(wq, wait);
 
 	set_current_state(TASK_INTERRUPTIBLE);
 	if (signal_pending(current))
@@ -290,7 +290,7 @@ EXPORT_SYMBOL(do_wait_intr_irq);
  * the wait descriptor from the given waitqueue if still
  * queued.
  */
-void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait)
 {
 	unsigned long flags;
 
@@ -316,7 +316,7 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 }
 EXPORT_SYMBOL(finish_wait);
 
-int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	int ret = default_wake_function(wait, mode, sync, key);
 
@@ -351,7 +351,7 @@ static inline bool is_kthread_should_stop(void)
  * remove_wait_queue(&wq, &wait);
  *
  */
-long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout)
 {
 	set_current_state(mode); /* A */
 	/*
@@ -375,7 +375,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
 }
 EXPORT_SYMBOL(wait_woken);
 
-int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	/*
 	 * Although this function is called under waitqueue lock, LOCK
@@ -391,7 +391,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
 }
 EXPORT_SYMBOL(woken_wake_function);
 
-int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
 	struct wait_bit_key *key = arg;
 	struct wait_bit_queue *wait_bit
@@ -534,7 +534,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
 	return bit_waitqueue(p, 0);
 }
 
-static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync,
+static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 				  void *arg)
 {
 	struct wait_bit_key *key = arg;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c74bf39ef764..a86688fabc55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work)
 EXPORT_SYMBOL_GPL(flush_work);
 
 struct cwt_wait {
-	wait_queue_t		wait;
+	wait_queue_entry_t	wait;
 	struct work_struct	*work;
 };
 
-static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f1be573a5e6..80c19ee81e95 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -768,10 +768,10 @@ struct wait_page_key {
 struct wait_page_queue {
 	struct page *page;
 	int bit_nr;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 };
 
-static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
+static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
 	struct wait_page_key *key = arg;
 	struct wait_page_queue *wait_page
@@ -834,7 +834,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 		struct page *page, int bit_nr, int state, bool lock)
 {
 	struct wait_page_queue wait_page;
-	wait_queue_t *wait = &wait_page.wait;
+	wait_queue_entry_t *wait = &wait_page.wait;
 	int ret = 0;
 
 	init_wait(wait);
@@ -847,7 +847,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
 		if (likely(list_empty(&wait->task_list))) {
 			if (lock)
-				__add_wait_queue_tail_exclusive(q, wait);
+				__add_wait_queue_entry_tail_exclusive(q, wait);
 			else
 				__add_wait_queue(q, wait);
 			SetPageWaiters(page);
@@ -907,7 +907,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
  *
  * Add an arbitrary @waiter to the wait queue for the nominated @page.
  */
-void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
+void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
 {
 	wait_queue_head_t *q = page_waitqueue(page);
 	unsigned long flags;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 94172089f52f..9a90b096dc6b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -170,7 +170,7 @@ struct mem_cgroup_event {
 	 */
 	poll_table pt;
 	wait_queue_head_t *wqh;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	struct work_struct remove;
 };
 
@@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
 	struct mem_cgroup *memcg;
-	wait_queue_t	wait;
+	wait_queue_entry_t	wait;
 };
 
-static int memcg_oom_wake_function(wait_queue_t *wait,
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
 	unsigned mode, int sync, void *arg)
 {
 	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
@@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work)
  *
  * Called with wqh->lock held and interrupts disabled.
  */
-static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
 			    int sync, void *key)
 {
 	struct mem_cgroup_event *event =
diff --git a/mm/mempool.c b/mm/mempool.c
index 47a659dedd44..1c0294858527 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
 	void *element;
 	unsigned long flags;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	gfp_t gfp_temp;
 
 	VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
diff --git a/mm/shmem.c b/mm/shmem.c
index e67d6ba4e98e..a6c7dece4660 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1902,7 +1902,7 @@ unlock:
  * entry unconditionally - even if something else had already woken the
  * target.
  */
-static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	int ret = default_wake_function(wait, mode, sync, key);
 	list_del_init(&wait->task_list);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 7bc2208b6cc4..dca3cdd1a014 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -95,7 +95,7 @@ enum {
 
 struct p9_poll_wait {
 	struct p9_conn *conn;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	wait_queue_head_t *wait_addr;
 };
 
@@ -522,7 +522,7 @@ error:
 	clear_bit(Wworksched, &m->wsched);
 }
 
-static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key)
+static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
 {
 	struct p9_poll_wait *pwait =
 		container_of(wait, struct p9_poll_wait, wait);
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index fbf251fef70f..5c4808b3da2d 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -484,7 +484,7 @@ static int bnep_session(void *arg)
 	struct net_device *dev = s->dev;
 	struct sock *sk = s->sock->sk;
 	struct sk_buff *skb;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	BT_DBG("");
 
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 9e59b6654126..14f7c8135c31 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -280,7 +280,7 @@ static int cmtp_session(void *arg)
 	struct cmtp_session *session = arg;
 	struct sock *sk = session->sock->sk;
 	struct sk_buff *skb;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	BT_DBG("session %p", session);
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 0bec4588c3c8..fc31161e98f2 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session)
 static int hidp_session_thread(void *arg)
 {
 	struct hidp_session *session = arg;
-	wait_queue_t ctrl_wait, intr_wait;
+	wait_queue_entry_t ctrl_wait, intr_wait;
 
 	BT_DBG("session %p", session);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index db1866f2ffcf..34678828e2bb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk)
 	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
 }
 
-static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
+static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
 				  void *key)
 {
 	unsigned long bits = (unsigned long)key;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1a0c961f4ffe..c77ced0109b7 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -343,7 +343,7 @@ found:
  * are still connected to it and there's no way to inform "a polling
  * implementation" that it should let go of a certain wait queue
  *
- * In order to propagate a wake up, a wait_queue_t of the client
+ * In order to propagate a wake up, a wait_queue_entry_t of the client
  * socket is enqueued on the peer_wait queue of the server socket
  * whose wake function does a wake_up on the ordinary client socket
  * wait queue. This connection is established whenever a write (or
@@ -352,7 +352,7 @@ found:
  * was relayed.
  */
 
-static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
 				      void *key)
 {
 	struct unix_sock *u;
diff --git a/sound/core/control.c b/sound/core/control.c
index c109b82eef4b..6362da17ac3f 100644
--- a/sound/core/control.c
+++ b/sound/core/control.c
@@ -1577,7 +1577,7 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer,
 		struct snd_ctl_event ev;
 		struct snd_kctl_event *kev;
 		while (list_empty(&ctl->events)) {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 			if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
 				err = -EAGAIN;
 				goto __end_lock;
diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c
index 9602a7e38d8a..a73baa1242be 100644
--- a/sound/core/hwdep.c
+++ b/sound/core/hwdep.c
@@ -85,7 +85,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file)
 	int major = imajor(inode);
 	struct snd_hwdep *hw;
 	int err;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (major == snd_major) {
 		hw = snd_lookup_minor_data(iminor(inode),
diff --git a/sound/core/init.c b/sound/core/init.c
index 6bda8436d765..d61d2b3cd521 100644
--- a/sound/core/init.c
+++ b/sound/core/init.c
@@ -989,7 +989,7 @@ EXPORT_SYMBOL(snd_card_file_remove);
  */
 int snd_power_wait(struct snd_card *card, unsigned int power_state)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int result = 0;
 
 	/* fastpath */
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index 36baf962f9b0..cd8b7bef8d06 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -1554,7 +1554,7 @@ static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size)
 	ssize_t result = 0;
 	snd_pcm_state_t state;
 	long res;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	runtime = substream->runtime;
 	init_waitqueue_entry(&wait, current);
@@ -2387,7 +2387,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file)
 	struct snd_pcm_oss_file *pcm_oss_file;
 	struct snd_pcm_oss_setup setup[2];
 	int nonblock;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	err = nonseekable_open(inode, file);
 	if (err < 0)
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index 5088d4b8db22..dd5254077ef7 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -1904,7 +1904,7 @@ static int wait_for_avail(struct snd_pcm_substream *substream,
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int err = 0;
 	snd_pcm_uframes_t avail = 0;
 	long wait_time, tout;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 13dec5ec93f2..faa2e2be6f2e 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -1652,7 +1652,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream,
 	struct snd_card *card;
 	struct snd_pcm_runtime *runtime;
 	struct snd_pcm_substream *s;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int result = 0;
 	int nonblock = 0;
 
@@ -2353,7 +2353,7 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file)
 static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream)
 {
 	int err;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (pcm == NULL) {
 		err = -ENODEV;
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index ab890336175f..32588ad05653 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -368,7 +368,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file)
 	int err;
 	struct snd_rawmidi *rmidi;
 	struct snd_rawmidi_file *rawmidi_file = NULL;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK)) 
 		return -EINVAL;		/* invalid combination */
@@ -1002,7 +1002,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun
 	while (count > 0) {
 		spin_lock_irq(&runtime->lock);
 		while (!snd_rawmidi_ready(substream)) {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 			if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
 				spin_unlock_irq(&runtime->lock);
 				return result > 0 ? result : -EAGAIN;
@@ -1306,7 +1306,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
 	while (count > 0) {
 		spin_lock_irq(&runtime->lock);
 		while (!snd_rawmidi_ready_append(substream, count)) {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 			if (file->f_flags & O_NONBLOCK) {
 				spin_unlock_irq(&runtime->lock);
 				return result > 0 ? result : -EAGAIN;
@@ -1338,7 +1338,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf,
 	if (file->f_flags & O_DSYNC) {
 		spin_lock_irq(&runtime->lock);
 		while (runtime->avail != runtime->buffer_size) {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 			unsigned int last_avail = runtime->avail;
 			init_waitqueue_entry(&wait, current);
 			add_wait_queue(&runtime->sleep, &wait);
diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c
index 01c4cfe30c9f..a8c2822e0198 100644
--- a/sound/core/seq/seq_fifo.c
+++ b/sound/core/seq/seq_fifo.c
@@ -179,7 +179,7 @@ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f,
 {
 	struct snd_seq_event_cell *cell;
 	unsigned long flags;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (snd_BUG_ON(!f))
 		return -EINVAL;
diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c
index d4c61ec9be13..d6e9aacdc36b 100644
--- a/sound/core/seq/seq_memory.c
+++ b/sound/core/seq/seq_memory.c
@@ -227,7 +227,7 @@ static int snd_seq_cell_alloc(struct snd_seq_pool *pool,
 	struct snd_seq_event_cell *cell;
 	unsigned long flags;
 	int err = -EAGAIN;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	if (pool == NULL)
 		return -EINVAL;
diff --git a/sound/core/timer.c b/sound/core/timer.c
index cd67d1c12cf1..884c3066b028 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -1964,7 +1964,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer,
 	spin_lock_irq(&tu->qlock);
 	while ((long)count - result >= unit) {
 		while (!tu->qused) {
-			wait_queue_t wait;
+			wait_queue_entry_t wait;
 
 			if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) {
 				err = -EAGAIN;
diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c
index 4dae9ff9ef5a..0b1e4b34b299 100644
--- a/sound/isa/wavefront/wavefront_synth.c
+++ b/sound/isa/wavefront/wavefront_synth.c
@@ -1782,7 +1782,7 @@ wavefront_should_cause_interrupt (snd_wavefront_t *dev,
 				  int val, int port, unsigned long timeout)
 
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 
 	init_waitqueue_entry(&wait, current);
 	spin_lock_irq(&dev->irq_lock);
diff --git a/sound/pci/mixart/mixart_core.c b/sound/pci/mixart/mixart_core.c
index dccf3db48fe0..8bf2ce32d4a8 100644
--- a/sound/pci/mixart/mixart_core.c
+++ b/sound/pci/mixart/mixart_core.c
@@ -239,7 +239,7 @@ int snd_mixart_send_msg(struct mixart_mgr *mgr, struct mixart_msg *request, int
 	struct mixart_msg resp;
 	u32 msg_frame = 0; /* set to 0, so it's no notification to wait for, but the answer */
 	int err;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	long timeout;
 
 	init_waitqueue_entry(&wait, current);
@@ -284,7 +284,7 @@ int snd_mixart_send_msg_wait_notif(struct mixart_mgr *mgr,
 				   struct mixart_msg *request, u32 notif_event)
 {
 	int err;
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	long timeout;
 
 	if (snd_BUG_ON(!notif_event))
diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c
index fe4ba463b57c..1114166c685c 100644
--- a/sound/pci/ymfpci/ymfpci_main.c
+++ b/sound/pci/ymfpci/ymfpci_main.c
@@ -781,7 +781,7 @@ static snd_pcm_uframes_t snd_ymfpci_capture_pointer(struct snd_pcm_substream *su
 
 static void snd_ymfpci_irq_wait(struct snd_ymfpci *chip)
 {
-	wait_queue_t wait;
+	wait_queue_entry_t wait;
 	int loops = 4;
 
 	while (loops-- > 0) {
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index a8d540398bbd..9120edf3c94b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -184,7 +184,7 @@ int __attribute__((weak)) kvm_arch_set_irq_inatomic(
  * Called with wqh->lock held and interrupts disabled
  */
 static int
-irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(wait, struct kvm_kernel_irqfd, wait);
-- 
cgit v1.3-14-g43fede


From 50816c48997af857d4bab3dca1aba90339705e96 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Mar 2017 10:33:16 +0100
Subject: sched/wait: Standardize internal naming of wait-queue entries

So the various wait-queue entry variables in include/linux/wait.h
and kernel/sched/wait.c are named in a colorfully inconsistent
way:

	wait_queue_entry_t *wait
	wait_queue_entry_t *__wait	(even in plain C code!)
	wait_queue_entry_t *q		(!)
	wait_queue_entry_t *new		(making anyone who knows C++ cringe)
	wait_queue_entry_t *old

I think part of the reason for the inconsistency is the constant
apparent confusion about what a wait queue 'head' versus 'entry' is.

( Some of the documentation talks about a 'wait descriptor', which is
  the wait-queue entry itself - further adding to the confusion. )

The most common name is 'wait', but that in itself is somewhat
ambiguous as well, as it does not really make it clear whether
it's a wait-queue entry or head.

To improve all this, name the wait-queue entry structure parameters
and variables consistently and push this naming through all of
the wait.h and wait.c code:

	struct wait_queue_entry *wq_entry

The 'wq_' prefix makes it easy to grep for, and we also use the
opportunity to move away from the typedef to a plain 'struct' naming:
in the kernel we typically reserve typedefs for cases where a
C structure is really small and somewhat opaque - such as pte_t.

Wait-queue entries are neither small nor opaque, so use the more
standard 'struct xxx_entry' list management code nomenclature instead.

( We don't touch external users, and we preserve the typedef as well
  for actual wait-queue users, to reduce unnecessary churn. )

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait.h | 84 ++++++++++++++++++++++----------------------
 kernel/sched/wait.c  | 98 ++++++++++++++++++++++++++--------------------------
 2 files changed, 91 insertions(+), 91 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5889f0c86ff7..77fdea851d8b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -11,8 +11,9 @@
 #include <uapi/linux/wait.h>
 
 typedef struct wait_queue_entry wait_queue_entry_t;
-typedef int (*wait_queue_func_t)(wait_queue_entry_t *wait, unsigned mode, int flags, void *key);
-int default_wake_function(wait_queue_entry_t *wait, unsigned mode, int flags, void *key);
+
+typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
+int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
 
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
@@ -37,7 +38,7 @@ struct wait_bit_key {
 
 struct wait_bit_queue {
 	struct wait_bit_key	key;
-	wait_queue_entry_t	wait;
+	struct wait_queue_entry	wait;
 };
 
 struct __wait_queue_head {
@@ -58,7 +59,7 @@ struct task_struct;
 	.task_list	= { NULL, NULL } }
 
 #define DECLARE_WAITQUEUE(name, tsk)					\
-	wait_queue_entry_t name = __WAITQUEUE_INITIALIZER(name, tsk)
+	struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {				\
 	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),		\
@@ -91,19 +92,19 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct
 # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
 #endif
 
-static inline void init_waitqueue_entry(wait_queue_entry_t *q, struct task_struct *p)
+static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
 {
-	q->flags	= 0;
-	q->private	= p;
-	q->func		= default_wake_function;
+	wq_entry->flags		= 0;
+	wq_entry->private	= p;
+	wq_entry->func		= default_wake_function;
 }
 
 static inline void
-init_waitqueue_func_entry(wait_queue_entry_t *q, wait_queue_func_t func)
+init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
 {
-	q->flags	= 0;
-	q->private	= NULL;
-	q->func		= func;
+	wq_entry->flags		= 0;
+	wq_entry->private	= NULL;
+	wq_entry->func		= func;
 }
 
 /**
@@ -162,42 +163,41 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq)
 	return waitqueue_active(wq);
 }
 
-extern void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait);
-extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait);
-extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait);
+extern void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
+extern void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
 
-static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *new)
+static inline void __add_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
 {
-	list_add(&new->task_list, &head->task_list);
+	list_add(&wq_entry->task_list, &head->task_list);
 }
 
 /*
  * Used for wake-one threads:
  */
 static inline void
-__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
+__add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue(q, wait);
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(q, wq_entry);
 }
 
-static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head,
-					 wait_queue_entry_t *new)
+static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
 {
-	list_add_tail(&new->task_list, &head->task_list);
+	list_add_tail(&wq_entry->task_list, &head->task_list);
 }
 
 static inline void
-__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
+__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue_entry_tail(q, wait);
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue_entry_tail(q, wq_entry);
 }
 
 static inline void
-__remove_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *old)
+__remove_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
 {
-	list_del(&old->task_list);
+	list_del(&wq_entry->task_list);
 }
 
 typedef int wait_bit_action_f(struct wait_bit_key *, int mode);
@@ -252,7 +252,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	(!__builtin_constant_p(state) ||				\
 		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\
 
-extern void init_wait_entry(wait_queue_entry_t *__wait, int flags);
+extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
 
 /*
  * The below macro ___wait_event() has an explicit shadow of the __ret
@@ -269,12 +269,12 @@ extern void init_wait_entry(wait_queue_entry_t *__wait, int flags);
 #define ___wait_event(wq, condition, state, exclusive, ret, cmd)	\
 ({									\
 	__label__ __out;						\
-	wait_queue_entry_t __wait;						\
+	struct wait_queue_entry __wq_entry;				\
 	long __ret = ret;	/* explicit shadow */			\
 									\
-	init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0);	\
+	init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);\
 	for (;;) {							\
-		long __int = prepare_to_wait_event(&wq, &__wait, state);\
+		long __int = prepare_to_wait_event(&wq, &__wq_entry, state);\
 									\
 		if (condition)						\
 			break;						\
@@ -286,7 +286,7 @@ extern void init_wait_entry(wait_queue_entry_t *__wait, int flags);
 									\
 		cmd;							\
 	}								\
-	finish_wait(&wq, &__wait);					\
+	finish_wait(&wq, &__wq_entry);					\
 __out:	__ret;								\
 })
 
@@ -970,17 +970,17 @@ do {									\
 /*
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
-void prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
-void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state);
-void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait);
-long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout);
-int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
-int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
-int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key);
+void prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
+long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
+void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT_FUNC(name, function)				\
-	wait_queue_entry_t name = {					\
+	struct wait_queue_entry name = {				\
 		.private	= current,				\
 		.func		= function,				\
 		.task_list	= LIST_HEAD_INIT((name).task_list),	\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 301ea02dede0..c37b3140763e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -21,34 +21,34 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c
 
 EXPORT_SYMBOL(__init_waitqueue_head);
 
-void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait)
+void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
-	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue(q, wait);
+	__add_wait_queue(q, wq_entry);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue);
 
-void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait)
+void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue_entry_tail(q, wait);
+	__add_wait_queue_entry_tail(q, wq_entry);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue_exclusive);
 
-void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait)
+void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__remove_wait_queue(q, wait);
+	__remove_wait_queue(q, wq_entry);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(remove_wait_queue);
@@ -170,43 +170,43 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * loads to move into the critical region).
  */
 void
-prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
+prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 
-	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
+	wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
-	if (list_empty(&wait->task_list))
-		__add_wait_queue(q, wait);
+	if (list_empty(&wq_entry->task_list))
+		__add_wait_queue(q, wq_entry);
 	set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait);
 
 void
-prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
+prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&q->lock, flags);
-	if (list_empty(&wait->task_list))
-		__add_wait_queue_entry_tail(q, wait);
+	if (list_empty(&wq_entry->task_list))
+		__add_wait_queue_entry_tail(q, wq_entry);
 	set_current_state(state);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
-void init_wait_entry(wait_queue_entry_t *wait, int flags)
+void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 {
-	wait->flags = flags;
-	wait->private = current;
-	wait->func = autoremove_wake_function;
-	INIT_LIST_HEAD(&wait->task_list);
+	wq_entry->flags = flags;
+	wq_entry->private = current;
+	wq_entry->func = autoremove_wake_function;
+	INIT_LIST_HEAD(&wq_entry->task_list);
 }
 EXPORT_SYMBOL(init_wait_entry);
 
-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state)
+long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 	long ret = 0;
@@ -225,14 +225,14 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int s
 		 * can't see us, it should wake up another exclusive waiter if
 		 * we fail.
 		 */
-		list_del_init(&wait->task_list);
+		list_del_init(&wq_entry->task_list);
 		ret = -ERESTARTSYS;
 	} else {
-		if (list_empty(&wait->task_list)) {
-			if (wait->flags & WQ_FLAG_EXCLUSIVE)
-				__add_wait_queue_entry_tail(q, wait);
+		if (list_empty(&wq_entry->task_list)) {
+			if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
+				__add_wait_queue_entry_tail(q, wq_entry);
 			else
-				__add_wait_queue(q, wait);
+				__add_wait_queue(q, wq_entry);
 		}
 		set_current_state(state);
 	}
@@ -284,13 +284,13 @@ EXPORT_SYMBOL(do_wait_intr_irq);
 /**
  * finish_wait - clean up after waiting in a queue
  * @q: waitqueue waited on
- * @wait: wait descriptor
+ * @wq_entry: wait descriptor
  *
  * Sets current thread back to running state and removes
  * the wait descriptor from the given waitqueue if still
  * queued.
  */
-void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait)
+void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
@@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait)
 	 *    have _one_ other CPU that looks at or modifies
 	 *    the list).
 	 */
-	if (!list_empty_careful(&wait->task_list)) {
+	if (!list_empty_careful(&wq_entry->task_list)) {
 		spin_lock_irqsave(&q->lock, flags);
-		list_del_init(&wait->task_list);
+		list_del_init(&wq_entry->task_list);
 		spin_unlock_irqrestore(&q->lock, flags);
 	}
 }
 EXPORT_SYMBOL(finish_wait);
 
-int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
 {
-	int ret = default_wake_function(wait, mode, sync, key);
+	int ret = default_wake_function(wq_entry, mode, sync, key);
 
 	if (ret)
-		list_del_init(&wait->task_list);
+		list_del_init(&wq_entry->task_list);
 	return ret;
 }
 EXPORT_SYMBOL(autoremove_wake_function);
@@ -341,17 +341,17 @@ static inline bool is_kthread_should_stop(void)
  *
  *     p->state = mode;				condition = true;
  *     smp_mb(); // A				smp_wmb(); // C
- *     if (!wait->flags & WQ_FLAG_WOKEN)	wait->flags |= WQ_FLAG_WOKEN;
+ *     if (!wq_entry->flags & WQ_FLAG_WOKEN)	wq_entry->flags |= WQ_FLAG_WOKEN;
  *         schedule()				try_to_wake_up();
  *     p->state = TASK_RUNNING;		    ~~~~~~~~~~~~~~~~~~
- *     wait->flags &= ~WQ_FLAG_WOKEN;		condition = true;
+ *     wq_entry->flags &= ~WQ_FLAG_WOKEN;		condition = true;
  *     smp_mb() // B				smp_wmb(); // C
- *						wait->flags |= WQ_FLAG_WOKEN;
+ *						wq_entry->flags |= WQ_FLAG_WOKEN;
  * }
  * remove_wait_queue(&wq, &wait);
  *
  */
-long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout)
+long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
 {
 	set_current_state(mode); /* A */
 	/*
@@ -359,7 +359,7 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout)
 	 * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
 	 * also observe all state before the wakeup.
 	 */
-	if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+	if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
 		timeout = schedule_timeout(timeout);
 	__set_current_state(TASK_RUNNING);
 
@@ -369,13 +369,13 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout)
 	 * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
 	 * an event.
 	 */
-	smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+	smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */
 
 	return timeout;
 }
 EXPORT_SYMBOL(wait_woken);
 
-int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
+int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key)
 {
 	/*
 	 * Although this function is called under waitqueue lock, LOCK
@@ -385,24 +385,24 @@ int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void
 	 * and is paired with smp_store_mb() in wait_woken().
 	 */
 	smp_wmb(); /* C */
-	wait->flags |= WQ_FLAG_WOKEN;
+	wq_entry->flags |= WQ_FLAG_WOKEN;
 
-	return default_wake_function(wait, mode, sync, key);
+	return default_wake_function(wq_entry, mode, sync, key);
 }
 EXPORT_SYMBOL(woken_wake_function);
 
-int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
 {
 	struct wait_bit_key *key = arg;
 	struct wait_bit_queue *wait_bit
-		= container_of(wait, struct wait_bit_queue, wait);
+		= container_of(wq_entry, struct wait_bit_queue, wait);
 
 	if (wait_bit->key.flags != key->flags ||
 			wait_bit->key.bit_nr != key->bit_nr ||
 			test_bit(key->bit_nr, key->flags))
 		return 0;
 	else
-		return autoremove_wake_function(wait, mode, sync, key);
+		return autoremove_wake_function(wq_entry, mode, sync, key);
 }
 EXPORT_SYMBOL(wake_bit_function);
 
@@ -534,19 +534,19 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
 	return bit_waitqueue(p, 0);
 }
 
-static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync,
+static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
 				  void *arg)
 {
 	struct wait_bit_key *key = arg;
 	struct wait_bit_queue *wait_bit
-		= container_of(wait, struct wait_bit_queue, wait);
+		= container_of(wq_entry, struct wait_bit_queue, wait);
 	atomic_t *val = key->flags;
 
 	if (wait_bit->key.flags != key->flags ||
 	    wait_bit->key.bit_nr != key->bit_nr ||
 	    atomic_read(val) != 0)
 		return 0;
-	return autoremove_wake_function(wait, mode, sync, key);
+	return autoremove_wake_function(wq_entry, mode, sync, key);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 9d9d676f595b5081326be7a17dc681fcb38fb3b2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Mar 2017 11:10:18 +0100
Subject: sched/wait: Standardize internal naming of wait-queue heads

The wait-queue head parameters and variables are named in a
couple of different ways; we currently have the following variants:

	wait_queue_head_t *q
	wait_queue_head_t *wq
	wait_queue_head_t *head

In particular, the 'wq' naming is ambiguous as to whether it refers to
a wait-queue head or an entry - as entries were often named 'wait'.

( Not to mention the confusion of any readers coming over from
  workqueue-land. )

Standardize all this around a single, unambiguous parameter and
variable name:

	struct wait_queue_head *wq_head

which is easy to grep for and also rhymes nicely with the wait-queue
entry naming:

	struct wait_queue_entry *wq_entry

Also rename:

	struct __wait_queue_head => struct wait_queue_head

... and use this struct type to migrate from typedef usage to 'struct'
usage, which is more in line with existing kernel practices.

Don't touch any external users and preserve the main wait_queue_head_t
typedef.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait.h |  76 ++++++++++++-------------
 kernel/sched/wait.c  | 154 +++++++++++++++++++++++++--------------------------
 2 files changed, 115 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 77fdea851d8b..c3d1cefc7853 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -41,11 +41,11 @@ struct wait_bit_queue {
 	struct wait_queue_entry	wait;
 };
 
-struct __wait_queue_head {
+struct wait_queue_head {
 	spinlock_t		lock;
 	struct list_head	task_list;
 };
-typedef struct __wait_queue_head wait_queue_head_t;
+typedef struct wait_queue_head wait_queue_head_t;
 
 struct task_struct;
 
@@ -66,7 +66,7 @@ struct task_struct;
 	.task_list	= { &(name).task_list, &(name).task_list } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
-	wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
+	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
 
 #define __WAIT_BIT_KEY_INITIALIZER(word, bit)				\
 	{ .flags = word, .bit_nr = bit, }
@@ -74,20 +74,20 @@ struct task_struct;
 #define __WAIT_ATOMIC_T_KEY_INITIALIZER(p)				\
 	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
 
-extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
+extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);
 
-#define init_waitqueue_head(q)				\
+#define init_waitqueue_head(wq_head)			\
 	do {						\
 		static struct lock_class_key __key;	\
 							\
-		__init_waitqueue_head((q), #q, &__key);	\
+		__init_waitqueue_head((wq_head), #wq_head, &__key);	\
 	} while (0)
 
 #ifdef CONFIG_LOCKDEP
 # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
 	({ init_waitqueue_head(&name); name; })
 # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
-	wait_queue_head_t name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
+	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
 #else
 # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
 #endif
@@ -109,14 +109,14 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f
 
 /**
  * waitqueue_active -- locklessly test for waiters on the queue
- * @q: the waitqueue to test for waiters
+ * @wq_head: the waitqueue to test for waiters
  *
  * returns true if the wait list is not empty
  *
  * NOTE: this function is lockless and requires care, incorrect usage _will_
  * lead to sporadic and non-obvious failure.
  *
- * Use either while holding wait_queue_head_t::lock or when used for wakeups
+ * Use either while holding wait_queue_head::lock or when used for wakeups
  * with an extra smp_mb() like:
  *
  *      CPU0 - waker                    CPU1 - waiter
@@ -137,9 +137,9 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f
  * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
  * which (when the lock is uncontended) are of roughly equal cost.
  */
-static inline int waitqueue_active(wait_queue_head_t *q)
+static inline int waitqueue_active(struct wait_queue_head *wq_head)
 {
-	return !list_empty(&q->task_list);
+	return !list_empty(&wq_head->task_list);
 }
 
 /**
@@ -150,7 +150,7 @@ static inline int waitqueue_active(wait_queue_head_t *q)
  *
  * Please refer to the comment for waitqueue_active.
  */
-static inline bool wq_has_sleeper(wait_queue_head_t *wq)
+static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 {
 	/*
 	 * We need to be sure we are in sync with the
@@ -160,62 +160,62 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq)
 	 * waiting side.
 	 */
 	smp_mb();
-	return waitqueue_active(wq);
+	return waitqueue_active(wq_head);
 }
 
-extern void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
-extern void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
-extern void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
-static inline void __add_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
+static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-	list_add(&wq_entry->task_list, &head->task_list);
+	list_add(&wq_entry->task_list, &wq_head->task_list);
 }
 
 /*
  * Used for wake-one threads:
  */
 static inline void
-__add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue(q, wq_entry);
+	__add_wait_queue(wq_head, wq_entry);
 }
 
-static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
+static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-	list_add_tail(&wq_entry->task_list, &head->task_list);
+	list_add_tail(&wq_entry->task_list, &wq_head->task_list);
 }
 
 static inline void
-__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue_entry_tail(q, wq_entry);
+	__add_wait_queue_entry_tail(wq_head, wq_entry);
 }
 
 static inline void
-__remove_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry)
+__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	list_del(&wq_entry->task_list);
 }
 
 typedef int wait_bit_action_f(struct wait_bit_key *, int mode);
-void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
-void __wake_up_bit(wait_queue_head_t *, void *, int);
-int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
-int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
+void __wake_up_bit(struct wait_queue_head *, void *, int);
+int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
+int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
 void wake_up_bit(void *, int);
 void wake_up_atomic_t(atomic_t *);
 int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
 int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long);
 int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned);
 int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned);
-wait_queue_head_t *bit_waitqueue(void *, int);
+struct wait_queue_head *bit_waitqueue(void *, int);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -970,10 +970,10 @@ do {									\
 /*
  * Waitqueues which are removed from the waitqueue_head at wakeup time
  */
-void prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
-void prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
-long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state);
-void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry);
+void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
 int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
 int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c37b3140763e..203aeea96f16 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -12,44 +12,44 @@
 #include <linux/hash.h>
 #include <linux/kthread.h>
 
-void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
+void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
 {
-	spin_lock_init(&q->lock);
-	lockdep_set_class_and_name(&q->lock, key, name);
-	INIT_LIST_HEAD(&q->task_list);
+	spin_lock_init(&wq_head->lock);
+	lockdep_set_class_and_name(&wq_head->lock, key, name);
+	INIT_LIST_HEAD(&wq_head->task_list);
 }
 
 EXPORT_SYMBOL(__init_waitqueue_head);
 
-void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
 	wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue_entry_tail(q, wq_entry);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue_entry_tail(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue);
 
-void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
-	spin_lock_irqsave(&q->lock, flags);
-	__add_wait_queue_entry_tail(q, wq_entry);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__add_wait_queue_entry_tail(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(add_wait_queue_exclusive);
 
-void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&q->lock, flags);
-	__remove_wait_queue(q, wq_entry);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__remove_wait_queue(wq_head, wq_entry);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
@@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_entry_t *curr, *next;
 
-	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
+	list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) {
 		unsigned flags = curr->flags;
 
 		if (curr->func(curr, mode, wake_flags, key) &&
@@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 
 /**
  * __wake_up - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
@@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
  */
-void __wake_up(wait_queue_head_t *q, unsigned int mode,
+void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(__wake_up);
 
 /*
  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
  */
-void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
+void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(q, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(q, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
- * @q: the waitqueue
+ * @wq_head: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: opaque value to be passed to wakeup targets
@@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key);
  * It may be assumed that this function implies a write memory barrier before
  * changing the task state if and only if any tasks are woken up.
  */
-void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
-	if (unlikely(!q))
+	if (unlikely(!wq_head))
 		return;
 
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
+	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
 /*
  * __wake_up_sync - see __wake_up_sync_key()
  */
-void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
 {
-	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
+	__wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
@@ -170,30 +170,30 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * loads to move into the critical region).
  */
 void
-prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
+prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 
 	wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
-	spin_lock_irqsave(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
 	if (list_empty(&wq_entry->task_list))
-		__add_wait_queue(q, wq_entry);
+		__add_wait_queue(wq_head, wq_entry);
 	set_current_state(state);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait);
 
 void
-prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
+prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
-	spin_lock_irqsave(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
 	if (list_empty(&wq_entry->task_list))
-		__add_wait_queue_entry_tail(q, wq_entry);
+		__add_wait_queue_entry_tail(wq_head, wq_entry);
 	set_current_state(state);
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 }
 EXPORT_SYMBOL(prepare_to_wait_exclusive);
 
@@ -206,12 +206,12 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 }
 EXPORT_SYMBOL(init_wait_entry);
 
-long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state)
+long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state)
 {
 	unsigned long flags;
 	long ret = 0;
 
-	spin_lock_irqsave(&q->lock, flags);
+	spin_lock_irqsave(&wq_head->lock, flags);
 	if (unlikely(signal_pending_state(state, current))) {
 		/*
 		 * Exclusive waiter must not fail if it was selected by wakeup,
@@ -219,7 +219,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent
 		 *
 		 * The caller will recheck the condition and return success if
 		 * we were already woken up, we can not miss the event because
-		 * wakeup locks/unlocks the same q->lock.
+		 * wakeup locks/unlocks the same wq_head->lock.
 		 *
 		 * But we need to ensure that set-condition + wakeup after that
 		 * can't see us, it should wake up another exclusive waiter if
@@ -230,13 +230,13 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent
 	} else {
 		if (list_empty(&wq_entry->task_list)) {
 			if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
-				__add_wait_queue_entry_tail(q, wq_entry);
+				__add_wait_queue_entry_tail(wq_head, wq_entry);
 			else
-				__add_wait_queue(q, wq_entry);
+				__add_wait_queue(wq_head, wq_entry);
 		}
 		set_current_state(state);
 	}
-	spin_unlock_irqrestore(&q->lock, flags);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
 
 	return ret;
 }
@@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq);
 
 /**
  * finish_wait - clean up after waiting in a queue
- * @q: waitqueue waited on
+ * @wq_head: waitqueue waited on
  * @wq_entry: wait descriptor
  *
  * Sets current thread back to running state and removes
  * the wait descriptor from the given waitqueue if still
  * queued.
  */
-void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
+void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
 
@@ -309,9 +309,9 @@ void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry)
 	 *    the list).
 	 */
 	if (!list_empty_careful(&wq_entry->task_list)) {
-		spin_lock_irqsave(&q->lock, flags);
+		spin_lock_irqsave(&wq_head->lock, flags);
 		list_del_init(&wq_entry->task_list);
-		spin_unlock_irqrestore(&q->lock, flags);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
 EXPORT_SYMBOL(finish_wait);
@@ -334,7 +334,7 @@ static inline bool is_kthread_should_stop(void)
 /*
  * DEFINE_WAIT_FUNC(wait, woken_wake_func);
  *
- * add_wait_queue(&wq, &wait);
+ * add_wait_queue(&wq_head, &wait);
  * for (;;) {
  *     if (condition)
  *         break;
@@ -348,7 +348,7 @@ static inline bool is_kthread_should_stop(void)
  *     smp_mb() // B				smp_wmb(); // C
  *						wq_entry->flags |= WQ_FLAG_WOKEN;
  * }
- * remove_wait_queue(&wq, &wait);
+ * remove_wait_queue(&wq_head, &wait);
  *
  */
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
@@ -412,17 +412,17 @@ EXPORT_SYMBOL(wake_bit_function);
  * permitted return codes. Nonzero return codes halt waiting and return.
  */
 int __sched
-__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 	      wait_bit_action_f *action, unsigned mode)
 {
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq, &q->wait, mode);
+		prepare_to_wait(wq_head, &q->wait, mode);
 		if (test_bit(q->key.bit_nr, q->key.flags))
 			ret = (*action)(&q->key, mode);
 	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
-	finish_wait(wq, &q->wait);
+	finish_wait(wq_head, &q->wait);
 	return ret;
 }
 EXPORT_SYMBOL(__wait_on_bit);
@@ -430,10 +430,10 @@ EXPORT_SYMBOL(__wait_on_bit);
 int __sched out_of_line_wait_on_bit(void *word, int bit,
 				    wait_bit_action_f *action, unsigned mode)
 {
-	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
 	DEFINE_WAIT_BIT(wait, word, bit);
 
-	return __wait_on_bit(wq, &wait, action, mode);
+	return __wait_on_bit(wq_head, &wait, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit);
 
@@ -441,36 +441,36 @@ int __sched out_of_line_wait_on_bit_timeout(
 	void *word, int bit, wait_bit_action_f *action,
 	unsigned mode, unsigned long timeout)
 {
-	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
 	DEFINE_WAIT_BIT(wait, word, bit);
 
 	wait.key.timeout = jiffies + timeout;
-	return __wait_on_bit(wq, &wait, action, mode);
+	return __wait_on_bit(wq_head, &wait, action, mode);
 }
 EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
 
 int __sched
-__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 			wait_bit_action_f *action, unsigned mode)
 {
 	int ret = 0;
 
 	for (;;) {
-		prepare_to_wait_exclusive(wq, &q->wait, mode);
+		prepare_to_wait_exclusive(wq_head, &q->wait, mode);
 		if (test_bit(q->key.bit_nr, q->key.flags)) {
 			ret = action(&q->key, mode);
 			/*
 			 * See the comment in prepare_to_wait_event().
-			 * finish_wait() does not necessarily takes wq->lock,
+			 * finish_wait() does not necessarily takes wwq_head->lock,
 			 * but test_and_set_bit() implies mb() which pairs with
 			 * smp_mb__after_atomic() before wake_up_page().
 			 */
 			if (ret)
-				finish_wait(wq, &q->wait);
+				finish_wait(wq_head, &q->wait);
 		}
 		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
 			if (!ret)
-				finish_wait(wq, &q->wait);
+				finish_wait(wq_head, &q->wait);
 			return 0;
 		} else if (ret) {
 			return ret;
@@ -482,18 +482,18 @@ EXPORT_SYMBOL(__wait_on_bit_lock);
 int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
 					 wait_bit_action_f *action, unsigned mode)
 {
-	wait_queue_head_t *wq = bit_waitqueue(word, bit);
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
 	DEFINE_WAIT_BIT(wait, word, bit);
 
-	return __wait_on_bit_lock(wq, &wait, action, mode);
+	return __wait_on_bit_lock(wq_head, &wait, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
 
-void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
 {
 	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
-	if (waitqueue_active(wq))
-		__wake_up(wq, TASK_NORMAL, 1, &key);
+	if (waitqueue_active(wq_head))
+		__wake_up(wq_head, TASK_NORMAL, 1, &key);
 }
 EXPORT_SYMBOL(__wake_up_bit);
 
@@ -555,20 +555,20 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
  * return codes halt waiting and return.
  */
 static __sched
-int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
+int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 		       int (*action)(atomic_t *), unsigned mode)
 {
 	atomic_t *val;
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq, &q->wait, mode);
+		prepare_to_wait(wq_head, &q->wait, mode);
 		val = q->key.flags;
 		if (atomic_read(val) == 0)
 			break;
 		ret = (*action)(val);
 	} while (!ret && atomic_read(val) != 0);
-	finish_wait(wq, &q->wait);
+	finish_wait(wq_head, &q->wait);
 	return ret;
 }
 
@@ -586,10 +586,10 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
 __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
 					 unsigned mode)
 {
-	wait_queue_head_t *wq = atomic_t_waitqueue(p);
+	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
 	DEFINE_WAIT_ATOMIC_T(wait, p);
 
-	return __wait_on_atomic_t(wq, &wait, action, mode);
+	return __wait_on_atomic_t(wq_head, &wait, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
-- 
cgit v1.3-14-g43fede


From 2141713616c652aeabf2dd5c1e89bc601c4fed6a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Mar 2017 11:25:39 +0100
Subject: sched/wait: Standardize 'struct wait_bit_queue' wait-queue entry
 field name

Rename 'struct wait_bit_queue::wait' to ::wq_entry, to more clearly
name it as a wait-queue entry.

Propagate it to a couple of usage sites where the wait-bit-queue internals
are exposed.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/inode.c           |  8 ++++----
 fs/jbd2/journal.c    |  4 ++--
 fs/xfs/xfs_icache.c  |  4 ++--
 fs/xfs/xfs_inode.c   |  8 ++++----
 include/linux/wait.h |  6 +++---
 kernel/sched/wait.c  | 41 ++++++++++++++++++++---------------------
 6 files changed, 35 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/fs/inode.c b/fs/inode.c
index db5914783a71..70761d6cafcd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1891,11 +1891,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	wait_queue_head_t *wq;
 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
-	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_hash_lock);
 	schedule();
-	finish_wait(wq, &wait.wait);
+	finish_wait(wq, &wait.wq_entry);
 	spin_lock(&inode_hash_lock);
 }
 
@@ -2038,11 +2038,11 @@ static void __inode_dio_wait(struct inode *inode)
 	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
 
 	do {
-		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&inode->i_dio_count))
 			schedule();
 	} while (atomic_read(&inode->i_dio_count));
-	finish_wait(wq, &q.wait);
+	finish_wait(wq, &q.wq_entry);
 }
 
 /**
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ebad34266bcf..7d5ef3bf3f3e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2579,10 +2579,10 @@ restart:
 		wait_queue_head_t *wq;
 		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
-		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 		spin_unlock(&journal->j_list_lock);
 		schedule();
-		finish_wait(wq, &wait.wait);
+		finish_wait(wq, &wait.wq_entry);
 		goto restart;
 	}
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 990210fcb9c3..b9c12e1cc23a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -269,12 +269,12 @@ xfs_inew_wait(
 	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
 
 	do {
-		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 		if (!xfs_iflags_test(ip, XFS_INEW))
 			break;
 		schedule();
 	} while (true);
-	finish_wait(wq, &wait.wait);
+	finish_wait(wq, &wait.wq_entry);
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ec9826c56500..c0a1e840a588 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -622,12 +622,12 @@ __xfs_iflock(
 	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
 
 	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 		if (xfs_isiflocked(ip))
 			io_schedule();
 	} while (!xfs_iflock_nowait(ip));
 
-	finish_wait(wq, &wait.wait);
+	finish_wait(wq, &wait.wq_entry);
 }
 
 STATIC uint
@@ -2486,11 +2486,11 @@ __xfs_iunpin_wait(
 	xfs_iunpin(ip);
 
 	do {
-		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 		if (xfs_ipincount(ip))
 			io_schedule();
 	} while (xfs_ipincount(ip));
-	finish_wait(wq, &wait.wait);
+	finish_wait(wq, &wait.wq_entry);
 }
 
 void
diff --git a/include/linux/wait.h b/include/linux/wait.h
index c3d1cefc7853..1c8add685f22 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -38,7 +38,7 @@ struct wait_bit_key {
 
 struct wait_bit_queue {
 	struct wait_bit_key	key;
-	struct wait_queue_entry	wait;
+	struct wait_queue_entry	wq_entry;
 };
 
 struct wait_queue_head {
@@ -991,11 +991,11 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 #define DEFINE_WAIT_BIT(name, word, bit)				\
 	struct wait_bit_queue name = {					\
 		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),		\
-		.wait	= {						\
+		.wq_entry = {						\
 			.private	= current,			\
 			.func		= wake_bit_function,		\
 			.task_list	=				\
-				LIST_HEAD_INIT((name).wait.task_list),	\
+				LIST_HEAD_INIT((name).wq_entry.task_list), \
 		},							\
 	}
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 203aeea96f16..f1ba0625b8be 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -395,7 +395,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 {
 	struct wait_bit_key *key = arg;
 	struct wait_bit_queue *wait_bit
-		= container_of(wq_entry, struct wait_bit_queue, wait);
+		= container_of(wq_entry, struct wait_bit_queue, wq_entry);
 
 	if (wait_bit->key.flags != key->flags ||
 			wait_bit->key.bit_nr != key->bit_nr ||
@@ -418,11 +418,11 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq_head, &q->wait, mode);
+		prepare_to_wait(wq_head, &q->wq_entry, mode);
 		if (test_bit(q->key.bit_nr, q->key.flags))
 			ret = (*action)(&q->key, mode);
 	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
-	finish_wait(wq_head, &q->wait);
+	finish_wait(wq_head, &q->wq_entry);
 	return ret;
 }
 EXPORT_SYMBOL(__wait_on_bit);
@@ -431,9 +431,9 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
 				    wait_bit_action_f *action, unsigned mode)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wait, word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
 
-	return __wait_on_bit(wq_head, &wait, action, mode);
+	return __wait_on_bit(wq_head, &wq_entry, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit);
 
@@ -442,10 +442,10 @@ int __sched out_of_line_wait_on_bit_timeout(
 	unsigned mode, unsigned long timeout)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wait, word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
 
-	wait.key.timeout = jiffies + timeout;
-	return __wait_on_bit(wq_head, &wait, action, mode);
+	wq_entry.key.timeout = jiffies + timeout;
+	return __wait_on_bit(wq_head, &wq_entry, action, mode);
 }
 EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
 
@@ -456,7 +456,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 	int ret = 0;
 
 	for (;;) {
-		prepare_to_wait_exclusive(wq_head, &q->wait, mode);
+		prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode);
 		if (test_bit(q->key.bit_nr, q->key.flags)) {
 			ret = action(&q->key, mode);
 			/*
@@ -466,11 +466,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 			 * smp_mb__after_atomic() before wake_up_page().
 			 */
 			if (ret)
-				finish_wait(wq_head, &q->wait);
+				finish_wait(wq_head, &q->wq_entry);
 		}
 		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
 			if (!ret)
-				finish_wait(wq_head, &q->wait);
+				finish_wait(wq_head, &q->wq_entry);
 			return 0;
 		} else if (ret) {
 			return ret;
@@ -483,9 +483,9 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
 					 wait_bit_action_f *action, unsigned mode)
 {
 	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wait, word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
 
-	return __wait_on_bit_lock(wq_head, &wait, action, mode);
+	return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
 
@@ -538,8 +538,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
 				  void *arg)
 {
 	struct wait_bit_key *key = arg;
-	struct wait_bit_queue *wait_bit
-		= container_of(wq_entry, struct wait_bit_queue, wait);
+	struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry);
 	atomic_t *val = key->flags;
 
 	if (wait_bit->key.flags != key->flags ||
@@ -562,24 +561,24 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq_head, &q->wait, mode);
+		prepare_to_wait(wq_head, &q->wq_entry, mode);
 		val = q->key.flags;
 		if (atomic_read(val) == 0)
 			break;
 		ret = (*action)(val);
 	} while (!ret && atomic_read(val) != 0);
-	finish_wait(wq_head, &q->wait);
+	finish_wait(wq_head, &q->wq_entry);
 	return ret;
 }
 
 #define DEFINE_WAIT_ATOMIC_T(name, p)					\
 	struct wait_bit_queue name = {					\
 		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
-		.wait	= {						\
+		.wq_entry = {						\
 			.private	= current,			\
 			.func		= wake_atomic_t_function,	\
 			.task_list	=				\
-				LIST_HEAD_INIT((name).wait.task_list),	\
+				LIST_HEAD_INIT((name).wq_entry.task_list), \
 		},							\
 	}
 
@@ -587,9 +586,9 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
 					 unsigned mode)
 {
 	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
-	DEFINE_WAIT_ATOMIC_T(wait, p);
+	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
 
-	return __wait_on_atomic_t(wq_head, &wait, action, mode);
+	return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
 }
 EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
-- 
cgit v1.3-14-g43fede


From 76c85ddc4695bb7b8209bfeff11f5156088f9197 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Mar 2017 11:35:27 +0100
Subject: sched/wait: Standardize wait_bit_queue naming

So wait-bit-queue head variables are often named:

	struct wait_bit_queue *q

... which is a bit ambiguous and super confusing, because
they clearly suggest wait-queue head semantics and behavior
(they rhyme with the old wait_queue_t *q naming), while they
are extended wait-queue _entries_, not heads!

They are misnomers in two ways:

 - the 'wait_bit_queue' leaves open the question of whether
   it's an entry or a head

 - the 'q' parameter and local variable naming falsely implies
   that it's a 'queue' - while it's an entry.

This resulted in sometimes confusing cases such as:

	finish_wait(wq, &q->wait);

where the 'q' is not a wait-queue head, but a wait-bit-queue entry.

So improve this all by standardizing wait-bit-queue nomenclature
similar to wait-queue head naming:

	struct wait_bit_queue   => struct wait_bit_queue_entry
	q			=> wbq_entry

Which makes it all much clearer:

	struct wait_bit_queue_entry *wbq_entry

... and turns the former confusing piece of code into:

	finish_wait(wq_head, &wbq_entry->wq_entry);

which IMHO makes it apparently clear what we are doing,
without having to analyze the context of the code: we are
adding a wait-queue entry to a regular wait-queue head,
which entry is embedded in a wait-bit-queue entry.

I'm not a big fan of acronyms, but repeating wait_bit_queue_entry
in field and local variable names is too long, so hopefully it's
clear enough that 'wq_' prefixes stand for wait-queues, while
'wbq_' prefixes stand for wait-bit-queues.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait.h |  8 ++++----
 kernel/sched/wait.c  | 41 ++++++++++++++++++++---------------------
 2 files changed, 24 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1c8add685f22..fc7c32d82120 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -36,7 +36,7 @@ struct wait_bit_key {
 	unsigned long		timeout;
 };
 
-struct wait_bit_queue {
+struct wait_bit_queue_entry {
 	struct wait_bit_key	key;
 	struct wait_queue_entry	wq_entry;
 };
@@ -207,8 +207,8 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_bit(struct wait_queue_head *, void *, int);
-int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
-int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned);
+int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue_entry *, wait_bit_action_f *, unsigned);
+int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue_entry *, wait_bit_action_f *, unsigned);
 void wake_up_bit(void *, int);
 void wake_up_atomic_t(atomic_t *);
 int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned);
@@ -989,7 +989,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
 
 #define DEFINE_WAIT_BIT(name, word, bit)				\
-	struct wait_bit_queue name = {					\
+	struct wait_bit_queue_entry name = {				\
 		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),		\
 		.wq_entry = {						\
 			.private	= current,			\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f1ba0625b8be..95e6d3820cba 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -394,8 +394,7 @@ EXPORT_SYMBOL(woken_wake_function);
 int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
 {
 	struct wait_bit_key *key = arg;
-	struct wait_bit_queue *wait_bit
-		= container_of(wq_entry, struct wait_bit_queue, wq_entry);
+	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
 
 	if (wait_bit->key.flags != key->flags ||
 			wait_bit->key.bit_nr != key->bit_nr ||
@@ -412,17 +411,17 @@ EXPORT_SYMBOL(wake_bit_function);
  * permitted return codes. Nonzero return codes halt waiting and return.
  */
 int __sched
-__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
 	      wait_bit_action_f *action, unsigned mode)
 {
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq_head, &q->wq_entry, mode);
-		if (test_bit(q->key.bit_nr, q->key.flags))
-			ret = (*action)(&q->key, mode);
-	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
-	finish_wait(wq_head, &q->wq_entry);
+		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+			ret = (*action)(&wbq_entry->key, mode);
+	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+	finish_wait(wq_head, &wbq_entry->wq_entry);
 	return ret;
 }
 EXPORT_SYMBOL(__wait_on_bit);
@@ -450,15 +449,15 @@ int __sched out_of_line_wait_on_bit_timeout(
 EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
 
 int __sched
-__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
 			wait_bit_action_f *action, unsigned mode)
 {
 	int ret = 0;
 
 	for (;;) {
-		prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode);
-		if (test_bit(q->key.bit_nr, q->key.flags)) {
-			ret = action(&q->key, mode);
+		prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
+		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+			ret = action(&wbq_entry->key, mode);
 			/*
 			 * See the comment in prepare_to_wait_event().
 			 * finish_wait() does not necessarily takes wwq_head->lock,
@@ -466,11 +465,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
 			 * smp_mb__after_atomic() before wake_up_page().
 			 */
 			if (ret)
-				finish_wait(wq_head, &q->wq_entry);
+				finish_wait(wq_head, &wbq_entry->wq_entry);
 		}
-		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
+		if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
 			if (!ret)
-				finish_wait(wq_head, &q->wq_entry);
+				finish_wait(wq_head, &wbq_entry->wq_entry);
 			return 0;
 		} else if (ret) {
 			return ret;
@@ -538,7 +537,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
 				  void *arg)
 {
 	struct wait_bit_key *key = arg;
-	struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry);
+	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
 	atomic_t *val = key->flags;
 
 	if (wait_bit->key.flags != key->flags ||
@@ -554,25 +553,25 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
  * return codes halt waiting and return.
  */
 static __sched
-int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q,
+int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
 		       int (*action)(atomic_t *), unsigned mode)
 {
 	atomic_t *val;
 	int ret = 0;
 
 	do {
-		prepare_to_wait(wq_head, &q->wq_entry, mode);
-		val = q->key.flags;
+		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+		val = wbq_entry->key.flags;
 		if (atomic_read(val) == 0)
 			break;
 		ret = (*action)(val);
 	} while (!ret && atomic_read(val) != 0);
-	finish_wait(wq_head, &q->wq_entry);
+	finish_wait(wq_head, &wbq_entry->wq_entry);
 	return ret;
 }
 
 #define DEFINE_WAIT_ATOMIC_T(name, p)					\
-	struct wait_bit_queue name = {					\
+	struct wait_bit_queue_entry name = {				\
 		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
 		.wq_entry = {						\
 			.private	= current,			\
-- 
cgit v1.3-14-g43fede


From 5dd43ce2f69d42a71dcacdb13d17d8c0ac1fe8f7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 20 Jun 2017 12:19:09 +0200
Subject: sched/wait: Split out the wait_bit*() APIs from <linux/wait.h> into
 <linux/wait_bit.h>

The wait_bit*() types and APIs are mixed into wait.h, but they
are a pretty orthogonal extension of wait-queues.

Furthermore, only about 50 kernel files use these APIs, while
over 1000 use the regular wait-queue functionality.

So clean up the main wait.h by moving the wait-bit functionality
out of it, into a separate .h and .c file:

  include/linux/wait_bit.h  for types and APIs
  kernel/sched/wait_bit.c   for the implementation

Update all header dependencies.

This reduces the size of wait.h rather significantly, by about 30%.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/cachefiles/internal.h     |   2 +-
 fs/cifs/inode.c              |   1 +
 fs/nfs/internal.h            |   1 +
 include/linux/fs.h           |   2 +-
 include/linux/sunrpc/sched.h |   2 +-
 include/linux/wait.h         | 250 ----------------------------------------
 include/linux/wait_bit.h     | 260 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/Makefile        |   2 +-
 kernel/sched/wait.c          | 257 ------------------------------------------
 kernel/sched/wait_bit.c      | 263 +++++++++++++++++++++++++++++++++++++++++++
 security/keys/internal.h     |   1 +
 11 files changed, 530 insertions(+), 511 deletions(-)
 create mode 100644 include/linux/wait_bit.h
 create mode 100644 kernel/sched/wait_bit.c

(limited to 'kernel')

diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 54a4fcd679ed..bb3a02ca9da4 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -18,7 +18,7 @@
 
 #include <linux/fscache-cache.h>
 #include <linux/timer.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
 #include <linux/cred.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4d1fcd76d022..a8693632235f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -24,6 +24,7 @@
 #include <linux/pagemap.h>
 #include <linux/freezer.h>
 #include <linux/sched/signal.h>
+#include <linux/wait_bit.h>
 
 #include <asm/div64.h>
 #include "cifsfs.h"
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3e24392f2caa..8701d7617964 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -7,6 +7,7 @@
 #include <linux/security.h>
 #include <linux/crc32.h>
 #include <linux/nfs_page.h>
+#include <linux/wait_bit.h>
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9b2654..53f7e49d8fe5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,7 +2,7 @@
 #define _LINUX_FS_H
 
 #include <linux/linkage.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
 #include <linux/kdev_t.h>
 #include <linux/dcache.h>
 #include <linux/path.h>
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 7ba040c797ec..9d7529ffc4ce 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -13,7 +13,7 @@
 #include <linux/ktime.h>
 #include <linux/sunrpc/types.h>
 #include <linux/spinlock.h>
-#include <linux/wait.h>
+#include <linux/wait_bit.h>
 #include <linux/workqueue.h>
 #include <linux/sunrpc/xdr.h>
 
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 0805098f3589..629489746f8a 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -29,18 +29,6 @@ struct wait_queue_entry {
 	struct list_head	task_list;
 };
 
-struct wait_bit_key {
-	void			*flags;
-	int			bit_nr;
-#define WAIT_ATOMIC_T_BIT_NR	-1
-	unsigned long		timeout;
-};
-
-struct wait_bit_queue_entry {
-	struct wait_bit_key	key;
-	struct wait_queue_entry	wq_entry;
-};
-
 struct wait_queue_head {
 	spinlock_t		lock;
 	struct list_head	task_list;
@@ -68,12 +56,6 @@ struct task_struct;
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
 	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
 
-#define __WAIT_BIT_KEY_INITIALIZER(word, bit)					\
-	{ .flags = word, .bit_nr = bit, }
-
-#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p)					\
-	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
-
 extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);
 
 #define init_waitqueue_head(wq_head)						\
@@ -200,22 +182,11 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
 	list_del(&wq_entry->task_list);
 }
 
-typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
-int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
-int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
-void wake_up_bit(void *word, int bit);
-void wake_up_atomic_t(atomic_t *p);
-int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
-int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
-struct wait_queue_head *bit_waitqueue(void *word, int bit);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -976,7 +947,6 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
 long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
 int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
 int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
-int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT_FUNC(name, function)					\
 	struct wait_queue_entry name = {					\
@@ -987,17 +957,6 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 
 #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
 
-#define DEFINE_WAIT_BIT(name, word, bit)					\
-	struct wait_bit_queue_entry name = {					\
-		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),			\
-		.wq_entry = {							\
-			.private	= current,				\
-			.func		= wake_bit_function,			\
-			.task_list	=					\
-				LIST_HEAD_INIT((name).wq_entry.task_list),	\
-		},								\
-	}
-
 #define init_wait(wait)								\
 	do {									\
 		(wait)->private = current;					\
@@ -1006,213 +965,4 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 		(wait)->flags = 0;						\
 	} while (0)
 
-
-extern int bit_wait(struct wait_bit_key *key, int bit);
-extern int bit_wait_io(struct wait_bit_key *key, int bit);
-extern int bit_wait_timeout(struct wait_bit_key *key, int bit);
-extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit);
-
-/**
- * wait_on_bit - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit.
- * For instance, if one were to have waiters on a bitflag, one would
- * call wait_on_bit() in threads waiting for the bit to clear.
- * One uses wait_on_bit() where one is waiting for the bit to clear,
- * but has no intention of setting it.
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit(unsigned long *word, int bit, unsigned mode)
-{
-	might_sleep();
-	if (!test_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit(word, bit,
-				       bit_wait,
-				       mode);
-}
-
-/**
- * wait_on_bit_io - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared.  This is similar to wait_on_bit(), but calls
- * io_schedule() instead of schedule() for the actual waiting.
- *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
-{
-	might_sleep();
-	if (!test_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit(word, bit,
-				       bit_wait_io,
-				       mode);
-}
-
-/**
- * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- * @timeout: timeout, in jiffies
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared. This is similar to wait_on_bit(), except also takes a
- * timeout parameter.
- *
- * Returned value will be zero if the bit was cleared before the
- * @timeout elapsed, or non-zero if the @timeout elapsed or process
- * received a signal and the mode permitted wakeup on that signal.
- */
-static inline int
-wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
-		    unsigned long timeout)
-{
-	might_sleep();
-	if (!test_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit_timeout(word, bit,
-					       bit_wait_timeout,
-					       mode, timeout);
-}
-
-/**
- * wait_on_bit_action - wait for a bit to be cleared
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared, and allow the waiting action to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
- *
- * Returned value will be zero if the bit was cleared, or non-zero
- * if the process received a signal and the mode permitted wakeup
- * on that signal.
- */
-static inline int
-wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
-		   unsigned mode)
-{
-	might_sleep();
-	if (!test_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit(word, bit, action, mode);
-}
-
-/**
- * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that waits on a bit
- * when one intends to set it, for instance, trying to lock bitflags.
- * For instance, if one were to have waiters trying to set bitflag
- * and waiting for it to clear before setting it, one would call
- * wait_on_bit() in threads waiting to be able to set the bit.
- * One uses wait_on_bit_lock() where one is waiting for the bit to
- * clear with the intention of setting it, and when done, clearing it.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set.  Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
-{
-	might_sleep();
-	if (!test_and_set_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
-}
-
-/**
- * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to atomically set it.  This is similar
- * to wait_on_bit(), but calls io_schedule() instead of schedule()
- * for the actual waiting.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set.  Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
-{
-	might_sleep();
-	if (!test_and_set_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
-}
-
-/**
- * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Use the standard hashed waitqueue table to wait for a bit
- * to be cleared and then to set it, and allow the waiting action
- * to be specified.
- * This is like wait_on_bit() but allows fine control of how the waiting
- * is done.
- *
- * Returns zero if the bit was (eventually) found to be clear and was
- * set.  Returns non-zero if a signal was delivered to the process and
- * the @mode allows that signal to wake the process.
- */
-static inline int
-wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
-			unsigned mode)
-{
-	might_sleep();
-	if (!test_and_set_bit(bit, word))
-		return 0;
-	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
-}
-
-/**
- * wait_on_atomic_t - Wait for an atomic_t to become 0
- * @val: The atomic value being waited on, a kernel virtual address
- * @action: the function used to sleep, which may take special actions
- * @mode: the task state to sleep in
- *
- * Wait for an atomic_t to become 0.  We abuse the bit-wait waitqueue table for
- * the purpose of getting a waitqueue, but we set the key to a bit number
- * outside of the target 'word'.
- */
-static inline
-int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
-{
-	might_sleep();
-	if (atomic_read(val) == 0)
-		return 0;
-	return out_of_line_wait_on_atomic_t(val, action, mode);
-}
-
 #endif /* _LINUX_WAIT_H */
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
new file mode 100644
index 000000000000..8c85c52d94b6
--- /dev/null
+++ b/include/linux/wait_bit.h
@@ -0,0 +1,260 @@
+#ifndef _LINUX_WAIT_BIT_H
+#define _LINUX_WAIT_BIT_H
+
+/*
+ * Linux wait-bit related types and methods:
+ */
+#include <linux/wait.h>
+
+struct wait_bit_key {
+	void			*flags;
+	int			bit_nr;
+#define WAIT_ATOMIC_T_BIT_NR	-1
+	unsigned long		timeout;
+};
+
+struct wait_bit_queue_entry {
+	struct wait_bit_key	key;
+	struct wait_queue_entry	wq_entry;
+};
+
+#define __WAIT_BIT_KEY_INITIALIZER(word, bit)					\
+	{ .flags = word, .bit_nr = bit, }
+
+#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p)					\
+	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
+
+typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
+int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
+int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
+void wake_up_bit(void *word, int bit);
+void wake_up_atomic_t(atomic_t *p);
+int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
+int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
+int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
+struct wait_queue_head *bit_waitqueue(void *word, int bit);
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
+
+#define DEFINE_WAIT_BIT(name, word, bit)					\
+	struct wait_bit_queue_entry name = {					\
+		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),			\
+		.wq_entry = {							\
+			.private	= current,				\
+			.func		= wake_bit_function,			\
+			.task_list	=					\
+				LIST_HEAD_INIT((name).wq_entry.task_list),	\
+		},								\
+	}
+
+extern int bit_wait(struct wait_bit_key *key, int bit);
+extern int bit_wait_io(struct wait_bit_key *key, int bit);
+extern int bit_wait_timeout(struct wait_bit_key *key, int bit);
+extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit);
+
+/**
+ * wait_on_bit - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit.
+ * For instance, if one were to have waiters on a bitflag, one would
+ * call wait_on_bit() in threads waiting for the bit to clear.
+ * One uses wait_on_bit() where one is waiting for the bit to clear,
+ * but has no intention of setting it.
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit(unsigned long *word, int bit, unsigned mode)
+{
+	might_sleep();
+	if (!test_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit(word, bit,
+				       bit_wait,
+				       mode);
+}
+
+/**
+ * wait_on_bit_io - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared.  This is similar to wait_on_bit(), but calls
+ * io_schedule() instead of schedule() for the actual waiting.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_io(unsigned long *word, int bit, unsigned mode)
+{
+	might_sleep();
+	if (!test_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit(word, bit,
+				       bit_wait_io,
+				       mode);
+}
+
+/**
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ * @timeout: timeout, in jiffies
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), except also takes a
+ * timeout parameter.
+ *
+ * Returned value will be zero if the bit was cleared before the
+ * @timeout elapsed, or non-zero if the @timeout elapsed or process
+ * received a signal and the mode permitted wakeup on that signal.
+ */
+static inline int
+wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode,
+		    unsigned long timeout)
+{
+	might_sleep();
+	if (!test_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_timeout(word, bit,
+					       bit_wait_timeout,
+					       mode, timeout);
+}
+
+/**
+ * wait_on_bit_action - wait for a bit to be cleared
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared, and allow the waiting action to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returned value will be zero if the bit was cleared, or non-zero
+ * if the process received a signal and the mode permitted wakeup
+ * on that signal.
+ */
+static inline int
+wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action,
+		   unsigned mode)
+{
+	might_sleep();
+	if (!test_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit(word, bit, action, mode);
+}
+
+/**
+ * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that waits on a bit
+ * when one intends to set it, for instance, trying to lock bitflags.
+ * For instance, if one were to have waiters trying to set bitflag
+ * and waiting for it to clear before setting it, one would call
+ * wait_on_bit() in threads waiting to be able to set the bit.
+ * One uses wait_on_bit_lock() where one is waiting for the bit to
+ * clear with the intention of setting it, and when done, clearing it.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set.  Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock(unsigned long *word, int bit, unsigned mode)
+{
+	might_sleep();
+	if (!test_and_set_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode);
+}
+
+/**
+ * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to atomically set it.  This is similar
+ * to wait_on_bit(), but calls io_schedule() instead of schedule()
+ * for the actual waiting.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set.  Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode)
+{
+	might_sleep();
+	if (!test_and_set_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode);
+}
+
+/**
+ * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared and then to set it, and allow the waiting action
+ * to be specified.
+ * This is like wait_on_bit() but allows fine control of how the waiting
+ * is done.
+ *
+ * Returns zero if the bit was (eventually) found to be clear and was
+ * set.  Returns non-zero if a signal was delivered to the process and
+ * the @mode allows that signal to wake the process.
+ */
+static inline int
+wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
+			unsigned mode)
+{
+	might_sleep();
+	if (!test_and_set_bit(bit, word))
+		return 0;
+	return out_of_line_wait_on_bit_lock(word, bit, action, mode);
+}
+
+/**
+ * wait_on_atomic_t - Wait for an atomic_t to become 0
+ * @val: The atomic value being waited on, a kernel virtual address
+ * @action: the function used to sleep, which may take special actions
+ * @mode: the task state to sleep in
+ *
+ * Wait for an atomic_t to become 0.  We abuse the bit-wait waitqueue table for
+ * the purpose of getting a waitqueue, but we set the key to a bit number
+ * outside of the target 'word'.
+ */
+static inline
+int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+{
+	might_sleep();
+	if (atomic_read(val) == 0)
+		return 0;
+	return out_of_line_wait_on_atomic_t(val, action, mode);
+}
+
+#endif /* _LINUX_WAIT_BIT_H */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 89ab6758667b..16277e2ed8ee 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,7 +17,7 @@ endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o completion.o idle.o
+obj-y += wait.o wait_bit.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 95e6d3820cba..6bcd7c3c4501 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -390,260 +390,3 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sy
 	return default_wake_function(wq_entry, mode, sync, key);
 }
 EXPORT_SYMBOL(woken_wake_function);
-
-int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
-{
-	struct wait_bit_key *key = arg;
-	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
-
-	if (wait_bit->key.flags != key->flags ||
-			wait_bit->key.bit_nr != key->bit_nr ||
-			test_bit(key->bit_nr, key->flags))
-		return 0;
-	else
-		return autoremove_wake_function(wq_entry, mode, sync, key);
-}
-EXPORT_SYMBOL(wake_bit_function);
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
- * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
- * permitted return codes. Nonzero return codes halt waiting and return.
- */
-int __sched
-__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-	      wait_bit_action_f *action, unsigned mode)
-{
-	int ret = 0;
-
-	do {
-		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
-		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
-			ret = (*action)(&wbq_entry->key, mode);
-	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
-	finish_wait(wq_head, &wbq_entry->wq_entry);
-	return ret;
-}
-EXPORT_SYMBOL(__wait_on_bit);
-
-int __sched out_of_line_wait_on_bit(void *word, int bit,
-				    wait_bit_action_f *action, unsigned mode)
-{
-	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wq_entry, word, bit);
-
-	return __wait_on_bit(wq_head, &wq_entry, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit);
-
-int __sched out_of_line_wait_on_bit_timeout(
-	void *word, int bit, wait_bit_action_f *action,
-	unsigned mode, unsigned long timeout)
-{
-	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wq_entry, word, bit);
-
-	wq_entry.key.timeout = jiffies + timeout;
-	return __wait_on_bit(wq_head, &wq_entry, action, mode);
-}
-EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
-
-int __sched
-__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-			wait_bit_action_f *action, unsigned mode)
-{
-	int ret = 0;
-
-	for (;;) {
-		prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
-		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
-			ret = action(&wbq_entry->key, mode);
-			/*
-			 * See the comment in prepare_to_wait_event().
-			 * finish_wait() does not necessarily takes wwq_head->lock,
-			 * but test_and_set_bit() implies mb() which pairs with
-			 * smp_mb__after_atomic() before wake_up_page().
-			 */
-			if (ret)
-				finish_wait(wq_head, &wbq_entry->wq_entry);
-		}
-		if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
-			if (!ret)
-				finish_wait(wq_head, &wbq_entry->wq_entry);
-			return 0;
-		} else if (ret) {
-			return ret;
-		}
-	}
-}
-EXPORT_SYMBOL(__wait_on_bit_lock);
-
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
-					 wait_bit_action_f *action, unsigned mode)
-{
-	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
-	DEFINE_WAIT_BIT(wq_entry, word, bit);
-
-	return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
-{
-	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
-	if (waitqueue_active(wq_head))
-		__wake_up(wq_head, TASK_NORMAL, 1, &key);
-}
-EXPORT_SYMBOL(__wake_up_bit);
-
-/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
- *
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
- *
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
- */
-void wake_up_bit(void *word, int bit)
-{
-	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
-}
-EXPORT_SYMBOL(wake_up_bit);
-
-/*
- * Manipulate the atomic_t address to produce a better bit waitqueue table hash
- * index (we're keying off bit -1, but that would produce a horrible hash
- * value).
- */
-static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
-{
-	if (BITS_PER_LONG == 64) {
-		unsigned long q = (unsigned long)p;
-		return bit_waitqueue((void *)(q & ~1), q & 1);
-	}
-	return bit_waitqueue(p, 0);
-}
-
-static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
-				  void *arg)
-{
-	struct wait_bit_key *key = arg;
-	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
-	atomic_t *val = key->flags;
-
-	if (wait_bit->key.flags != key->flags ||
-	    wait_bit->key.bit_nr != key->bit_nr ||
-	    atomic_read(val) != 0)
-		return 0;
-	return autoremove_wake_function(wq_entry, mode, sync, key);
-}
-
-/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
- * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero
- * return codes halt waiting and return.
- */
-static __sched
-int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-		       int (*action)(atomic_t *), unsigned mode)
-{
-	atomic_t *val;
-	int ret = 0;
-
-	do {
-		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
-		val = wbq_entry->key.flags;
-		if (atomic_read(val) == 0)
-			break;
-		ret = (*action)(val);
-	} while (!ret && atomic_read(val) != 0);
-	finish_wait(wq_head, &wbq_entry->wq_entry);
-	return ret;
-}
-
-#define DEFINE_WAIT_ATOMIC_T(name, p)					\
-	struct wait_bit_queue_entry name = {				\
-		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
-		.wq_entry = {						\
-			.private	= current,			\
-			.func		= wake_atomic_t_function,	\
-			.task_list	=				\
-				LIST_HEAD_INIT((name).wq_entry.task_list), \
-		},							\
-	}
-
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
-					 unsigned mode)
-{
-	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
-	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
-
-	return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
-}
-EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
-
-/**
- * wake_up_atomic_t - Wake up a waiter on a atomic_t
- * @p: The atomic_t being waited on, a kernel virtual address
- *
- * Wake up anyone waiting for the atomic_t to go to zero.
- *
- * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
- * check is done by the waiter's wake function, not the by the waker itself).
- */
-void wake_up_atomic_t(atomic_t *p)
-{
-	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
-}
-EXPORT_SYMBOL(wake_up_atomic_t);
-
-__sched int bit_wait(struct wait_bit_key *word, int mode)
-{
-	schedule();
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-	return 0;
-}
-EXPORT_SYMBOL(bit_wait);
-
-__sched int bit_wait_io(struct wait_bit_key *word, int mode)
-{
-	io_schedule();
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-	return 0;
-}
-EXPORT_SYMBOL(bit_wait_io);
-
-__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
-{
-	unsigned long now = READ_ONCE(jiffies);
-	if (time_after_eq(now, word->timeout))
-		return -EAGAIN;
-	schedule_timeout(word->timeout - now);
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_timeout);
-
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
-	unsigned long now = READ_ONCE(jiffies);
-	if (time_after_eq(now, word->timeout))
-		return -EAGAIN;
-	io_schedule_timeout(word->timeout - now);
-	if (signal_pending_state(mode, current))
-		return -EINTR;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
new file mode 100644
index 000000000000..463bac84dfd1
--- /dev/null
+++ b/kernel/sched/wait_bit.c
@@ -0,0 +1,263 @@
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+#include <linux/wait_bit.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/debug.h>
+
+int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+	if (wait_bit->key.flags != key->flags ||
+			wait_bit->key.bit_nr != key->bit_nr ||
+			test_bit(key->bit_nr, key->flags))
+		return 0;
+	else
+		return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+EXPORT_SYMBOL(wake_bit_function);
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
+ * permitted return codes. Nonzero return codes halt waiting and return.
+ */
+int __sched
+__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+	      wait_bit_action_f *action, unsigned mode)
+{
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
+			ret = (*action)(&wbq_entry->key, mode);
+	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
+	finish_wait(wq_head, &wbq_entry->wq_entry);
+	return ret;
+}
+EXPORT_SYMBOL(__wait_on_bit);
+
+int __sched out_of_line_wait_on_bit(void *word, int bit,
+				    wait_bit_action_f *action, unsigned mode)
+{
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+	return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit);
+
+int __sched out_of_line_wait_on_bit_timeout(
+	void *word, int bit, wait_bit_action_f *action,
+	unsigned mode, unsigned long timeout)
+{
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+	wq_entry.key.timeout = jiffies + timeout;
+	return __wait_on_bit(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
+int __sched
+__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+			wait_bit_action_f *action, unsigned mode)
+{
+	int ret = 0;
+
+	for (;;) {
+		prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode);
+		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+			ret = action(&wbq_entry->key, mode);
+			/*
+			 * See the comment in prepare_to_wait_event().
+			 * finish_wait() does not necessarily take wq_head->lock,
+			 * but test_and_set_bit() implies mb() which pairs with
+			 * smp_mb__after_atomic() before wake_up_page().
+			 */
+			if (ret)
+				finish_wait(wq_head, &wbq_entry->wq_entry);
+		}
+		if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) {
+			if (!ret)
+				finish_wait(wq_head, &wbq_entry->wq_entry);
+			return 0;
+		} else if (ret) {
+			return ret;
+		}
+	}
+}
+EXPORT_SYMBOL(__wait_on_bit_lock);
+
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+					 wait_bit_action_f *action, unsigned mode)
+{
+	struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
+	DEFINE_WAIT_BIT(wq_entry, word, bit);
+
+	return __wait_on_bit_lock(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
+
+void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+{
+	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
+	if (waitqueue_active(wq_head))
+		__wake_up(wq_head, TASK_NORMAL, 1, &key);
+}
+EXPORT_SYMBOL(__wake_up_bit);
+
+/**
+ * wake_up_bit - wake up a waiter on a bit
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ *
+ * There is a standard hashed waitqueue table for generic use. This
+ * is the part of the hashtable's accessor API that wakes up waiters
+ * on a bit. For instance, if one were to have waiters on a bitflag,
+ * one would call wake_up_bit() after clearing the bit.
+ *
+ * In order for this to function properly, as it uses waitqueue_active()
+ * internally, some kind of memory barrier must be done prior to calling
+ * this. Typically, this will be smp_mb__after_atomic(), but in some
+ * cases where bitflags are manipulated non-atomically under a lock, one
+ * may need to use a less regular barrier, such as fs/inode.c's smp_mb(),
+ * because spin_unlock() does not guarantee a memory barrier.
+ */
+void wake_up_bit(void *word, int bit)
+{
+	__wake_up_bit(bit_waitqueue(word, bit), word, bit);
+}
+EXPORT_SYMBOL(wake_up_bit);
+
+/*
+ * Manipulate the atomic_t address to produce a better bit waitqueue table hash
+ * index (we're keying off bit -1, but that would produce a horrible hash
+ * value).
+ */
+static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
+{
+	if (BITS_PER_LONG == 64) {
+		unsigned long q = (unsigned long)p;
+		return bit_waitqueue((void *)(q & ~1), q & 1);
+	}
+	return bit_waitqueue(p, 0);
+}
+
+static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync,
+				  void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+	atomic_t *val = key->flags;
+
+	if (wait_bit->key.flags != key->flags ||
+	    wait_bit->key.bit_nr != key->bit_nr ||
+	    atomic_read(val) != 0)
+		return 0;
+	return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+/*
+ * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting,
+ * the actions of __wait_on_atomic_t() are permitted return codes.  Nonzero
+ * return codes halt waiting and return.
+ */
+static __sched
+int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
+		       int (*action)(atomic_t *), unsigned mode)
+{
+	atomic_t *val;
+	int ret = 0;
+
+	do {
+		prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode);
+		val = wbq_entry->key.flags;
+		if (atomic_read(val) == 0)
+			break;
+		ret = (*action)(val);
+	} while (!ret && atomic_read(val) != 0);
+	finish_wait(wq_head, &wbq_entry->wq_entry);
+	return ret;
+}
+
+#define DEFINE_WAIT_ATOMIC_T(name, p)					\
+	struct wait_bit_queue_entry name = {				\
+		.key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p),		\
+		.wq_entry = {						\
+			.private	= current,			\
+			.func		= wake_atomic_t_function,	\
+			.task_list	=				\
+				LIST_HEAD_INIT((name).wq_entry.task_list), \
+		},							\
+	}
+
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
+					 unsigned mode)
+{
+	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
+	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
+
+	return __wait_on_atomic_t(wq_head, &wq_entry, action, mode);
+}
+EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
+
+/**
+ * wake_up_atomic_t - Wake up a waiter on a atomic_t
+ * @p: The atomic_t being waited on, a kernel virtual address
+ *
+ * Wake up anyone waiting for the atomic_t to go to zero.
+ *
+ * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t
+ * check is done by the waiter's wake function, not by the waker itself).
+ */
+void wake_up_atomic_t(atomic_t *p)
+{
+	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
+}
+EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word, int mode)
+{
+	schedule();
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word, int mode)
+{
+	io_schedule();
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
+{
+	unsigned long now = READ_ONCE(jiffies);
+	if (time_after_eq(now, word->timeout))
+		return -EAGAIN;
+	schedule_timeout(word->timeout - now);
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
+{
+	unsigned long now = READ_ONCE(jiffies);
+	if (time_after_eq(now, word->timeout))
+		return -EAGAIN;
+	io_schedule_timeout(word->timeout - now);
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/security/keys/internal.h b/security/keys/internal.h
index c0f8682eba69..91bc6214ae57 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -13,6 +13,7 @@
 #define _INTERNAL_H
 
 #include <linux/sched.h>
+#include <linux/wait_bit.h>
 #include <linux/cred.h>
 #include <linux/key-type.h>
 #include <linux/task_work.h>
-- 
cgit v1.3-14-g43fede


From 5822a454d6d22297c5fcd66264120587b2ec21cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Sun, 5 Mar 2017 13:09:07 +0100
Subject: sched/wait: Move bit_wait_table[] and related functionality from
 sched/core.c to sched/wait_bit.c

The key hashed waitqueue data structures and their initialization
was done in the main scheduler file for no good reason, move them
to sched/wait_bit.c instead.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/wait_bit.h |  1 +
 kernel/sched/core.c      | 18 ++----------------
 kernel/sched/wait_bit.c  | 23 +++++++++++++++++++++++
 3 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 8c85c52d94b6..9cc82114dbcb 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -35,6 +35,7 @@ int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action,
 int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
 int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
 struct wait_queue_head *bit_waitqueue(void *word, int bit);
+extern void __init wait_bit_init(void);
 
 int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b36644536ab..e7b9ef8df126 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10,6 +10,7 @@
 #include <uapi/linux/sched/types.h>
 #include <linux/sched/loadavg.h>
 #include <linux/sched/hotplug.h>
+#include <linux/wait_bit.h>
 #include <linux/cpuset.h>
 #include <linux/delayacct.h>
 #include <linux/init_task.h>
@@ -6026,28 +6027,13 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
-#define WAIT_TABLE_BITS 8
-#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
-static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
-{
-	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
-	unsigned long val = (unsigned long)word << shift | bit;
-
-	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
-}
-EXPORT_SYMBOL(bit_waitqueue);
-
 void __init sched_init(void)
 {
 	int i, j;
 	unsigned long alloc_size = 0, ptr;
 
 	sched_clock_init();
-
-	for (i = 0; i < WAIT_TABLE_SIZE; i++)
-		init_waitqueue_head(bit_wait_table + i);
+	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 463bac84dfd1..c891b34e1896 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -4,6 +4,21 @@
 #include <linux/wait_bit.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/hash.h>
+
+#define WAIT_TABLE_BITS 8
+#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
+
+static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
+
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
+{
+	const int shift = BITS_PER_LONG == 32 ? 5 : 6;
+	unsigned long val = (unsigned long)word << shift | bit;
+
+	return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
+}
+EXPORT_SYMBOL(bit_waitqueue);
 
 int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg)
 {
@@ -261,3 +276,11 @@ __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
+
+void __init wait_bit_init(void)
+{
+	int i;
+
+	for (i = 0; i < WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(bit_wait_table + i);
+}
-- 
cgit v1.3-14-g43fede


From 2055da97389a605c8a00d163d40903afbe413921 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 20 Jun 2017 12:06:46 +0200
Subject: sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list
 naming

So I've noticed a number of instances where it was not obvious from the
code whether ->task_list was for a wait-queue head or a wait-queue entry.

Furthermore, there's a number of wait-queue users where the lists are
not for 'tasks' but other entities (poll tables, etc.), in which case
the 'task_list' name is actively confusing.

To clear this all up, name the wait-queue head and entry list structure
fields unambiguously:

	struct wait_queue_head::task_list	=> ::head
	struct wait_queue_entry::task_list	=> ::entry

For example, this code:

	rqw->wait.task_list.next != &wait->task_list

... it was pretty unclear (to me) what it's doing, while now it's written this way:

	rqw->wait.head.next != &wait->entry

... which makes it pretty clear that we are iterating a list until we see the head.

Other examples are:

	list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
	list_for_each_entry(wq, &fence->wait.task_list, task_list) {

... where it's unclear (to me) what we are iterating, and during review it's
hard to tell whether it's trying to walk a wait-queue entry (which would be
a bug), while now it's written as:

	list_for_each_entry_safe(pos, next, &x->head, entry) {
	list_for_each_entry(wq, &fence->wait.head, entry) {

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 block/blk-mq.c                       |  2 +-
 block/blk-wbt.c                      |  2 +-
 block/kyber-iosched.c                |  8 ++++----
 drivers/gpu/drm/i915/i915_sw_fence.c | 21 ++++++++++-----------
 drivers/rtc/rtc-imxdi.c              |  2 +-
 fs/cachefiles/rdwr.c                 |  2 +-
 fs/eventpoll.c                       |  2 +-
 fs/fs_pin.c                          |  2 +-
 fs/nilfs2/segment.c                  |  3 +--
 fs/orangefs/orangefs-bufmap.c        |  8 ++++----
 fs/userfaultfd.c                     | 22 +++++++++++-----------
 include/linux/wait.h                 | 20 ++++++++++----------
 include/linux/wait_bit.h             |  4 ++--
 kernel/sched/wait.c                  | 24 ++++++++++++------------
 kernel/sched/wait_bit.c              |  4 ++--
 mm/filemap.c                         |  2 +-
 mm/memcontrol.c                      |  2 +-
 mm/shmem.c                           |  4 ++--
 18 files changed, 66 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a083f95e04b1..121aa1dbb192 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -933,7 +933,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int fla
 
 	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
 
-	list_del(&wait->task_list);
+	list_del(&wait->entry);
 	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 5f3a37c2784c..6a9a0f03a67b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -520,7 +520,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
 	 * in line to be woken up, wait for our turn.
 	 */
 	if (waitqueue_active(&rqw->wait) &&
-	    rqw->wait.task_list.next != &wait->task_list)
+	    rqw->wait.head.next != &wait->entry)
 		return false;
 
 	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index b95d6bd714c0..9bf1484365b2 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -385,7 +385,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
 	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 		INIT_LIST_HEAD(&khd->rqs[i]);
-		INIT_LIST_HEAD(&khd->domain_wait[i].task_list);
+		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 		atomic_set(&khd->wait_index[i], 0);
 	}
 
@@ -512,7 +512,7 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 {
 	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 
-	list_del_init(&wait->task_list);
+	list_del_init(&wait->entry);
 	blk_mq_run_hw_queue(hctx, true);
 	return 1;
 }
@@ -536,7 +536,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 	 * run when one becomes available. Note that this is serialized on
 	 * khd->lock, but we still need to be careful about the waker.
 	 */
-	if (list_empty_careful(&wait->task_list)) {
+	if (list_empty_careful(&wait->entry)) {
 		init_waitqueue_func_entry(wait, kyber_domain_wake);
 		wait->private = hctx;
 		ws = sbq_wait_ptr(domain_tokens,
@@ -736,7 +736,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
 	struct kyber_hctx_data *khd = hctx->sched_data;			\
 	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
 									\
-	seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list));	\
+	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
 	return 0;							\
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 8669bfa33064..380de4360b8a 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -160,31 +160,30 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence,
 
 	/*
 	 * To prevent unbounded recursion as we traverse the graph of
-	 * i915_sw_fences, we move the task_list from this, the next ready
-	 * fence, to the tail of the original fence's task_list
+	 * i915_sw_fences, we move the entry list from this, the next ready
+	 * fence, to the tail of the original fence's entry list
 	 * (and so added to the list to be woken).
 	 */
 
 	spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation);
 	if (continuation) {
-		list_for_each_entry_safe(pos, next, &x->task_list, task_list) {
+		list_for_each_entry_safe(pos, next, &x->head, entry) {
 			if (pos->func == autoremove_wake_function)
 				pos->func(pos, TASK_NORMAL, 0, continuation);
 			else
-				list_move_tail(&pos->task_list, continuation);
+				list_move_tail(&pos->entry, continuation);
 		}
 	} else {
 		LIST_HEAD(extra);
 
 		do {
-			list_for_each_entry_safe(pos, next,
-						 &x->task_list, task_list)
+			list_for_each_entry_safe(pos, next, &x->head, entry)
 				pos->func(pos, TASK_NORMAL, 0, &extra);
 
 			if (list_empty(&extra))
 				break;
 
-			list_splice_tail_init(&extra, &x->task_list);
+			list_splice_tail_init(&extra, &x->head);
 		} while (1);
 	}
 	spin_unlock_irqrestore(&x->lock, flags);
@@ -256,7 +255,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence)
 
 static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key)
 {
-	list_del(&wq->task_list);
+	list_del(&wq->entry);
 	__i915_sw_fence_complete(wq->private, key);
 	i915_sw_fence_put(wq->private);
 	if (wq->flags & I915_SW_FENCE_FLAG_ALLOC)
@@ -275,7 +274,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence,
 	if (fence == signaler)
 		return true;
 
-	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+	list_for_each_entry(wq, &fence->wait.head, entry) {
 		if (wq->func != i915_sw_fence_wake)
 			continue;
 
@@ -293,7 +292,7 @@ static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence)
 	if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags))
 		return;
 
-	list_for_each_entry(wq, &fence->wait.task_list, task_list) {
+	list_for_each_entry(wq, &fence->wait.head, entry) {
 		if (wq->func != i915_sw_fence_wake)
 			continue;
 
@@ -350,7 +349,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 		pending |= I915_SW_FENCE_FLAG_ALLOC;
 	}
 
-	INIT_LIST_HEAD(&wq->task_list);
+	INIT_LIST_HEAD(&wq->entry);
 	wq->flags = pending;
 	wq->func = i915_sw_fence_wake;
 	wq->private = i915_sw_fence_get(fence);
diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c
index 6b54f6c24c5f..80931114c899 100644
--- a/drivers/rtc/rtc-imxdi.c
+++ b/drivers/rtc/rtc-imxdi.c
@@ -709,7 +709,7 @@ static irqreturn_t dryice_irq(int irq, void *dev_id)
 		/*If the write wait queue is empty then there is no pending
 		  operations. It means the interrupt is for DryIce -Security.
 		  IRQ must be returned as none.*/
-		if (list_empty_careful(&imxdi->write_wait.task_list))
+		if (list_empty_careful(&imxdi->write_wait.head))
 			return rc;
 
 		/* DSR_WCF clears itself on DSR read */
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8be33b33b981..18d7aa61ef0f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 	}
 
 	/* remove from the waitqueue */
-	list_del(&wait->task_list);
+	list_del(&wait->entry);
 
 	/* move onto the action list and queue for FS-Cache thread pool */
 	ASSERT(monitor->op);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 5ac1cba5ef72..b1c8e23ddf65 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		 * can't use __remove_wait_queue(). whead->lock is held by
 		 * the caller.
 		 */
-		list_del_init(&wait->task_list);
+		list_del_init(&wait->entry);
 	}
 
 	spin_lock_irqsave(&ep->lock, flags);
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 7b447a245760..e747b3d720ee 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p)
 		rcu_read_unlock();
 		schedule();
 		rcu_read_lock();
-		if (likely(list_empty(&wait.task_list)))
+		if (likely(list_empty(&wait.entry)))
 			break;
 		/* OK, we know p couldn't have been freed yet */
 		spin_lock_irq(&p->wait.lock);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 775304e7f96f..70ded52dc1dd 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
-	list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
-				 wq.task_list) {
+	list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) {
 		if (!atomic_read(&wrq->done) &&
 		    nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
 			wrq->err = err;
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 9e37b7028ea4..038d67545d9f 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -46,7 +46,7 @@ static void run_down(struct slot_map *m)
 	spin_lock(&m->q.lock);
 	if (m->c != -1) {
 		for (;;) {
-			if (likely(list_empty(&wait.task_list)))
+			if (likely(list_empty(&wait.entry)))
 				__add_wait_queue_entry_tail(&m->q, &wait);
 			set_current_state(TASK_UNINTERRUPTIBLE);
 
@@ -84,7 +84,7 @@ static int wait_for_free(struct slot_map *m)
 
 	do {
 		long n = left, t;
-		if (likely(list_empty(&wait.task_list)))
+		if (likely(list_empty(&wait.entry)))
 			__add_wait_queue_entry_tail_exclusive(&m->q, &wait);
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m)
 			left = -EINTR;
 	} while (left > 0);
 
-	if (!list_empty(&wait.task_list))
-		list_del(&wait.task_list);
+	if (!list_empty(&wait.entry))
+		list_del(&wait.entry);
 	else if (left <= 0 && waitqueue_active(&m->q))
 		__wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
 	__set_current_state(TASK_RUNNING);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bda64fcd8a0c..6148ccd6cccf 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 		 * wouldn't be enough, the smp_mb__before_spinlock is
 		 * enough to avoid an explicit smp_mb() here.
 		 */
-		list_del_init(&wq->task_list);
+		list_del_init(&wq->entry);
 out:
 	return ret;
 }
@@ -522,13 +522,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * and it's fine not to block on the spinlock. The uwq on this
 	 * kernel stack can be released after the list_del_init.
 	 */
-	if (!list_empty_careful(&uwq.wq.task_list)) {
+	if (!list_empty_careful(&uwq.wq.entry)) {
 		spin_lock(&ctx->fault_pending_wqh.lock);
 		/*
 		 * No need of list_del_init(), the uwq on the stack
 		 * will be freed shortly anyway.
 		 */
-		list_del(&uwq.wq.task_list);
+		list_del(&uwq.wq.entry);
 		spin_unlock(&ctx->fault_pending_wqh.lock);
 	}
 
@@ -869,7 +869,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in(
 	if (!waitqueue_active(wqh))
 		goto out;
 	/* walk in reverse to provide FIFO behavior to read userfaults */
-	wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
+	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 out:
 	return uwq;
@@ -1003,14 +1003,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			 * changes __remove_wait_queue() to use
 			 * list_del_init() in turn breaking the
 			 * !list_empty_careful() check in
-			 * handle_userfault(). The uwq->wq.task_list
+			 * handle_userfault(). The uwq->wq.entry list
 			 * must never be empty at any time during the
 			 * refile, or the waitqueue could disappear
 			 * from under us. The "wait_queue_head_t"
 			 * parameter of __remove_wait_queue() is unused
 			 * anyway.
 			 */
-			list_del(&uwq->wq.task_list);
+			list_del(&uwq->wq.entry);
 			__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
 
 			write_seqcount_end(&ctx->refile_seq);
@@ -1032,7 +1032,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 				fork_nctx = (struct userfaultfd_ctx *)
 					(unsigned long)
 					uwq->msg.arg.reserved.reserved1;
-				list_move(&uwq->wq.task_list, &fork_event);
+				list_move(&uwq->wq.entry, &fork_event);
 				spin_unlock(&ctx->event_wqh.lock);
 				ret = 0;
 				break;
@@ -1069,8 +1069,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			if (!list_empty(&fork_event)) {
 				uwq = list_first_entry(&fork_event,
 						       typeof(*uwq),
-						       wq.task_list);
-				list_del(&uwq->wq.task_list);
+						       wq.entry);
+				list_del(&uwq->wq.entry);
 				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
 				userfaultfd_event_complete(ctx, uwq);
 			}
@@ -1752,12 +1752,12 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
 	unsigned long pending = 0, total = 0;
 
 	spin_lock(&ctx->fault_pending_wqh.lock);
-	list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
 		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 		pending++;
 		total++;
 	}
-	list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
 		uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 		total++;
 	}
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 629489746f8a..b289c96151ee 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -26,12 +26,12 @@ struct wait_queue_entry {
 	unsigned int		flags;
 	void			*private;
 	wait_queue_func_t	func;
-	struct list_head	task_list;
+	struct list_head	entry;
 };
 
 struct wait_queue_head {
 	spinlock_t		lock;
-	struct list_head	task_list;
+	struct list_head	head;
 };
 typedef struct wait_queue_head wait_queue_head_t;
 
@@ -44,14 +44,14 @@ struct task_struct;
 #define __WAITQUEUE_INITIALIZER(name, tsk) {					\
 	.private	= tsk,							\
 	.func		= default_wake_function,				\
-	.task_list	= { NULL, NULL } }
+	.entry		= { NULL, NULL } }
 
 #define DECLARE_WAITQUEUE(name, tsk)						\
 	struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {					\
 	.lock		= __SPIN_LOCK_UNLOCKED(name.lock),			\
-	.task_list	= { &(name).task_list, &(name).task_list } }
+	.head		= { &(name).head, &(name).head } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
 	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -121,7 +121,7 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f
  */
 static inline int waitqueue_active(struct wait_queue_head *wq_head)
 {
-	return !list_empty(&wq_head->task_list);
+	return !list_empty(&wq_head->head);
 }
 
 /**
@@ -151,7 +151,7 @@ extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue
 
 static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-	list_add(&wq_entry->task_list, &wq_head->task_list);
+	list_add(&wq_entry->entry, &wq_head->head);
 }
 
 /*
@@ -166,7 +166,7 @@ __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_en
 
 static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-	list_add_tail(&wq_entry->task_list, &wq_head->task_list);
+	list_add_tail(&wq_entry->entry, &wq_head->head);
 }
 
 static inline void
@@ -179,7 +179,7 @@ __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wa
 static inline void
 __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
-	list_del(&wq_entry->task_list);
+	list_del(&wq_entry->entry);
 }
 
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
@@ -952,7 +952,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 	struct wait_queue_entry name = {					\
 		.private	= current,					\
 		.func		= function,					\
-		.task_list	= LIST_HEAD_INIT((name).task_list),		\
+		.entry		= LIST_HEAD_INIT((name).entry),			\
 	}
 
 #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
@@ -961,7 +961,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 	do {									\
 		(wait)->private = current;					\
 		(wait)->func = autoremove_wake_function;			\
-		INIT_LIST_HEAD(&(wait)->task_list);				\
+		INIT_LIST_HEAD(&(wait)->entry);					\
 		(wait)->flags = 0;						\
 	} while (0)
 
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 9cc82114dbcb..12b26660d7e9 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -45,8 +45,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 		.wq_entry = {							\
 			.private	= current,				\
 			.func		= wake_bit_function,			\
-			.task_list	=					\
-				LIST_HEAD_INIT((name).wq_entry.task_list),	\
+			.entry		=					\
+				LIST_HEAD_INIT((name).wq_entry.entry),		\
 		},								\
 	}
 
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 6bcd7c3c4501..17f11c6b0a9f 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -16,7 +16,7 @@ void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, st
 {
 	spin_lock_init(&wq_head->lock);
 	lockdep_set_class_and_name(&wq_head->lock, key, name);
-	INIT_LIST_HEAD(&wq_head->task_list);
+	INIT_LIST_HEAD(&wq_head->head);
 }
 
 EXPORT_SYMBOL(__init_waitqueue_head);
@@ -68,7 +68,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
 {
 	wait_queue_entry_t *curr, *next;
 
-	list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) {
+	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
 
 		if (curr->func(curr, mode, wake_flags, key) &&
@@ -176,7 +176,7 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent
 
 	wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&wq_head->lock, flags);
-	if (list_empty(&wq_entry->task_list))
+	if (list_empty(&wq_entry->entry))
 		__add_wait_queue(wq_head, wq_entry);
 	set_current_state(state);
 	spin_unlock_irqrestore(&wq_head->lock, flags);
@@ -190,7 +190,7 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent
 
 	wq_entry->flags |= WQ_FLAG_EXCLUSIVE;
 	spin_lock_irqsave(&wq_head->lock, flags);
-	if (list_empty(&wq_entry->task_list))
+	if (list_empty(&wq_entry->entry))
 		__add_wait_queue_entry_tail(wq_head, wq_entry);
 	set_current_state(state);
 	spin_unlock_irqrestore(&wq_head->lock, flags);
@@ -202,7 +202,7 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags)
 	wq_entry->flags = flags;
 	wq_entry->private = current;
 	wq_entry->func = autoremove_wake_function;
-	INIT_LIST_HEAD(&wq_entry->task_list);
+	INIT_LIST_HEAD(&wq_entry->entry);
 }
 EXPORT_SYMBOL(init_wait_entry);
 
@@ -225,10 +225,10 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en
 		 * can't see us, it should wake up another exclusive waiter if
 		 * we fail.
 		 */
-		list_del_init(&wq_entry->task_list);
+		list_del_init(&wq_entry->entry);
 		ret = -ERESTARTSYS;
 	} else {
-		if (list_empty(&wq_entry->task_list)) {
+		if (list_empty(&wq_entry->entry)) {
 			if (wq_entry->flags & WQ_FLAG_EXCLUSIVE)
 				__add_wait_queue_entry_tail(wq_head, wq_entry);
 			else
@@ -251,7 +251,7 @@ EXPORT_SYMBOL(prepare_to_wait_event);
  */
 int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 {
-	if (likely(list_empty(&wait->task_list)))
+	if (likely(list_empty(&wait->entry)))
 		__add_wait_queue_entry_tail(wq, wait);
 
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -267,7 +267,7 @@ EXPORT_SYMBOL(do_wait_intr);
 
 int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
 {
-	if (likely(list_empty(&wait->task_list)))
+	if (likely(list_empty(&wait->entry)))
 		__add_wait_queue_entry_tail(wq, wait);
 
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -308,9 +308,9 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en
 	 *    have _one_ other CPU that looks at or modifies
 	 *    the list).
 	 */
-	if (!list_empty_careful(&wq_entry->task_list)) {
+	if (!list_empty_careful(&wq_entry->entry)) {
 		spin_lock_irqsave(&wq_head->lock, flags);
-		list_del_init(&wq_entry->task_list);
+		list_del_init(&wq_entry->entry);
 		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
@@ -321,7 +321,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
 	int ret = default_wake_function(wq_entry, mode, sync, key);
 
 	if (ret)
-		list_del_init(&wq_entry->task_list);
+		list_del_init(&wq_entry->entry);
 	return ret;
 }
 EXPORT_SYMBOL(autoremove_wake_function);
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index c891b34e1896..f8159698aa4d 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -205,8 +205,8 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		.wq_entry = {						\
 			.private	= current,			\
 			.func		= wake_atomic_t_function,	\
-			.task_list	=				\
-				LIST_HEAD_INIT((name).wq_entry.task_list), \
+			.entry		=				\
+				LIST_HEAD_INIT((name).wq_entry.entry),	\
 		},							\
 	}
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 80c19ee81e95..926484561624 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -845,7 +845,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	for (;;) {
 		spin_lock_irq(&q->lock);
 
-		if (likely(list_empty(&wait->task_list))) {
+		if (likely(list_empty(&wait->entry))) {
 			if (lock)
 				__add_wait_queue_entry_tail_exclusive(q, wait);
 			else
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a90b096dc6b..d75b38b66ef6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
+	INIT_LIST_HEAD(&owait.wait.entry);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	mem_cgroup_mark_under_oom(memcg);
diff --git a/mm/shmem.c b/mm/shmem.c
index a6c7dece4660..fdc413f82a99 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1905,7 +1905,7 @@ unlock:
 static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 {
 	int ret = default_wake_function(wait, mode, sync, key);
-	list_del_init(&wait->task_list);
+	list_del_init(&wait->entry);
 	return ret;
 }
 
@@ -2840,7 +2840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		spin_lock(&inode->i_lock);
 		inode->i_private = NULL;
 		wake_up_all(&shmem_falloc_waitq);
-		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
+		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
 		spin_unlock(&inode->i_lock);
 		error = 0;
 		goto out;
-- 
cgit v1.3-14-g43fede


From 6d3aed3d8a0573d0a6eb1160ccd0a0713f4dbc2f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 20 Jun 2017 12:24:42 +0200
Subject: sched/debug: Fix SCHED_WARN_ON() to return a value on
 !CONFIG_SCHED_DEBUG as well

This definition of SCHED_WARN_ON():

 #define SCHED_WARN_ON(x)        ((void)(x))

is not fully compatible with the 'real' WARN_ON_ONCE() primitive, as it
has no return value, so it cannot be used in conditionals.

Fix it.

Cc: Daniel Axtens <dja@axtens.net>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f2ef759a4cb6..e0329d10bdb8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -39,9 +39,9 @@
 #include "cpuacct.h"
 
 #ifdef CONFIG_SCHED_DEBUG
-#define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
 #else
-#define SCHED_WARN_ON(x)	((void)(x))
+# define SCHED_WARN_ON(x)	({ (void)(x), 0; })
 #endif
 
 struct rq;
-- 
cgit v1.3-14-g43fede


From c5ae366e12b2bd56fc7d7e9d484836bec9ddc110 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Thu, 11 May 2017 06:11:39 +1000
Subject: sched/fair: WARN() and refuse to set buddy when !se->on_rq

If we set a next or last buddy for a se that is not on_rq, we will
end up taking a NULL pointer dereference in wakeup_preempt_entity
via pick_next_task_fair.

Detect when we would be about to do that, throw a warning and
then refuse to actually set it.

This has been suggested at least twice:

  https://marc.info/?l=linux-kernel&m=146651668921468&w=2
  https://lkml.org/lkml/2016/6/16/663

I recently had to debug a problem with these (we hadn't backported
Konstantin's patches in this area) and this would have saved a lot
of time/pain.

Just do it.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Cc: Ben Segall <bsegall@google.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170510201139.16236-1-dja@axtens.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 396bca9c7996..cb3a3da7089f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6164,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se)
 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 		return;
 
-	for_each_sched_entity(se)
+	for_each_sched_entity(se) {
+		if (SCHED_WARN_ON(!se->on_rq))
+			return;
 		cfs_rq_of(se)->last = se;
+	}
 }
 
 static void set_next_buddy(struct sched_entity *se)
@@ -6173,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se)
 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
 		return;
 
-	for_each_sched_entity(se)
+	for_each_sched_entity(se) {
+		if (SCHED_WARN_ON(!se->on_rq))
+			return;
 		cfs_rq_of(se)->next = se;
+	}
 }
 
 static void set_skip_buddy(struct sched_entity *se)
-- 
cgit v1.3-14-g43fede


From f11cc0760b8397e0d230122606421b6a96e9f869 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 14 Jun 2017 19:37:30 -0700
Subject: sched/core: Drop the unused try_get_task_struct() helper function

This function was introduced by:

  150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()")

... to allow easier usage of task_rcu_dereference(), however no users
were ever added. Drop the helper.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/20170615023730.22827-1-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/task.h |  2 --
 kernel/exit.c              | 13 -------------
 2 files changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index a978d7189cfd..f0f065c5afcf 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -95,8 +95,6 @@ static inline void put_task_struct(struct task_struct *t)
 }
 
 struct task_struct *task_rcu_dereference(struct task_struct **ptask);
-struct task_struct *try_get_task_struct(struct task_struct **ptask);
-
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
 extern int arch_task_struct_size __read_mostly;
diff --git a/kernel/exit.c b/kernel/exit.c
index 7d694437ab44..c63226283aef 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w)
 	rcu_read_unlock();
 }
 
-struct task_struct *try_get_task_struct(struct task_struct **ptask)
-{
-	struct task_struct *task;
-
-	rcu_read_lock();
-	task = task_rcu_dereference(ptask);
-	if (task)
-		get_task_struct(task);
-	rcu_read_unlock();
-
-	return task;
-}
-
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52.  Orphaned process groups are not to be affected
-- 
cgit v1.3-14-g43fede


From d15bc69affc57d7985a01745ca28eafa0772325b Mon Sep 17 00:00:00 2001
From: Peter Meerwald-Stadler <pmeerw@pmeerw.net>
Date: Tue, 30 May 2017 21:41:03 +0200
Subject: timers: Fix parameter description of try_to_del_timer_sync()

Signed-off-by: Peter Meerwald-Stadler <pmeerw@pmeerw.net>
Link: http://lkml.kernel.org/r/20170530194103.7454-1-pmeerw@pmeerw.net
Cc: John Stultz <john.stultz@linaro.org>
Cc: trivial@rustcorp.com.au
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 152a706ef8b8..709a404bd133 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer);
 
 /**
  * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer do del
+ * @timer: timer to delete
  *
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
-- 
cgit v1.3-14-g43fede


From 35eb7258c009dc478338e674a5a84d25d0929c56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 17:37:35 +0200
Subject: itimer: Make timeval to nsec conversion range limited

The expiry time of an itimer is supplied through sys_setitimer() via a
struct timeval. The timeval is validated for correctness.

In the actual set timer implementation the timeval is converted to a
scalar nanoseconds value. If the tv_sec part of the time spec is large
enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit.

Mitigate that by using the timeval_to_ktime() conversion function, which
checks the tv_sec part for a potential mult overflow and clamps the result
to KTIME_MAX, which is about 292 years.

Reported-by: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170620154113.505981643@linutronix.de
---
 kernel/time/itimer.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 9dd7ff5e445a..2ef98a02376a 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -152,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
 	u64 oval, nval, ointerval, ninterval;
 	struct cpu_itimer *it = &tsk->signal->it[clock_id];
 
-	nval = timeval_to_ns(&value->it_value);
-	ninterval = timeval_to_ns(&value->it_interval);
+	/*
+	 * Use the to_ktime conversion because that clamps the maximum
+	 * value to KTIME_MAX and avoid multiplication overflows.
+	 */
+	nval = ktime_to_ns(timeval_to_ktime(value->it_value));
+	ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval));
 
 	spin_lock_irq(&tsk->sighand->siglock);
 
-- 
cgit v1.3-14-g43fede


From 098b0e01a91c42aaaf0425605cd126b03fcb0bcf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 17:37:36 +0200
Subject: posix-cpu-timers: Make timespec to nsec conversion safe

The expiry time of a posix cpu timer is supplied through sys_timer_set()
via a struct timespec. The timespec is validated for correctness.

In the actual set timer implementation the timespec is converted to a
scalar nanoseconds value. If the tv_sec part of the time spec is large
enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit.

Mitigate that by using the timespec_to_ktime() conversion function, which
checks the tv_sec part for a potential mult overflow and clamps the result
to KTIME_MAX, which is about 292 years.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170620154113.588276707@linutronix.de
---
 kernel/time/posix-cpu-timers.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 9df618ee64cf..60cb24ac9ebc 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -580,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 
 	WARN_ON_ONCE(p == NULL);
 
-	new_expires = timespec64_to_ns(&new->it_value);
+	/*
+	 * Use the to_ktime conversion because that clamps the maximum
+	 * value to KTIME_MAX and avoid multiplication overflows.
+	 */
+	new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value));
 
 	/*
 	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
-- 
cgit v1.3-14-g43fede


From fc6eead7c1e2e5376c25d2795d4539fdacbc0648 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Mon, 22 May 2017 17:20:20 -0700
Subject: time: Clean up CLOCK_MONOTONIC_RAW time handling

Now that we fixed the sub-ns handling for CLOCK_MONOTONIC_RAW,
remove the duplicative tk->raw_time.tv_nsec, which can be
stored in tk->tkr_raw.xtime_nsec (similarly to how it's handled
for monotonic time).

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Daniel Mentz <danielmentz@google.com>
Tested-by: Daniel Mentz <danielmentz@google.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 arch/arm64/kernel/vdso.c            |  6 ++---
 include/linux/timekeeper_internal.h |  4 ++--
 kernel/time/timekeeping.c           | 45 ++++++++++++++++++++-----------------
 3 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index d0cb007fa482..7492d9009610 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -220,10 +220,8 @@ void update_vsyscall(struct timekeeper *tk)
 	if (!use_syscall) {
 		/* tkr_mono.cycle_last == tkr_raw.cycle_last */
 		vdso_data->cs_cycle_last	= tk->tkr_mono.cycle_last;
-		vdso_data->raw_time_sec		= tk->raw_time.tv_sec;
-		vdso_data->raw_time_nsec	= (tk->raw_time.tv_nsec <<
-						   tk->tkr_raw.shift) +
-						  tk->tkr_raw.xtime_nsec;
+		vdso_data->raw_time_sec         = tk->raw_sec;
+		vdso_data->raw_time_nsec        = tk->tkr_raw.xtime_nsec;
 		vdso_data->xtime_clock_sec	= tk->xtime_sec;
 		vdso_data->xtime_clock_nsec	= tk->tkr_mono.xtime_nsec;
 		vdso_data->cs_mono_mult		= tk->tkr_mono.mult;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index f7043ccca81c..0a0a53daf2a2 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -51,7 +51,7 @@ struct tk_read_base {
  * @clock_was_set_seq:	The sequence number of clock was set events
  * @cs_was_changed_seq:	The sequence number of clocksource change events
  * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
- * @raw_time:		Monotonic raw base time in timespec64 format
+ * @raw_sec:		CLOCK_MONOTONIC_RAW  time in seconds
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
  *			interval.
@@ -93,7 +93,7 @@ struct timekeeper {
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
 	ktime_t			next_leap_ktime;
-	struct timespec64	raw_time;
+	u64			raw_sec;
 
 	/* The following members are for timekeeping internal use */
 	u64			cycle_interval;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b602c48cb841..0454bfa24353 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
 		tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
 		tk->xtime_sec++;
 	}
+	while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
+		tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+		tk->raw_sec++;
+	}
 }
 
 static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -285,12 +289,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	 /* if changing clocks, convert xtime_nsec shift units */
 	if (old_clock) {
 		int shift_change = clock->shift - old_clock->shift;
-		if (shift_change < 0)
+		if (shift_change < 0) {
 			tk->tkr_mono.xtime_nsec >>= -shift_change;
-		else
+			tk->tkr_raw.xtime_nsec >>= -shift_change;
+		} else {
 			tk->tkr_mono.xtime_nsec <<= shift_change;
+			tk->tkr_raw.xtime_nsec <<= shift_change;
+		}
 	}
-	tk->tkr_raw.xtime_nsec = 0;
 
 	tk->tkr_mono.shift = clock->shift;
 	tk->tkr_raw.shift = clock->shift;
@@ -619,9 +625,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
 	nsec = (u32) tk->wall_to_monotonic.tv_nsec;
 	tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
 
-	/* Update the monotonic raw base */
-	tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
-
 	/*
 	 * The sum of the nanoseconds portions of xtime and
 	 * wall_to_monotonic can be greater/equal one second. Take
@@ -631,6 +634,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
 	if (nsec >= NSEC_PER_SEC)
 		seconds++;
 	tk->ktime_sec = seconds;
+
+	/* Update the monotonic raw base */
+	seconds = tk->raw_sec;
+	nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
+	tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
 }
 
 /* must hold timekeeper_lock */
@@ -672,7 +680,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 static void timekeeping_forward_now(struct timekeeper *tk)
 {
 	u64 cycle_now, delta;
-	u64 nsec;
 
 	cycle_now = tk_clock_read(&tk->tkr_mono);
 	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
@@ -684,10 +691,13 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 	/* If arch requires, add in get_arch_timeoffset() */
 	tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
 
-	tk_normalize_xtime(tk);
 
-	nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
-	timespec64_add_ns(&tk->raw_time, nsec);
+	tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+
+	/* If arch requires, add in get_arch_timeoffset() */
+	tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
+
+	tk_normalize_xtime(tk);
 }
 
 /**
@@ -1373,19 +1383,18 @@ int timekeeping_notify(struct clocksource *clock)
 void getrawmonotonic64(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	struct timespec64 ts64;
 	unsigned long seq;
 	u64 nsecs;
 
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
+		ts->tv_sec = tk->raw_sec;
 		nsecs = timekeeping_get_ns(&tk->tkr_raw);
-		ts64 = tk->raw_time;
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	timespec64_add_ns(&ts64, nsecs);
-	*ts = ts64;
+	ts->tv_nsec = 0;
+	timespec64_add_ns(ts, nsecs);
 }
 EXPORT_SYMBOL(getrawmonotonic64);
 
@@ -1509,8 +1518,7 @@ void __init timekeeping_init(void)
 	tk_setup_internals(tk, clock);
 
 	tk_set_xtime(tk, &now);
-	tk->raw_time.tv_sec = 0;
-	tk->raw_time.tv_nsec = 0;
+	tk->raw_sec = 0;
 	if (boot.tv_sec == 0 && boot.tv_nsec == 0)
 		boot = tk_xtime(tk);
 
@@ -2011,15 +2019,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
 	*clock_set |= accumulate_nsecs_to_secs(tk);
 
 	/* Accumulate raw time */
-	tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
 	tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
 	snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
 	while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
 		tk->tkr_raw.xtime_nsec -= snsec_per_sec;
-		tk->raw_time.tv_sec++;
+		tk->raw_sec++;
 	}
-	tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
-	tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
 
 	/* Accumulate error between NTP and clock interval */
 	tk->ntp_error += tk->ntp_tick << shift;
-- 
cgit v1.3-14-g43fede


From 369adf04d80a7e179b9ea6d74cc01c233f142f47 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 12 May 2017 10:58:18 -0700
Subject: time: Add warning about imminent deprecation of
 CONFIG_GENERIC_TIME_VSYSCALL_OLD

CONFIG_GENERIC_TIME_VSYSCALL_OLD was introduced five years ago
to allow a transition from the old vsyscall implementations to
the new method (which simplified internal accounting and made
timekeeping more precise).

However, PPC and IA64 have yet to make the transition, despite
my sending test patches in some cases to try to help it along.

http://patches.linaro.org/patch/30501/
http://patches.linaro.org/patch/35412/

If it's helpful, my last pass at the patches can be found here:
https://git.linaro.org/people/john.stultz/linux.git dev/oldvsyscall-cleanup

So I think it's time to set a deadline and make it clear this
is going away. So this patch adds warnings about this
functionality being dropped. Likely to be in v4.15.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0454bfa24353..cedafa008de5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -516,6 +516,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
 }
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
+#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
 
 static inline void update_vsyscall(struct timekeeper *tk)
 {
-- 
cgit v1.3-14-g43fede


From f160203986a6ad23ab8077c4a25b260fe55d6e26 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Wed, 31 May 2017 18:06:58 +0200
Subject: irq/generic-chip: Export irq_init_generic_chip() locally

This function will be used in the devres variant of
irq_alloc_generic_chip().

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-doc@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Link: http://lkml.kernel.org/r/1496246820-13250-4-git-send-email-brgl@bgdev.pl
---
 kernel/irq/generic-chip.c |  7 +++----
 kernel/irq/internals.h    | 11 +++++++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index ee32870079c9..f7086b78ad6e 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr)
 	iowrite32be(val, addr);
 }
 
-static void
-irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
-		      int num_ct, unsigned int irq_base,
-		      void __iomem *reg_base, irq_flow_handler_t handler)
+void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+			   int num_ct, unsigned int irq_base,
+			   void __iomem *reg_base, irq_flow_handler_t handler)
 {
 	raw_spin_lock_init(&gc->lock);
 	gc->num_ct = num_ct;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bc226e783bd2..921a2419720c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -226,3 +226,14 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
 static inline void
 irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
 #endif
+
+#ifdef CONFIG_GENERIC_IRQ_CHIP
+void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+			   int num_ct, unsigned int irq_base,
+			   void __iomem *reg_base, irq_flow_handler_t handler);
+#else
+static inline void
+irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
+		      int num_ct, unsigned int irq_base,
+		      void __iomem *reg_base, irq_flow_handler_t handler) { }
+#endif /* CONFIG_GENERIC_IRQ_CHIP */
-- 
cgit v1.3-14-g43fede


From 1c3e36309fe2e94b8a889fa32cb5c871434f8ed6 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Wed, 31 May 2017 18:06:59 +0200
Subject: irq/generic-chip: Provide devm_irq_alloc_generic_chip()

Provide a resource managed variant of irq_alloc_generic_chip().

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-doc@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Link: http://lkml.kernel.org/r/1496246820-13250-5-git-send-email-brgl@bgdev.pl
---
 Documentation/driver-model/devres.txt |  1 +
 include/linux/irq.h                   |  5 +++++
 kernel/irq/devres.c                   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index e72587fe477d..d473be8c8781 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -311,6 +311,7 @@ IRQ
   devm_irq_alloc_desc_at()
   devm_irq_alloc_desc_from()
   devm_irq_alloc_descs_from()
+  devm_irq_alloc_generic_chip()
 
 LED
   devm_led_classdev_register()
diff --git a/include/linux/irq.h b/include/linux/irq.h
index dc63aa10ce70..64ae54673e08 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -958,6 +958,11 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type);
 void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			     unsigned int clr, unsigned int set);
 
+struct irq_chip_generic *
+devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
+			    unsigned int irq_base, void __iomem *reg_base,
+			    irq_flow_handler_t handler);
+
 struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq);
 
 int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1613bfd48365..21ee0aebccfb 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -4,6 +4,8 @@
 #include <linux/gfp.h>
 #include <linux/irq.h>
 
+#include "internals.h"
+
 /*
  * Device resource management aware IRQ request/free implementation.
  */
@@ -198,3 +200,35 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 	return base;
 }
 EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs);
+
+#ifdef CONFIG_GENERIC_IRQ_CHIP
+/**
+ * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip
+ *                               for a managed device
+ * @dev:	Device to allocate the generic chip for
+ * @name:	Name of the irq chip
+ * @num_ct:	Number of irq_chip_type instances associated with this
+ * @irq_base:	Interrupt base nr for this chip
+ * @reg_base:	Register base address (virtual)
+ * @handler:	Default flow handler associated with this chip
+ *
+ * Returns an initialized irq_chip_generic structure. The chip defaults
+ * to the primary (index 0) irq_chip_type and @handler
+ */
+struct irq_chip_generic *
+devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
+			    unsigned int irq_base, void __iomem *reg_base,
+			    irq_flow_handler_t handler)
+{
+	struct irq_chip_generic *gc;
+	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+
+	gc = devm_kzalloc(dev, sz, GFP_KERNEL);
+	if (gc)
+		irq_init_generic_chip(gc, name, num_ct,
+				      irq_base, reg_base, handler);
+
+	return gc;
+}
+EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip);
+#endif /* CONFIG_GENERIC_IRQ_CHIP */
-- 
cgit v1.3-14-g43fede


From 30fd8fc5c91973485705f83c7efe9588b8e6f371 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <brgl@bgdev.pl>
Date: Wed, 31 May 2017 18:07:00 +0200
Subject: irq/generic-chip: Provide devm_irq_setup_generic_chip()

Provide a resource managed variant of irq_setup_generic_chip().

Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-doc@vger.kernel.org
Cc: Jonathan Corbet <corbet@lwn.net>
Link: http://lkml.kernel.org/r/1496246820-13250-6-git-send-email-brgl@bgdev.pl
---
 Documentation/driver-model/devres.txt |  1 +
 include/linux/irq.h                   |  3 ++
 kernel/irq/devres.c                   | 52 +++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index d473be8c8781..6a6618f34440 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -312,6 +312,7 @@ IRQ
   devm_irq_alloc_desc_from()
   devm_irq_alloc_descs_from()
   devm_irq_alloc_generic_chip()
+  devm_irq_setup_generic_chip()
 
 LED
   devm_led_classdev_register()
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 64ae54673e08..d996314b6522 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -962,6 +962,9 @@ struct irq_chip_generic *
 devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
 			    unsigned int irq_base, void __iomem *reg_base,
 			    irq_flow_handler_t handler);
+int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
+				u32 msk, enum irq_gc_flags flags,
+				unsigned int clr, unsigned int set);
 
 struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq);
 
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 21ee0aebccfb..194c506d9d20 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -231,4 +231,56 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,
 	return gc;
 }
 EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip);
+
+struct irq_generic_chip_devres {
+	struct irq_chip_generic *gc;
+	u32 msk;
+	unsigned int clr;
+	unsigned int set;
+};
+
+static void devm_irq_remove_generic_chip(struct device *dev, void *res)
+{
+	struct irq_generic_chip_devres *this = res;
+
+	irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set);
+}
+
+/**
+ * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic
+ *                               chip for a managed device
+ *
+ * @dev:	Device to setup the generic chip for
+ * @gc:		Generic irq chip holding all data
+ * @msk:	Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @flags:	Flags for initialization
+ * @clr:	IRQ_* bits to clear
+ * @set:	IRQ_* bits to set
+ *
+ * Set up max. 32 interrupts starting from gc->irq_base. Note, this
+ * initializes all interrupts to the primary irq_chip_type and its
+ * associated handler.
+ */
+int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc,
+				u32 msk, enum irq_gc_flags flags,
+				unsigned int clr, unsigned int set)
+{
+	struct irq_generic_chip_devres *dr;
+
+	dr = devres_alloc(devm_irq_remove_generic_chip,
+			  sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return -ENOMEM;
+
+	irq_setup_generic_chip(gc, msk, flags, clr, set);
+
+	dr->gc = gc;
+	dr->msk = msk;
+	dr->clr = clr;
+	dr->set = set;
+	devres_add(dev, dr);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip);
 #endif /* CONFIG_GENERIC_IRQ_CHIP */
-- 
cgit v1.3-14-g43fede


From 3c85d6db5e5f05ae6c3d7f5a0ceceb43746a5ca7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 19 Jun 2017 04:12:00 +0200
Subject: sched/loadavg: Generalize "_idle" naming to "_nohz"

The naming in the loadavg code still assumes that nohz == idle, whereas the
code actually handles both nohz idle and nohz full well.

So let's fix the naming according to what the code actually does, to
unconfuse the reader.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1497838322-10913-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/trace/ftrace.txt |  2 +-
 include/linux/sched/nohz.h     |  8 +++----
 kernel/sched/loadavg.c         | 51 +++++++++++++++++++++---------------------
 kernel/time/tick-sched.c       |  4 ++--
 4 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 94a987bd2bc5..fff8ff6d4893 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set.
   <idle>-0       3dN.2   14us : sched_avg_update <-__cpu_load_update
   <idle>-0       3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
   <idle>-0       3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
-  <idle>-0       3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
+  <idle>-0       3dN.1   15us : calc_load_nohz_stop <-tick_nohz_idle_exit
   <idle>-0       3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
   <idle>-0       3dN.1   15us : hrtimer_cancel <-tick_nohz_idle_exit
   <idle>-0       3dN.1   15us : hrtimer_try_to_cancel <-hrtimer_cancel
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 4995b717500b..7d3f75db23e5 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -23,11 +23,11 @@ static inline void set_cpu_sd_state_idle(void) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_COMMON
-void calc_load_enter_idle(void);
-void calc_load_exit_idle(void);
+void calc_load_nohz_start(void);
+void calc_load_nohz_stop(void);
 #else
-static inline void calc_load_enter_idle(void) { }
-static inline void calc_load_exit_idle(void) { }
+static inline void calc_load_nohz_start(void) { }
+static inline void calc_load_nohz_stop(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index f15fb2bdbc0d..f14716a3522f 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
  * load-average relies on per-cpu sampling from the tick, it is affected by
  * NO_HZ.
  *
- * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
  * entering NO_HZ state such that we can include this as an 'extra' cpu delta
  * when we read the global state.
  *
@@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
  *  - When we go NO_HZ idle during the window, we can negate our sample
  *    contribution, causing under-accounting.
  *
- *    We avoid this by keeping two idle-delta counters and flipping them
+ *    We avoid this by keeping two NO_HZ-delta counters and flipping them
  *    when the window starts, thus separating old and new NO_HZ load.
  *
  *    The only trick is the slight shift in index flip for read vs write.
@@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
  *    r:0 0 1           1 0           0 1           1 0
  *    w:0 1 1           0 0           1 1           0 0
  *
- *    This ensures we'll fold the old idle contribution in this window while
+ *    This ensures we'll fold the old NO_HZ contribution in this window while
  *    accumlating the new one.
  *
- *  - When we wake up from NO_HZ idle during the window, we push up our
+ *  - When we wake up from NO_HZ during the window, we push up our
  *    contribution, since we effectively move our sample point to a known
  *    busy state.
  *
  *    This is solved by pushing the window forward, and thus skipping the
- *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
  *    was in effect at the time the window opened). This also solves the issue
- *    of having to deal with a cpu having been in NOHZ idle for multiple
- *    LOAD_FREQ intervals.
+ *    of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
+ *    intervals.
  *
  * When making the ILB scale, we should try to pull this in as well.
  */
-static atomic_long_t calc_load_idle[2];
+static atomic_long_t calc_load_nohz[2];
 static int calc_load_idx;
 
 static inline int calc_load_write_idx(void)
@@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void)
 
 	/*
 	 * If the folding window started, make sure we start writing in the
-	 * next idle-delta.
+	 * next NO_HZ-delta.
 	 */
 	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
 		idx++;
@@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void)
 	return calc_load_idx & 1;
 }
 
-void calc_load_enter_idle(void)
+void calc_load_nohz_start(void)
 {
 	struct rq *this_rq = this_rq();
 	long delta;
 
 	/*
-	 * We're going into NOHZ mode, if there's any pending delta, fold it
-	 * into the pending idle delta.
+	 * We're going into NO_HZ mode, if there's any pending delta, fold it
+	 * into the pending NO_HZ delta.
 	 */
 	delta = calc_load_fold_active(this_rq, 0);
 	if (delta) {
 		int idx = calc_load_write_idx();
 
-		atomic_long_add(delta, &calc_load_idle[idx]);
+		atomic_long_add(delta, &calc_load_nohz[idx]);
 	}
 }
 
-void calc_load_exit_idle(void)
+void calc_load_nohz_stop(void)
 {
 	struct rq *this_rq = this_rq();
 
@@ -217,13 +217,13 @@ void calc_load_exit_idle(void)
 		this_rq->calc_load_update += LOAD_FREQ;
 }
 
-static long calc_load_fold_idle(void)
+static long calc_load_nohz_fold(void)
 {
 	int idx = calc_load_read_idx();
 	long delta = 0;
 
-	if (atomic_long_read(&calc_load_idle[idx]))
-		delta = atomic_long_xchg(&calc_load_idle[idx], 0);
+	if (atomic_long_read(&calc_load_nohz[idx]))
+		delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
 
 	return delta;
 }
@@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp,
 
 /*
  * NO_HZ can leave us missing all per-cpu ticks calling
- * calc_load_account_active(), but since an idle CPU folds its delta into
- * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
- * in the pending idle delta if our idle period crossed a load cycle boundary.
+ * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
+ * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
+ * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
  *
  * Once we've updated the global active value, we need to apply the exponential
  * weights adjusted to the number of cycles missed.
@@ -330,7 +330,7 @@ static void calc_global_nohz(void)
 	}
 
 	/*
-	 * Flip the idle index...
+	 * Flip the NO_HZ index...
 	 *
 	 * Make sure we first write the new time then flip the index, so that
 	 * calc_load_write_idx() will see the new time when it reads the new
@@ -341,7 +341,7 @@ static void calc_global_nohz(void)
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 
-static inline long calc_load_fold_idle(void) { return 0; }
+static inline long calc_load_nohz_fold(void) { return 0; }
 static inline void calc_global_nohz(void) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks)
 		return;
 
 	/*
-	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
+	 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
 	 */
-	delta = calc_load_fold_idle();
+	delta = calc_load_nohz_fold();
 	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
 
@@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks)
 	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
 
 	/*
-	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
+	 * In case we went to NO_HZ for multiple LOAD_FREQ intervals
+	 * catch up in bulk.
 	 */
 	calc_global_nohz();
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9c2dc64e31d8..b1b58a07e042 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -783,7 +783,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 */
 	if (!ts->tick_stopped) {
 		nohz_balance_enter_idle(cpu);
-		calc_load_enter_idle();
+		calc_load_nohz_start();
 		cpu_load_update_nohz_start();
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -823,7 +823,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 	 */
 	timer_clear_idle();
 
-	calc_load_exit_idle();
+	calc_load_nohz_stop();
 	touch_softlockup_watchdog_sched();
 	/*
 	 * Cancel the scheduled timer and restore the tick
-- 
cgit v1.3-14-g43fede


From a0db971e4eb69fc84eb3d7ef94f718b483550b4a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 19 Jun 2017 04:12:01 +0200
Subject: nohz: Move idle balancer registration to the idle path

The idle load balancing registration path assumes that we only stop the
tick when the CPU is idle, ignoring the nohz full case. As a result, a
nohz full CPU that is running a task may be chosen to perform idle load
balancing.

Let's make sure that only CPUs in dynticks idle mode can be picked as
idle load balancers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1497838322-10913-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b1b58a07e042..db023e9cbb25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 * the scheduler tick in nohz_restart_sched_tick.
 	 */
 	if (!ts->tick_stopped) {
-		nohz_balance_enter_idle(cpu);
 		calc_load_nohz_start();
 		cpu_load_update_nohz_start();
 
@@ -923,8 +922,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 			ts->idle_expires = expires;
 		}
 
-		if (!was_stopped && ts->tick_stopped)
+		if (!was_stopped && ts->tick_stopped) {
 			ts->idle_jiffies = ts->last_jiffies;
+			nohz_balance_enter_idle(cpu);
+		}
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 387bc8b5536eeb0a92f4b4ab553539eaea2ac0ba Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 19 Jun 2017 04:12:02 +0200
Subject: sched/fair: Spare idle load balancing on nohz_full CPUs

Although idle load balancing obviously only concerns idle CPUs, it can
be a disturbance on a busy nohz_full CPU. Indeed a CPU can only get rid
of an idle load balancing duty once a tick fires while it runs a task
and this can take a while on a nohz_full CPU.

We could fix that and escape the idle load balancing duty from the very
idle exit path but that would bring unnecessary overhead. Let's just not
bother and leave that job to housekeeping CPUs (those outside nohz_full
range). The nohz_full CPUs simply don't want any disturbance.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1497838322-10913-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a24661ac3d23..694c258b8771 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8683,6 +8683,10 @@ void nohz_balance_enter_idle(int cpu)
 	if (!cpu_active(cpu))
 		return;
 
+	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
+	if (!is_housekeeping_cpu(cpu))
+		return;
+
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
 		return;
 
-- 
cgit v1.3-14-g43fede


From 0165308a2f994939d2e1b36624f5a8f57746bc88 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:04 +0200
Subject: genirq/msi: Prevent overwriting domain name

Prevent overwriting an already assigned domain name. Remove the extra check
for chip->name, because if domain->name is NULL overwriting it with NULL is
not a problem.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235443.510684976@linutronix.de
---
 kernel/irq/msi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index fe4d48ec5bc4..9e3f1857c6bd 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -274,7 +274,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
 
 	domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
 					     fwnode, &msi_domain_ops, info);
-	if (domain && info->chip && info->chip->name)
+
+	if (domain && !domain->name && info->chip)
 		domain->name = info->chip->name;
 
 	return domain;
-- 
cgit v1.3-14-g43fede


From d59f6617eef0f76e34f7a9993f5645c5ef467e42 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:05 +0200
Subject: genirq: Allow fwnode to carry name information only

In order to provide a proper debug interface it's required to have domain
names available when the domain is added. Non fwnode based architectures
like x86 have no way to do so.

It's not possible to use domain ops or host data for this as domain ops
might be the same for several instances, but the names have to be unique.

Extend the irqchip fwnode to allow transporting the domain name. If no node
is supplied, create an 'unknown-N' placeholder.

Warn if an invalid node is supplied and treat it like no node. This happens
e.g. with i2c devices on x86 which hand in an ACPI type node which has no
interface for retrieving the name.

[ Folded a fix from Marc to make DT name parsing work ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235443.588784933@linutronix.de
---
 include/linux/irqdomain.h |  31 +++++++++++++-
 kernel/irq/irqdomain.c    | 105 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 122 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9f3616085423..9cf32a2fbe69 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -189,6 +189,9 @@ enum {
 	/* Irq domain implements MSI remapping */
 	IRQ_DOMAIN_FLAG_MSI_REMAP	= (1 << 5),
 
+	/* Irq domain name was allocated in __irq_domain_add() */
+	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 6),
+
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
@@ -203,7 +206,33 @@ static inline struct device_node *irq_domain_get_of_node(struct irq_domain *d)
 }
 
 #ifdef CONFIG_IRQ_DOMAIN
-struct fwnode_handle *irq_domain_alloc_fwnode(void *data);
+struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
+						const char *name, void *data);
+
+enum {
+	IRQCHIP_FWNODE_REAL,
+	IRQCHIP_FWNODE_NAMED,
+	IRQCHIP_FWNODE_NAMED_ID,
+};
+
+static inline
+struct fwnode_handle *irq_domain_alloc_named_fwnode(const char *name)
+{
+	return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED, 0, name, NULL);
+}
+
+static inline
+struct fwnode_handle *irq_domain_alloc_named_id_fwnode(const char *name, int id)
+{
+	return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED_ID, id, name,
+					 NULL);
+}
+
+static inline struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
+{
+	return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_REAL, 0, NULL, data);
+}
+
 void irq_domain_free_fwnode(struct fwnode_handle *fwnode);
 struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 				    irq_hw_number_t hwirq_max, int direct_max,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 70b9da72018b..e1b925bea205 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -26,39 +26,61 @@ static struct irq_domain *irq_default_domain;
 static void irq_domain_check_hierarchy(struct irq_domain *domain);
 
 struct irqchip_fwid {
-	struct fwnode_handle fwnode;
-	char *name;
-	void *data;
+	struct fwnode_handle	fwnode;
+	unsigned int		type;
+	char			*name;
+	void			*data;
 };
 
 /**
  * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
  *                           identifying an irq domain
- * @data: optional user-provided data
+ * @type:	Type of irqchip_fwnode. See linux/irqdomain.h
+ * @name:	Optional user provided domain name
+ * @id:		Optional user provided id if name != NULL
+ * @data:	Optional user-provided data
  *
- * Allocate a struct device_node, and return a poiner to the embedded
+ * Allocate a struct irqchip_fwid, and return a poiner to the embedded
  * fwnode_handle (or NULL on failure).
+ *
+ * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are
+ * solely to transport name information to irqdomain creation code. The
+ * node is not stored. For other types the pointer is kept in the irq
+ * domain struct.
  */
-struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
+struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
+						const char *name, void *data)
 {
 	struct irqchip_fwid *fwid;
-	char *name;
+	char *n;
 
 	fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
-	name = kasprintf(GFP_KERNEL, "irqchip@%p", data);
 
-	if (!fwid || !name) {
+	switch (type) {
+	case IRQCHIP_FWNODE_NAMED:
+		n = kasprintf(GFP_KERNEL, "%s", name);
+		break;
+	case IRQCHIP_FWNODE_NAMED_ID:
+		n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
+		break;
+	default:
+		n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+		break;
+	}
+
+	if (!fwid || !n) {
 		kfree(fwid);
-		kfree(name);
+		kfree(n);
 		return NULL;
 	}
 
-	fwid->name = name;
+	fwid->type = type;
+	fwid->name = n;
 	fwid->data = data;
 	fwid->fwnode.type = FWNODE_IRQCHIP;
 	return &fwid->fwnode;
 }
-EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode);
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode);
 
 /**
  * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
@@ -97,20 +119,75 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 				    void *host_data)
 {
 	struct device_node *of_node = to_of_node(fwnode);
+	struct irqchip_fwid *fwid;
 	struct irq_domain *domain;
 
+	static atomic_t unknown_domains;
+
 	domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
 			      GFP_KERNEL, of_node_to_nid(of_node));
 	if (WARN_ON(!domain))
 		return NULL;
 
+	if (fwnode && is_fwnode_irqchip(fwnode)) {
+		fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+
+		switch (fwid->type) {
+		case IRQCHIP_FWNODE_NAMED:
+		case IRQCHIP_FWNODE_NAMED_ID:
+			domain->name = kstrdup(fwid->name, GFP_KERNEL);
+			if (!domain->name) {
+				kfree(domain);
+				return NULL;
+			}
+			domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+			break;
+		default:
+			domain->fwnode = fwnode;
+			domain->name = fwid->name;
+			break;
+		}
+	} else if (of_node) {
+		char *name;
+
+		/*
+		 * DT paths contain '/', which debugfs is legitimately
+		 * unhappy about. Replace them with ':', which does
+		 * the trick and is not as offensive as '\'...
+		 */
+		name = kstrdup(of_node_full_name(of_node), GFP_KERNEL);
+		if (!name) {
+			kfree(domain);
+			return NULL;
+		}
+
+		strreplace(name, '/', ':');
+
+		domain->name = name;
+		domain->fwnode = fwnode;
+		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+	}
+
+	if (!domain->name) {
+		if (fwnode) {
+			pr_err("Invalid fwnode type (%d) for irqdomain\n",
+			       fwnode->type);
+		}
+		domain->name = kasprintf(GFP_KERNEL, "unknown-%d",
+					 atomic_inc_return(&unknown_domains));
+		if (!domain->name) {
+			kfree(domain);
+			return NULL;
+		}
+		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+	}
+
 	of_node_get(of_node);
 
 	/* Fill structure */
 	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
 	domain->ops = ops;
 	domain->host_data = host_data;
-	domain->fwnode = fwnode;
 	domain->hwirq_max = hwirq_max;
 	domain->revmap_size = size;
 	domain->revmap_direct_max_irq = direct_max;
@@ -152,6 +229,8 @@ void irq_domain_remove(struct irq_domain *domain)
 	pr_debug("Removed domain %s\n", domain->name);
 
 	of_node_put(irq_domain_get_of_node(domain));
+	if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+		kfree(domain->name);
 	kfree(domain);
 }
 EXPORT_SYMBOL_GPL(irq_domain_remove);
-- 
cgit v1.3-14-g43fede


From 9dc6be3d419398eae9a19cd09b7969ceff8eaf10 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:16 +0200
Subject: genirq/irqdomain: Add map counter

Add a map counter instead of counting radix tree entries for
diagnosis. That also gives correct information for linear domains.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235444.459397746@linutronix.de
---
 include/linux/irqdomain.h | 2 ++
 kernel/irq/irqdomain.c    | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9cf32a2fbe69..17ccd54d936d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -130,6 +130,7 @@ struct irq_domain_chip_generic;
  * @host_data: private data pointer for use by owner.  Not touched by irq_domain
  *             core code.
  * @flags: host per irq_domain flags
+ * @mapcount: The number of mapped interrupts
  *
  * Optional elements
  * @of_node: Pointer to device tree nodes associated with the irq_domain. Used
@@ -152,6 +153,7 @@ struct irq_domain {
 	const struct irq_domain_ops *ops;
 	void *host_data;
 	unsigned int flags;
+	unsigned int mapcount;
 
 	/* Optional data */
 	struct fwnode_handle *fwnode;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e1b925bea205..8d5805c655b6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -423,6 +423,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
 
 	irq_data->domain = NULL;
 	irq_data->hwirq = 0;
+	domain->mapcount--;
 
 	/* Clear reverse map for this hwirq */
 	if (hwirq < domain->revmap_size) {
@@ -474,6 +475,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
 			domain->name = irq_data->chip->name;
 	}
 
+	domain->mapcount++;
 	if (hwirq < domain->revmap_size) {
 		domain->linear_revmap[hwirq] = virq;
 	} else {
@@ -1081,6 +1083,7 @@ static void irq_domain_insert_irq(int virq)
 		struct irq_domain *domain = data->domain;
 		irq_hw_number_t hwirq = data->hwirq;
 
+		domain->mapcount++;
 		if (hwirq < domain->revmap_size) {
 			domain->linear_revmap[hwirq] = virq;
 		} else {
@@ -1110,6 +1113,7 @@ static void irq_domain_remove_irq(int virq)
 		struct irq_domain *domain = data->domain;
 		irq_hw_number_t hwirq = data->hwirq;
 
+		domain->mapcount--;
 		if (hwirq < domain->revmap_size) {
 			domain->linear_revmap[hwirq] = 0;
 		} else {
-- 
cgit v1.3-14-g43fede


From 087cdfb662ae50e3826e7cd2e54b6519d07b60f0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:17 +0200
Subject: genirq/debugfs: Add proper debugfs interface

Debugging (hierarchical) interrupt domains is tedious as there is no
information about the hierarchy and no information about states of
interrupts in the various domain levels.

Add a debugfs directory 'irq' and subdirectories 'domains' and 'irqs'.

The domains directory contains the domain files. The content is information
about the domain. If the domain is part of a hierarchy then the parent
domains are printed as well.

# ls /sys/kernel/debug/irq/domains/
default     INTEL-IR-2	    INTEL-IR-MSI-2  IO-APIC-IR-2  PCI-MSI
DMAR-MSI    INTEL-IR-3	    INTEL-IR-MSI-3  IO-APIC-IR-3  unknown-1
INTEL-IR-0  INTEL-IR-MSI-0  IO-APIC-IR-0    IO-APIC-IR-4  VECTOR
INTEL-IR-1  INTEL-IR-MSI-1  IO-APIC-IR-1    PCI-HT

# cat /sys/kernel/debug/irq/domains/VECTOR
name:   VECTOR
 size:   0
 mapped: 216
 flags:  0x00000041

# cat /sys/kernel/debug/irq/domains/IO-APIC-IR-0
name:   IO-APIC-IR-0
 size:   24
 mapped: 19
 flags:  0x00000041
 parent: INTEL-IR-3
    name:   INTEL-IR-3
     size:   65536
     mapped: 167
     flags:  0x00000041
     parent: VECTOR
        name:   VECTOR
         size:   0
         mapped: 216
         flags:  0x00000041

Unfortunately there is no per cpu information about the VECTOR domain (yet).

The irqs directory contains detailed information about mapped interrupts.

# cat /sys/kernel/debug/irq/irqs/3
handler:  handle_edge_irq
status:   0x00004000
istate:   0x00000000
ddepth:   1
wdepth:   0
dstate:   0x01018000
            IRQD_IRQ_DISABLED
            IRQD_SINGLE_TARGET
            IRQD_MOVE_PCNTXT
node:     0
affinity: 0-143
effectiv: 0
pending:
domain:  IO-APIC-IR-0
 hwirq:   0x3
 chip:    IR-IO-APIC
  flags:   0x10
             IRQCHIP_SKIP_SET_WAKE
 parent:
    domain:  INTEL-IR-3
     hwirq:   0x20000
     chip:    INTEL-IR
      flags:   0x0
     parent:
        domain:  VECTOR
         hwirq:   0x3
         chip:    APIC
          flags:   0x0

This was developed to simplify the debugging of the managed affinity
changes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235444.537566163@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdesc.h   |   4 +
 include/linux/irqdomain.h |   4 +
 kernel/irq/Kconfig        |  11 +++
 kernel/irq/Makefile       |   1 +
 kernel/irq/debugfs.c      | 215 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/internals.h    |  22 +++++
 kernel/irq/irqdesc.c      |   1 +
 kernel/irq/irqdomain.c    |  87 ++++++++++++++++++-
 kernel/irq/manage.c       |   1 +
 9 files changed, 345 insertions(+), 1 deletion(-)
 create mode 100644 kernel/irq/debugfs.c

(limited to 'kernel')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index c9be57931b58..d425a3a09722 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -46,6 +46,7 @@ struct pt_regs;
  * @rcu:		rcu head for delayed free
  * @kobj:		kobject used to represent this struct in sysfs
  * @dir:		/proc/irq/ procfs entry
+ * @debugfs_file:	dentry for the debugfs file
  * @name:		flow handler name for /proc/interrupts output
  */
 struct irq_desc {
@@ -88,6 +89,9 @@ struct irq_desc {
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*dir;
 #endif
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+	struct dentry		*debugfs_file;
+#endif
 #ifdef CONFIG_SPARSE_IRQ
 	struct rcu_head		rcu;
 	struct kobject		kobj;
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 17ccd54d936d..914b0c31d233 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -139,6 +139,7 @@ struct irq_domain_chip_generic;
  *      setting up one or more generic chips for interrupt controllers
  *      drivers using the generic chip library which uses this pointer.
  * @parent: Pointer to parent irq_domain to support hierarchy irq_domains
+ * @debugfs_file: dentry for the domain debugfs file
  *
  * Revmap data, used internally by irq_domain
  * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
@@ -162,6 +163,9 @@ struct irq_domain {
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	struct irq_domain *parent;
 #endif
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+	struct dentry		*debugfs_file;
+#endif
 
 	/* reverse map data. The linear map gets appended to the irq_domain */
 	irq_hw_number_t hwirq_max;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3bbfd6a9c475..8d9498e51585 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -108,4 +108,15 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say N.
 
+config GENERIC_IRQ_DEBUGFS
+	bool "Expose irq internals in debugfs"
+	depends on DEBUG_FS
+	default n
+	---help---
+
+	  Exposes internal state information through debugfs. Mostly for
+	  developers and debugging of hard to diagnose interrupt problems.
+
+	  If you don't know what to do here, say N.
+
 endmenu
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 1d3ee3169202..c61fc9c2d1f7 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_PM_SLEEP) += pm.o
 obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
 obj-$(CONFIG_SMP) += affinity.o
+obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
new file mode 100644
index 000000000000..50ee2f6593e8
--- /dev/null
+++ b/kernel/irq/debugfs.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2017 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * This file is licensed under the GPL V2.
+ */
+#include <linux/debugfs.h>
+#include <linux/irqdomain.h>
+#include <linux/irq.h>
+
+#include "internals.h"
+
+static struct dentry *irq_dir;
+
+struct irq_bit_descr {
+	unsigned int	mask;
+	char		*name;
+};
+#define BIT_MASK_DESCR(m)	{ .mask = m, .name = #m }
+
+static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
+				const struct irq_bit_descr *sd, int size)
+{
+	int i;
+
+	for (i = 0; i < size; i++, sd++) {
+		if (state & sd->mask)
+			seq_printf(m, "%*s%s\n", ind + 12, "", sd->name);
+	}
+}
+
+#ifdef CONFIG_SMP
+static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
+{
+	struct irq_data *data = irq_desc_get_irq_data(desc);
+	struct cpumask *msk;
+
+	msk = irq_data_get_affinity_mask(data);
+	seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	msk = desc->pending_mask;
+	seq_printf(m, "pending:  %*pbl\n", cpumask_pr_args(msk));
+#endif
+}
+#else
+static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { }
+#endif
+
+static const struct irq_bit_descr irqchip_flags[] = {
+	BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED),
+	BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED),
+	BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND),
+	BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED),
+	BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE),
+	BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE),
+	BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
+};
+
+static void
+irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
+{
+	struct irq_chip *chip = data->chip;
+
+	if (!chip) {
+		seq_printf(m, "chip: None\n");
+		return;
+	}
+	seq_printf(m, "%*schip:    %s\n", ind, "", chip->name);
+	seq_printf(m, "%*sflags:   0x%lx\n", ind + 1, "", chip->flags);
+	irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
+			    ARRAY_SIZE(irqchip_flags));
+}
+
+static void
+irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
+{
+	seq_printf(m, "%*sdomain:  %s\n", ind, "",
+		   data->domain ? data->domain->name : "");
+	seq_printf(m, "%*shwirq:   0x%lx\n", ind + 1, "", data->hwirq);
+	irq_debug_show_chip(m, data, ind + 1);
+#ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
+	if (!data->parent_data)
+		return;
+	seq_printf(m, "%*sparent:\n", ind + 1, "");
+	irq_debug_show_data(m, data->parent_data, ind + 4);
+#endif
+}
+
+static const struct irq_bit_descr irqdata_states[] = {
+	BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING),
+	BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING),
+	BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH),
+	BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW),
+	BIT_MASK_DESCR(IRQD_LEVEL),
+
+	BIT_MASK_DESCR(IRQD_ACTIVATED),
+	BIT_MASK_DESCR(IRQD_IRQ_STARTED),
+	BIT_MASK_DESCR(IRQD_IRQ_DISABLED),
+	BIT_MASK_DESCR(IRQD_IRQ_MASKED),
+	BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS),
+
+	BIT_MASK_DESCR(IRQD_PER_CPU),
+	BIT_MASK_DESCR(IRQD_NO_BALANCING),
+
+	BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
+	BIT_MASK_DESCR(IRQD_AFFINITY_SET),
+	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
+	BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+
+	BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
+
+	BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
+	BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+};
+
+static const struct irq_bit_descr irqdesc_states[] = {
+	BIT_MASK_DESCR(_IRQ_NOPROBE),
+	BIT_MASK_DESCR(_IRQ_NOREQUEST),
+	BIT_MASK_DESCR(_IRQ_NOTHREAD),
+	BIT_MASK_DESCR(_IRQ_NOAUTOEN),
+	BIT_MASK_DESCR(_IRQ_NESTED_THREAD),
+	BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
+	BIT_MASK_DESCR(_IRQ_IS_POLLED),
+	BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+};
+
+static const struct irq_bit_descr irqdesc_istates[] = {
+	BIT_MASK_DESCR(IRQS_AUTODETECT),
+	BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED),
+	BIT_MASK_DESCR(IRQS_POLL_INPROGRESS),
+	BIT_MASK_DESCR(IRQS_ONESHOT),
+	BIT_MASK_DESCR(IRQS_REPLAY),
+	BIT_MASK_DESCR(IRQS_WAITING),
+	BIT_MASK_DESCR(IRQS_PENDING),
+	BIT_MASK_DESCR(IRQS_SUSPENDED),
+};
+
+
+static int irq_debug_show(struct seq_file *m, void *p)
+{
+	struct irq_desc *desc = m->private;
+	struct irq_data *data;
+
+	raw_spin_lock_irq(&desc->lock);
+	data = irq_desc_get_irq_data(desc);
+	seq_printf(m, "handler:  %pf\n", desc->handle_irq);
+	seq_printf(m, "status:   0x%08x\n", desc->status_use_accessors);
+	irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
+			    ARRAY_SIZE(irqdesc_states));
+	seq_printf(m, "istate:   0x%08x\n", desc->istate);
+	irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates,
+			    ARRAY_SIZE(irqdesc_istates));
+	seq_printf(m, "ddepth:   %u\n", desc->depth);
+	seq_printf(m, "wdepth:   %u\n", desc->wake_depth);
+	seq_printf(m, "dstate:   0x%08x\n", irqd_get(data));
+	irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states,
+			    ARRAY_SIZE(irqdata_states));
+	seq_printf(m, "node:     %d\n", irq_data_get_node(data));
+	irq_debug_show_masks(m, desc);
+	irq_debug_show_data(m, data, 0);
+	raw_spin_unlock_irq(&desc->lock);
+	return 0;
+}
+
+static int irq_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_irq_ops = {
+	.open		= irq_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
+{
+	char name [10];
+
+	if (!irq_dir || !desc || desc->debugfs_file)
+		return;
+
+	sprintf(name, "%d", irq);
+	desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc,
+						 &dfs_irq_ops);
+}
+
+void irq_remove_debugfs_entry(struct irq_desc *desc)
+{
+	if (desc->debugfs_file)
+		debugfs_remove(desc->debugfs_file);
+}
+
+static int __init irq_debugfs_init(void)
+{
+	struct dentry *root_dir;
+	int irq;
+
+	root_dir = debugfs_create_dir("irq", NULL);
+	if (!root_dir)
+		return -ENOMEM;
+
+	irq_domain_debugfs_init(root_dir);
+
+	irq_dir = debugfs_create_dir("irqs", root_dir);
+
+	irq_lock_sparse();
+	for_each_active_irq(irq)
+		irq_add_debugfs_entry(irq, irq_to_desc(irq));
+	irq_unlock_sparse();
+
+	return 0;
+}
+__initcall(irq_debugfs_init);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 921a2419720c..094db5bfb83f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -169,6 +169,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
 
+static inline unsigned int irqd_get(struct irq_data *d)
+{
+	return __irqd_to_state(d);
+}
+
 /*
  * Manipulation functions for irq_data.state
  */
@@ -237,3 +242,20 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
 		      int num_ct, unsigned int irq_base,
 		      void __iomem *reg_base, irq_flow_handler_t handler) { }
 #endif /* CONFIG_GENERIC_IRQ_CHIP */
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
+void irq_remove_debugfs_entry(struct irq_desc *desc);
+# ifdef CONFIG_IRQ_DOMAIN
+void irq_domain_debugfs_init(struct dentry *root);
+# else
+static inline void irq_domain_debugfs_init(struct dentry *root);
+# endif
+#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
+static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
+{
+}
+static inline void irq_remove_debugfs_entry(struct irq_desc *d)
+{
+}
+#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 09abce2ea8f0..feade536b6d1 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -394,6 +394,7 @@ static void free_desc(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
+	irq_remove_debugfs_entry(desc);
 	unregister_irq_proc(irq, desc);
 
 	/*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8d5805c655b6..75e1f0851c33 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -29,9 +29,17 @@ struct irqchip_fwid {
 	struct fwnode_handle	fwnode;
 	unsigned int		type;
 	char			*name;
-	void			*data;
+	void *data;
 };
 
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void debugfs_add_domain_dir(struct irq_domain *d);
+static void debugfs_remove_domain_dir(struct irq_domain *d);
+#else
+static inline void debugfs_add_domain_dir(struct irq_domain *d) { }
+static inline void debugfs_remove_domain_dir(struct irq_domain *d) { }
+#endif
+
 /**
  * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
  *                           identifying an irq domain
@@ -194,6 +202,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 	irq_domain_check_hierarchy(domain);
 
 	mutex_lock(&irq_domain_mutex);
+	debugfs_add_domain_dir(domain);
 	list_add(&domain->link, &irq_domain_list);
 	mutex_unlock(&irq_domain_mutex);
 
@@ -213,6 +222,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_add);
 void irq_domain_remove(struct irq_domain *domain)
 {
 	mutex_lock(&irq_domain_mutex);
+	debugfs_remove_domain_dir(domain);
 
 	WARN_ON(!radix_tree_empty(&domain->revmap_tree));
 
@@ -1599,3 +1609,78 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)
 {
 }
 #endif	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static struct dentry *domain_dir;
+
+static void
+irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
+{
+	seq_printf(m, "%*sname:   %s\n", ind, "", d->name);
+	seq_printf(m, "%*ssize:   %u\n", ind + 1, "",
+		   d->revmap_size + d->revmap_direct_max_irq);
+	seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
+	seq_printf(m, "%*sflags:  0x%08x\n", ind +1 , "", d->flags);
+#ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
+	if (!d->parent)
+		return;
+	seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name);
+	irq_domain_debug_show_one(m, d->parent, ind + 4);
+#endif
+}
+
+static int irq_domain_debug_show(struct seq_file *m, void *p)
+{
+	struct irq_domain *d = m->private;
+
+	/* Default domain? Might be NULL */
+	if (!d) {
+		if (!irq_default_domain)
+			return 0;
+		d = irq_default_domain;
+	}
+	irq_domain_debug_show_one(m, d, 0);
+	return 0;
+}
+
+static int irq_domain_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_domain_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_domain_ops = {
+	.open		= irq_domain_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static void debugfs_add_domain_dir(struct irq_domain *d)
+{
+	if (!d->name || !domain_dir || d->debugfs_file)
+		return;
+	d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
+					      &dfs_domain_ops);
+}
+
+static void debugfs_remove_domain_dir(struct irq_domain *d)
+{
+	if (d->debugfs_file)
+		debugfs_remove(d->debugfs_file);
+}
+
+void __init irq_domain_debugfs_init(struct dentry *root)
+{
+	struct irq_domain *d;
+
+	domain_dir = debugfs_create_dir("domains", root);
+	if (!domain_dir)
+		return;
+
+	debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops);
+	mutex_lock(&irq_domain_mutex);
+	list_for_each_entry(d, &irq_domain_list, link)
+		debugfs_add_domain_dir(d);
+	mutex_unlock(&irq_domain_mutex);
+}
+#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c34696ca575..284f4eb1ffbe 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1398,6 +1398,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		wake_up_process(new->secondary->thread);
 
 	register_irq_proc(irq, desc);
+	irq_add_debugfs_entry(irq, desc);
 	new->dir = NULL;
 	register_handler_proc(irq, new);
 	free_cpumask_var(mask);
-- 
cgit v1.3-14-g43fede


From cdd16365b0bd7c0cd19e2cc768b6bdc8021f32c3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:19 +0200
Subject: genirq: Provide irq_fixup_move_pending()

If a CPU goes offline, the interrupts are migrated away, but a possibly
pending interrupt move which has not yet been made effective is kept
pending even if the outgoing CPU is the sole target of the pending affinity
mask. What's worse is that the pending affinity mask is discarded even if
it would contain a valid subset of the online CPUs.

Implement a helper function which allows avoiding these issues.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235444.691345468@linutronix.de
---
 include/linux/irq.h    |  5 +++++
 kernel/irq/migration.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7e62e10e5856..d008065e2f4d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -491,9 +491,14 @@ extern void irq_migrate_all_off_this_cpu(void);
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void irq_move_irq(struct irq_data *data);
 void irq_move_masked_irq(struct irq_data *data);
+bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
 #else
 static inline void irq_move_irq(struct irq_data *data) { }
 static inline void irq_move_masked_irq(struct irq_data *data) { }
+static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
+{
+	return false;
+}
 #endif
 
 extern int no_irq_affinity;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 37ddb7bda651..6ca054a3f91d 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,6 +4,36 @@
 
 #include "internals.h"
 
+/**
+ * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU
+ * @desc:		Interrupt descriptor to clean up
+ * @force_clear:	If set clear the move pending bit unconditionally.
+ *			If not set, clear it only when the dying CPU is the
+ *			last one in the pending mask.
+ *
+ * Returns true if the pending bit was set and the pending mask contains an
+ * online CPU other than the dying CPU.
+ */
+bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
+{
+	struct irq_data *data = irq_desc_get_irq_data(desc);
+
+	if (!irqd_is_setaffinity_pending(data))
+		return false;
+
+	/*
+	 * The outgoing CPU might be the last online target in a pending
+	 * interrupt move. If that's the case clear the pending move bit.
+	 */
+	if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) {
+		irqd_clr_move_pending(data);
+		return false;
+	}
+	if (force_clear)
+		irqd_clr_move_pending(data);
+	return true;
+}
+
 void irq_move_masked_irq(struct irq_data *idata)
 {
 	struct irq_desc *desc = irq_data_to_desc(idata);
-- 
cgit v1.3-14-g43fede


From cba4235e6031e9318d68186f6d765c531cbea4e1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:21 +0200
Subject: genirq: Remove mask argument from setup_affinity()

No point in having this alloc/free dance of cpumasks. Provide a static mask
for setup_affinity() and protect it properly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235444.851571573@linutronix.de
---
 kernel/irq/internals.h |  2 +-
 kernel/irq/manage.c    | 53 ++++++++++++++++++++++----------------------------
 kernel/irq/proc.c      |  8 +++++---
 3 files changed, 29 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 094db5bfb83f..33ca83816b8c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -109,7 +109,7 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern bool irq_can_set_affinity_usr(unsigned int irq);
 
-extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
+extern int irq_select_affinity_usr(unsigned int irq);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 284f4eb1ffbe..e2f20d553d60 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -345,15 +345,18 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 /*
  * Generic version of the affinity autoselector.
  */
-static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
+static int irq_setup_affinity(struct irq_desc *desc)
 {
 	struct cpumask *set = irq_default_affinity;
-	int node = irq_desc_get_node(desc);
+	int ret, node = irq_desc_get_node(desc);
+	static DEFINE_RAW_SPINLOCK(mask_lock);
+	static struct cpumask mask;
 
 	/* Excludes PER_CPU and NO_BALANCE interrupts */
 	if (!__irq_can_set_affinity(desc))
 		return 0;
 
+	raw_spin_lock(&mask_lock);
 	/*
 	 * Preserve the managed affinity setting and a userspace affinity
 	 * setup, but make sure that one of the targets is online.
@@ -367,43 +370,42 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
 			irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
 	}
 
-	cpumask_and(mask, cpu_online_mask, set);
+	cpumask_and(&mask, cpu_online_mask, set);
 	if (node != NUMA_NO_NODE) {
 		const struct cpumask *nodemask = cpumask_of_node(node);
 
 		/* make sure at least one of the cpus in nodemask is online */
-		if (cpumask_intersects(mask, nodemask))
-			cpumask_and(mask, mask, nodemask);
+		if (cpumask_intersects(&mask, nodemask))
+			cpumask_and(&mask, &mask, nodemask);
 	}
-	irq_do_set_affinity(&desc->irq_data, mask, false);
-	return 0;
+	ret = irq_do_set_affinity(&desc->irq_data, &mask, false);
+	raw_spin_unlock(&mask_lock);
+	return ret;
 }
 #else
 /* Wrapper for ALPHA specific affinity selector magic */
-static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
+int irq_setup_affinity(struct irq_desc *desc)
 {
-	return irq_select_affinity(irq_desc_get_irq(d));
+	return irq_select_affinity(irq_desc_get_irq(desc));
 }
 #endif
 
 /*
- * Called when affinity is set via /proc/irq
+ * Called when a bogus affinity is set via /proc/irq
  */
-int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
+int irq_select_affinity_usr(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	unsigned long flags;
 	int ret;
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
-	ret = setup_affinity(desc, mask);
+	ret = irq_setup_affinity(desc);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
 }
-
 #else
-static inline int
-setup_affinity(struct irq_desc *desc, struct cpumask *mask)
+static inline int setup_affinity(struct irq_desc *desc)
 {
 	return 0;
 }
@@ -1128,7 +1130,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	struct irqaction *old, **old_ptr;
 	unsigned long flags, thread_mask = 0;
 	int ret, nested, shared = 0;
-	cpumask_var_t mask;
 
 	if (!desc)
 		return -EINVAL;
@@ -1187,11 +1188,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		}
 	}
 
-	if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
-		ret = -ENOMEM;
-		goto out_thread;
-	}
-
 	/*
 	 * Drivers are often written to work w/o knowledge about the
 	 * underlying irq chip implementation, so a request for a
@@ -1256,7 +1252,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 */
 		if (thread_mask == ~0UL) {
 			ret = -EBUSY;
-			goto out_mask;
+			goto out_unlock;
 		}
 		/*
 		 * The thread_mask for the action is or'ed to
@@ -1300,7 +1296,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
 		       irq);
 		ret = -EINVAL;
-		goto out_mask;
+		goto out_unlock;
 	}
 
 	if (!shared) {
@@ -1308,7 +1304,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (ret) {
 			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
 			       new->name, irq, desc->irq_data.chip->name);
-			goto out_mask;
+			goto out_unlock;
 		}
 
 		init_waitqueue_head(&desc->wait_for_threads);
@@ -1320,7 +1316,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 			if (ret) {
 				irq_release_resources(desc);
-				goto out_mask;
+				goto out_unlock;
 			}
 		}
 
@@ -1357,7 +1353,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		}
 
 		/* Set default affinity mask once everything is setup */
-		setup_affinity(desc, mask);
+		irq_setup_affinity(desc);
 
 	} else if (new->flags & IRQF_TRIGGER_MASK) {
 		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1401,8 +1397,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	irq_add_debugfs_entry(irq, desc);
 	new->dir = NULL;
 	register_handler_proc(irq, new);
-	free_cpumask_var(mask);
-
 	return 0;
 
 mismatch:
@@ -1415,9 +1409,8 @@ mismatch:
 	}
 	ret = -EBUSY;
 
-out_mask:
+out_unlock:
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	free_cpumask_var(mask);
 
 out_thread:
 	if (new->thread) {
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c53edad7b459..d35bb8d4c317 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -120,9 +120,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
 	 * one online CPU still has to be targeted.
 	 */
 	if (!cpumask_intersects(new_value, cpu_online_mask)) {
-		/* Special case for empty set - allow the architecture
-		   code to set default SMP affinity. */
-		err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
+		/*
+		 * Special case for empty set - allow the architecture code
+		 * to set default SMP affinity.
+		 */
+		err = irq_select_affinity_usr(irq) ? -EINVAL : count;
 	} else {
 		irq_set_affinity(irq, new_value);
 		err = count;
-- 
cgit v1.3-14-g43fede


From 43564bd97d0e6182bbd43b51b33254c728832551 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:22 +0200
Subject: genirq: Rename setup_affinity() to irq_setup_affinity()

Rename it with a proper irq_ prefix and make it available for other files
in the core code. Preparatory patch for moving the irq affinity setup
around.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235444.928501004@linutronix.de
---
 kernel/irq/internals.h | 6 ++++++
 kernel/irq/manage.c    | 7 +------
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 33ca83816b8c..2d7927d9fb57 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -116,6 +116,12 @@ extern void irq_set_thread_affinity(struct irq_desc *desc);
 extern int irq_do_set_affinity(struct irq_data *data,
 			       const struct cpumask *dest, bool force);
 
+#ifdef CONFIG_SMP
+extern int irq_setup_affinity(struct irq_desc *desc);
+#else
+static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
+#endif
+
 /* Inline functions for support of irq chips on slow busses */
 static inline void chip_bus_lock(struct irq_desc *desc)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e2f20d553d60..907fb791ff63 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
 /*
  * Generic version of the affinity autoselector.
  */
-static int irq_setup_affinity(struct irq_desc *desc)
+int irq_setup_affinity(struct irq_desc *desc)
 {
 	struct cpumask *set = irq_default_affinity;
 	int ret, node = irq_desc_get_node(desc);
@@ -404,11 +404,6 @@ int irq_select_affinity_usr(unsigned int irq)
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return ret;
 }
-#else
-static inline int setup_affinity(struct irq_desc *desc)
-{
-	return 0;
-}
 #endif
 
 /**
-- 
cgit v1.3-14-g43fede


From 2e051552df69af6d134c2592d0d6f1ac80f01190 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:23 +0200
Subject: genirq: Move initial affinity setup to irq_startup()

The startup vs. setaffinity ordering of interrupts depends on the
IRQF_NOAUTOEN flag. Chained interrupts are not getting any affinity
assignment at all.

A regular interrupt is started up and then the affinity is set. An
IRQF_NOAUTOEN marked interrupt is not started up, but the affinity is set
nevertheless.

Move the affinity setup to irq_startup() so the ordering is always the same
and chained interrupts get the proper default affinity assigned as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.020534783@linutronix.de
---
 kernel/irq/chip.c   |  2 ++
 kernel/irq/manage.c | 15 ++++++---------
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index bc1331f84fb5..e290d73b88e2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -213,6 +213,8 @@ int irq_startup(struct irq_desc *desc, bool resend)
 			irq_enable(desc);
 		}
 		irq_state_set_started(desc);
+		/* Set default affinity mask once everything is setup */
+		irq_setup_affinity(desc);
 	}
 
 	if (resend)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 907fb791ff63..1e283073cecc 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1327,6 +1327,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		if (new->flags & IRQF_ONESHOT)
 			desc->istate |= IRQS_ONESHOT;
 
+		/* Exclude IRQ from balancing if requested */
+		if (new->flags & IRQF_NOBALANCING) {
+			irq_settings_set_no_balancing(desc);
+			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+		}
+
 		if (irq_settings_can_autoenable(desc)) {
 			irq_startup(desc, true);
 		} else {
@@ -1341,15 +1347,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			desc->depth = 1;
 		}
 
-		/* Exclude IRQ from balancing if requested */
-		if (new->flags & IRQF_NOBALANCING) {
-			irq_settings_set_no_balancing(desc);
-			irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
-		}
-
-		/* Set default affinity mask once everything is setup */
-		irq_setup_affinity(desc);
-
 	} else if (new->flags & IRQF_TRIGGER_MASK) {
 		unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
 		unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
-- 
cgit v1.3-14-g43fede


From 137221df69c6f8a7002f82dc3d95052d34f5667e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 20 Jun 2017 01:37:24 +0200
Subject: genirq: Move pending helpers to internal.h

So that the affinity code can reuse them.


Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170619235445.109426284@linutronix.de
---
 kernel/irq/internals.h | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c    | 28 ----------------------------
 2 files changed, 38 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 2d7927d9fb57..20b197f0a7b5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -249,6 +249,44 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
 		      void __iomem *reg_base, irq_flow_handler_t handler) { }
 #endif /* CONFIG_GENERIC_IRQ_CHIP */
 
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+	return irqd_can_move_in_process_context(data);
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+	return irqd_is_setaffinity_pending(data);
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+	cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+	cpumask_copy(mask, desc->pending_mask);
+}
+#else /* CONFIG_GENERIC_PENDING_IRQ */
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+	return true;
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+	return false;
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+}
+#endif /* CONFIG_GENERIC_PENDING_IRQ */
+
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
 void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
 void irq_remove_debugfs_entry(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1e283073cecc..7dcf19397c39 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -168,34 +168,6 @@ void irq_set_thread_affinity(struct irq_desc *desc)
 			set_bit(IRQTF_AFFINITY, &action->thread_flags);
 }
 
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-static inline bool irq_can_move_pcntxt(struct irq_data *data)
-{
-	return irqd_can_move_in_process_context(data);
-}
-static inline bool irq_move_pending(struct irq_data *data)
-{
-	return irqd_is_setaffinity_pending(data);
-}
-static inline void
-irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
-{
-	cpumask_copy(desc->pending_mask, mask);
-}
-static inline void
-irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
-{
-	cpumask_copy(mask, desc->pending_mask);
-}
-#else
-static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
-static inline bool irq_move_pending(struct irq_data *data) { return false; }
-static inline void
-irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
-static inline void
-irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
-#endif
-
 int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 			bool force)
 {
-- 
cgit v1.3-14-g43fede


From 0dd945ff4647a1f29c6ae8f4f9a69c8f37c994cf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:25 +0200
Subject: genirq/cpuhotplug: Remove irq disabling logic

This is called from stop_machine() with interrupts disabled. No point in
disabling them some more.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.198042748@linutronix.de
---
 kernel/irq/cpuhotplug.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 011f8c4c63da..705139831590 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -59,11 +59,8 @@ static bool migrate_one_irq(struct irq_desc *desc)
  */
 void irq_migrate_all_off_this_cpu(void)
 {
-	unsigned int irq;
 	struct irq_desc *desc;
-	unsigned long flags;
-
-	local_irq_save(flags);
+	unsigned int irq;
 
 	for_each_active_irq(irq) {
 		bool affinity_broken;
@@ -73,10 +70,9 @@ void irq_migrate_all_off_this_cpu(void)
 		affinity_broken = migrate_one_irq(desc);
 		raw_spin_unlock(&desc->lock);
 
-		if (affinity_broken)
-			pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n",
+		if (affinity_broken) {
+			pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n",
 					    irq, smp_processor_id());
+		}
 	}
-
-	local_irq_restore(flags);
 }
-- 
cgit v1.3-14-g43fede


From 735c09524d3e7c92315e8e2699a1b9acb4fb415c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:26 +0200
Subject: genirq/cpuhotplug: Dont claim success on error

In case the affinity of an interrupt was broken, a printk is emitted.

But if the affinity cannot be set at all due to a missing
irq_set_affinity() callback or due to a failing callback, the message is
still printed, preceded by a warning/error.

That makes no sense whatsoever.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.274852976@linutronix.de
---
 kernel/irq/cpuhotplug.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 705139831590..9c5521b247d5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -37,11 +37,14 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	c = irq_data_get_irq_chip(d);
 	if (!c->irq_set_affinity) {
 		pr_debug("IRQ%u: unable to set affinity\n", d->irq);
+		ret = false;
 	} else {
 		int r = irq_do_set_affinity(d, affinity, false);
-		if (r)
+		if (r) {
 			pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
 					    d->irq, r);
+			ret = false;
+		}
 	}
 
 	return ret;
-- 
cgit v1.3-14-g43fede


From e8a7035039306c90bcc99129ffc18e0be052bbb9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:27 +0200
Subject: genirq/cpuhotplug: Reorder check logic

Move the checks for a valid irq chip and the irq_set_affinity() callback
right in front of the whole migration logic. No point in doing a gazillion
other things when the interrupt cannot be migrated at all.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.354181630@linutronix.de
---
 kernel/irq/cpuhotplug.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 9c5521b247d5..41fe1e04d5d9 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -17,9 +17,20 @@
 static bool migrate_one_irq(struct irq_desc *desc)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
 	const struct cpumask *affinity = d->common->affinity;
-	struct irq_chip *c;
-	bool ret = false;
+	bool brokeaff = false;
+	int err;
+
+	/*
+	 * IRQ chip might be already torn down, but the irq descriptor is
+	 * still in the radix tree. Also if the chip has no affinity setter,
+	 * nothing can be done here.
+	 */
+	if (!chip || !chip->irq_set_affinity) {
+		pr_debug("IRQ %u: Unable to migrate away\n", d->irq);
+		return false;
+	}
 
 	/*
 	 * If this is a per-CPU interrupt, or the affinity does not
@@ -31,23 +42,16 @@ static bool migrate_one_irq(struct irq_desc *desc)
 
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 		affinity = cpu_online_mask;
-		ret = true;
+		brokeaff = true;
 	}
 
-	c = irq_data_get_irq_chip(d);
-	if (!c->irq_set_affinity) {
-		pr_debug("IRQ%u: unable to set affinity\n", d->irq);
-		ret = false;
-	} else {
-		int r = irq_do_set_affinity(d, affinity, false);
-		if (r) {
-			pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
-					    d->irq, r);
-			ret = false;
-		}
+	err = irq_do_set_affinity(d, affinity, false);
+	if (err) {
+		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
+				    d->irq, err);
+		return false;
 	}
-
-	return ret;
+	return brokeaff;
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 91f26cb4cd3c22bd656ab46c49329aacaaab5504 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:28 +0200
Subject: genirq/cpuhotplug: Do not migrated shutdown irqs

Interrupts which are shut down are subject to migration attempts as well.
That's pointless because the interrupt cannot fire and the next startup will
move it to the proper place anyway.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.447550992@linutronix.de
---
 kernel/irq/cpuhotplug.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 41fe1e04d5d9..09b20e127aee 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -33,10 +33,15 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	}
 
 	/*
-	 * If this is a per-CPU interrupt, or the affinity does not
-	 * include this CPU, then we have nothing to do.
+	 * No move required, if:
+	 * - Interrupt is per cpu
+	 * - Interrupt is not started
+	 * - Affinity mask does not include this CPU.
+	 *
+	 * Note: Do not check desc->action as this might be a chained
+	 * interrupt.
 	 */
-	if (irqd_is_per_cpu(d) ||
+	if (irqd_is_per_cpu(d) || !irqd_is_started(d) ||
 	    !cpumask_test_cpu(smp_processor_id(), affinity))
 		return false;
 
-- 
cgit v1.3-14-g43fede


From f0383c24b4855f6a4b5a358c7b2d2c16e0437e9b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:29 +0200
Subject: genirq/cpuhotplug: Add support for cleaning up move in progress

In order to move x86 to the generic hotplug migration code, add support for
cleaning up move in progress bits.

On architectures which have this x86 specific (mis)feature not enabled,
this is optimized out by the compiler.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.525817311@linutronix.de
---
 arch/x86/include/asm/irq.h |  1 -
 include/linux/irq.h        |  2 ++
 kernel/irq/cpuhotplug.c    | 28 ++++++++++++++++++++++++++--
 kernel/irq/internals.h     | 10 +++++++++-
 4 files changed, 37 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 16d3fa211962..668cca540025 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -29,7 +29,6 @@ struct irq_desc;
 #include <linux/cpumask.h>
 extern int check_irq_vectors_for_cpu_disable(void);
 extern void fixup_irqs(void);
-extern void irq_force_complete_move(struct irq_desc *desc);
 #endif
 
 #ifdef CONFIG_HAVE_KVM
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d008065e2f4d..299271a4953c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -491,10 +491,12 @@ extern void irq_migrate_all_off_this_cpu(void);
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void irq_move_irq(struct irq_data *data);
 void irq_move_masked_irq(struct irq_data *data);
+void irq_force_complete_move(struct irq_desc *desc);
 bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
 #else
 static inline void irq_move_irq(struct irq_data *data) { }
 static inline void irq_move_masked_irq(struct irq_data *data) { }
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
 static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 {
 	return false;
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 09b20e127aee..4be4bd669d81 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,7 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 	struct irq_chip *chip = irq_data_get_irq_chip(d);
-	const struct cpumask *affinity = d->common->affinity;
+	const struct cpumask *affinity;
 	bool brokeaff = false;
 	int err;
 
@@ -41,9 +41,33 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	 * Note: Do not check desc->action as this might be a chained
 	 * interrupt.
 	 */
+	affinity = irq_data_get_affinity_mask(d);
 	if (irqd_is_per_cpu(d) || !irqd_is_started(d) ||
-	    !cpumask_test_cpu(smp_processor_id(), affinity))
+	    !cpumask_test_cpu(smp_processor_id(), affinity)) {
+		/*
+		 * If an irq move is pending, abort it if the dying CPU is
+		 * the sole target.
+		 */
+		irq_fixup_move_pending(desc, false);
 		return false;
+	}
+
+	/*
+	 * Complete an eventually pending irq move cleanup. If this
+	 * interrupt was moved in hard irq context, then the vectors need
+	 * to be cleaned up. It can't wait until this interrupt actually
+	 * happens and this CPU was involved.
+	 */
+	irq_force_complete_move(desc);
+
+	/*
+	 * If there is a setaffinity pending, then try to reuse the pending
+	 * mask, so the last change of the affinity does not get lost. If
+	 * there is no move pending or the pending mask does not contain
+	 * any online CPU, use the current affinity mask.
+	 */
+	if (irq_fixup_move_pending(desc, true))
+		affinity = irq_desc_get_pending_mask(desc);
 
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 		affinity = cpu_online_mask;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 20b197f0a7b5..fd4fa8382b8f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -268,6 +268,10 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
 {
 	cpumask_copy(mask, desc->pending_mask);
 }
+static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
+{
+	return desc->pending_mask;
+}
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 static inline bool irq_can_move_pcntxt(struct irq_data *data)
 {
@@ -285,7 +289,11 @@ static inline void
 irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
 {
 }
-#endif /* CONFIG_GENERIC_PENDING_IRQ */
+static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
+{
+	return NULL;
+}
+#endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
 void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
-- 
cgit v1.3-14-g43fede


From 47a06d3a783217acae02976f15ca07ddc1ac024f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:30 +0200
Subject: genirq/cpuhotplug: Add support for conditional masking

Interrupts which cannot be migrated in process context need to be masked
before the affinity is changed forcefully.

Add support for that. Will be compiled out for architectures which do not
have this x86 specific issue.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.604565591@linutronix.de
---
 kernel/irq/cpuhotplug.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 4be4bd669d81..6f46587a9ce5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -18,6 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 	struct irq_chip *chip = irq_data_get_irq_chip(d);
+	bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
 	const struct cpumask *affinity;
 	bool brokeaff = false;
 	int err;
@@ -69,6 +70,10 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	if (irq_fixup_move_pending(desc, true))
 		affinity = irq_desc_get_pending_mask(desc);
 
+	/* Mask the chip for interrupts which cannot move in process context */
+	if (maskchip && chip->irq_mask)
+		chip->irq_mask(d);
+
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 		affinity = cpu_online_mask;
 		brokeaff = true;
@@ -78,8 +83,12 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	if (err) {
 		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
 				    d->irq, err);
-		return false;
+		brokeaff = false;
 	}
+
+	if (maskchip && chip->irq_unmask)
+		chip->irq_unmask(d);
+
 	return brokeaff;
 }
 
-- 
cgit v1.3-14-g43fede


From 77f85e66aa8be563ae5804eebf74a78ec6ef5555 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:31 +0200
Subject: genirq/cpuhotplug: Set force affinity flag on hotplug migration

Set the force migration flag when migrating interrupts away from an
outgoing CPU.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.681874648@linutronix.de
---
 kernel/irq/cpuhotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6f46587a9ce5..e09cb91a7c8b 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -79,7 +79,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 		brokeaff = true;
 	}
 
-	err = irq_do_set_affinity(d, affinity, false);
+	err = irq_do_set_affinity(d, affinity, true);
 	if (err) {
 		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
 				    d->irq, err);
-- 
cgit v1.3-14-g43fede


From 36d84fb45140f151fa4e145381dbce5e5ffed24d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:34 +0200
Subject: genirq: Move irq_fixup_move_pending() to core

Now that x86 uses the generic code, the function declaration and inline
stub can move to the core internal header.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235445.928156166@linutronix.de
---
 include/linux/irq.h    | 5 -----
 kernel/irq/internals.h | 5 +++++
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 299271a4953c..2b7e5a70d05f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -492,15 +492,10 @@ extern void irq_migrate_all_off_this_cpu(void);
 void irq_move_irq(struct irq_data *data);
 void irq_move_masked_irq(struct irq_data *data);
 void irq_force_complete_move(struct irq_desc *desc);
-bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
 #else
 static inline void irq_move_irq(struct irq_data *data) { }
 static inline void irq_move_masked_irq(struct irq_data *data) { }
 static inline void irq_force_complete_move(struct irq_desc *desc) { }
-static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
-{
-	return false;
-}
 #endif
 
 extern int no_irq_affinity;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fd4fa8382b8f..040806f1124c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -272,6 +272,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
 {
 	return desc->pending_mask;
 }
+bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
 #else /* CONFIG_GENERIC_PENDING_IRQ */
 static inline bool irq_can_move_pcntxt(struct irq_data *data)
 {
@@ -293,6 +294,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
 {
 	return NULL;
 }
+static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
+{
+	return false;
+}
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
-- 
cgit v1.3-14-g43fede


From 047dc6331de58da51818582c0db0dbfcb837e614 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:35 +0200
Subject: genirq: Remove pointless arg from show_irq_affinity

The third argument of the internal helper function is unused. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.004958600@linutronix.de
---
 kernel/irq/proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d35bb8d4c317..eff7c0c8f9b9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -37,7 +37,7 @@ static struct proc_dir_entry *root_irq_dir;
 
 #ifdef CONFIG_SMP
 
-static int show_irq_affinity(int type, struct seq_file *m, void *v)
+static int show_irq_affinity(int type, struct seq_file *m)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
 	const struct cpumask *mask = desc->irq_common_data.affinity;
@@ -80,12 +80,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
 int no_irq_affinity;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
-	return show_irq_affinity(0, m, v);
+	return show_irq_affinity(0, m);
 }
 
 static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
 {
-	return show_irq_affinity(1, m, v);
+	return show_irq_affinity(1, m);
 }
 
 
-- 
cgit v1.3-14-g43fede


From 4ab764c336123157690ee0000a1dcf81851c58d1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:36 +0200
Subject: genirq: Remove pointless gfp argument

All callers hand in GFP_KERNEL. No point in having an extra argument for
that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.082544752@linutronix.de
---
 kernel/irq/irqdesc.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index feade536b6d1..48d4f0365e52 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -54,14 +54,14 @@ static void __init init_irq_default_affinity(void)
 #endif
 
 #ifdef CONFIG_SMP
-static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
+static int alloc_masks(struct irq_desc *desc, int node)
 {
 	if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
-				     gfp, node))
+				     GFP_KERNEL, node))
 		return -ENOMEM;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
+	if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) {
 		free_cpumask_var(desc->irq_common_data.affinity);
 		return -ENOMEM;
 	}
@@ -86,7 +86,7 @@ static void desc_smp_init(struct irq_desc *desc, int node,
 
 #else
 static inline int
-alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
+alloc_masks(struct irq_desc *desc, int node) { return 0; }
 static inline void
 desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { }
 #endif
@@ -344,9 +344,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 				   struct module *owner)
 {
 	struct irq_desc *desc;
-	gfp_t gfp = GFP_KERNEL;
 
-	desc = kzalloc_node(sizeof(*desc), gfp, node);
+	desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);
 	if (!desc)
 		return NULL;
 	/* allocate based on nr_cpu_ids */
@@ -354,7 +353,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 	if (!desc->kstat_irqs)
 		goto err_desc;
 
-	if (alloc_masks(desc, gfp, node))
+	if (alloc_masks(desc, node))
 		goto err_kstat;
 
 	raw_spin_lock_init(&desc->lock);
@@ -525,7 +524,7 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < count; i++) {
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
-		alloc_masks(&desc[i], GFP_KERNEL, node);
+		alloc_masks(&desc[i], node);
 		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		desc_set_defaults(i, &desc[i], node, NULL, NULL);
-- 
cgit v1.3-14-g43fede


From c1a80386965e9fa3c2f8d1d57966216fe02c9124 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:37 +0200
Subject: genirq/proc: Replace ever repeating type cast

The proc file setup repeats the same ugly type cast for the irq number over
and over. Do it once and hand in the local void pointer.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.160866358@linutronix.de
---
 kernel/irq/proc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index eff7c0c8f9b9..cbc4c5e377ec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -326,6 +326,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
 	static DEFINE_MUTEX(register_lock);
+	void __maybe_unused *irqp = (void *)(unsigned long) irq;
 	char name [MAX_NAMELEN];
 
 	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
@@ -351,20 +352,19 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
 	proc_create_data("smp_affinity", 0644, desc->dir,
-			 &irq_affinity_proc_fops, (void *)(long)irq);
+			 &irq_affinity_proc_fops, irqp);
 
 	/* create /proc/irq/<irq>/affinity_hint */
 	proc_create_data("affinity_hint", 0444, desc->dir,
-			 &irq_affinity_hint_proc_fops, (void *)(long)irq);
+			 &irq_affinity_hint_proc_fops, irqp);
 
 	/* create /proc/irq/<irq>/smp_affinity_list */
 	proc_create_data("smp_affinity_list", 0644, desc->dir,
-			 &irq_affinity_list_proc_fops, (void *)(long)irq);
+			 &irq_affinity_list_proc_fops, irqp);
 
 	proc_create_data("node", 0444, desc->dir,
-			 &irq_node_proc_fops, (void *)(long)irq);
+			 &irq_node_proc_fops, irqp);
 #endif
-
 	proc_create_data("spurious", 0444, desc->dir,
 			 &irq_spurious_proc_fops, (void *)(long)irq);
 
-- 
cgit v1.3-14-g43fede


From 0d3f54257dc300f2db480d6a46b34bdb87f18c1b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:38 +0200
Subject: genirq: Introduce effective affinity mask

There is currently no way to evaluate the effective affinity mask of a
given interrupt. Many irq chips allow only a single target CPU or a subset
of CPUs in the affinity mask.

Updating the mask at the time of setting the affinity to the subset would
be counterproductive because information for cpu hotplug about assigned
interrupt affinities gets lost. On CPU hotplug it's also pointless to force
migrate an interrupt, which is not targeted at the CPU effectively. But
currently the information is not available.

Provide a separate mask to be updated by the irq_chip->irq_set_affinity()
implementations. Implement the read only proc files so the user can see the
effective mask as well w/o trying to deduce it from /proc/interrupts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.247834245@linutronix.de
---
 include/linux/irq.h  | 29 +++++++++++++++++
 kernel/irq/Kconfig   |  4 +++
 kernel/irq/debugfs.c |  4 +++
 kernel/irq/irqdesc.c | 14 ++++++++
 kernel/irq/proc.c    | 90 ++++++++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 134 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2b7e5a70d05f..4087ef268ba9 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -137,6 +137,9 @@ struct irq_domain;
  * @affinity:		IRQ affinity on SMP. If this is an IPI
  *			related irq, then this is the mask of the
  *			CPUs to which an IPI can be sent.
+ * @effective_affinity:	The effective IRQ affinity on SMP as some irq
+ *			chips do not allow multi CPU destinations.
+ *			A subset of @affinity.
  * @msi_desc:		MSI descriptor
  * @ipi_offset:		Offset of first IPI target cpu in @affinity. Optional.
  */
@@ -148,6 +151,9 @@ struct irq_common_data {
 	void			*handler_data;
 	struct msi_desc		*msi_desc;
 	cpumask_var_t		affinity;
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	cpumask_var_t		effective_affinity;
+#endif
 #ifdef CONFIG_GENERIC_IRQ_IPI
 	unsigned int		ipi_offset;
 #endif
@@ -737,6 +743,29 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d)
 	return d->common->affinity;
 }
 
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+static inline
+struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
+{
+	return d->common->effective_affinity;
+}
+static inline void irq_data_update_effective_affinity(struct irq_data *d,
+						      const struct cpumask *m)
+{
+	cpumask_copy(d->common->effective_affinity, m);
+}
+#else
+static inline void irq_data_update_effective_affinity(struct irq_data *d,
+						      const struct cpumask *m)
+{
+}
+static inline
+struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
+{
+	return d->common->affinity;
+}
+#endif
+
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8d9498e51585..fcbb1d6d51cb 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW
 config GENERIC_IRQ_SHOW_LEVEL
        bool
 
+# Supports effective affinity mask
+config GENERIC_IRQ_EFFECTIVE_AFF_MASK
+       bool
+
 # Facility to allocate a hardware interrupt. This is legacy support
 # and should not be used in new code. Use irq domains instead.
 config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 50ee2f6593e8..edbef252d0c4 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -36,6 +36,10 @@ static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc)
 
 	msk = irq_data_get_affinity_mask(data);
 	seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk));
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	msk = irq_data_get_effective_affinity_mask(data);
+	seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk));
+#endif
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	msk = desc->pending_mask;
 	seq_printf(m, "pending:  %*pbl\n", cpumask_pr_args(msk));
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 48d4f0365e52..35a95fadcfda 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -60,8 +60,19 @@ static int alloc_masks(struct irq_desc *desc, int node)
 				     GFP_KERNEL, node))
 		return -ENOMEM;
 
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity,
+				     GFP_KERNEL, node)) {
+		free_cpumask_var(desc->irq_common_data.affinity);
+		return -ENOMEM;
+	}
+#endif
+
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) {
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+		free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
 		free_cpumask_var(desc->irq_common_data.affinity);
 		return -ENOMEM;
 	}
@@ -324,6 +335,9 @@ static void free_masks(struct irq_desc *desc)
 	free_cpumask_var(desc->pending_mask);
 #endif
 	free_cpumask_var(desc->irq_common_data.affinity);
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	free_cpumask_var(desc->irq_common_data.effective_affinity);
+#endif
 }
 #else
 static inline void free_masks(struct irq_desc *desc) { }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index cbc4c5e377ec..7f9642a1e267 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir;
 
 #ifdef CONFIG_SMP
 
+enum {
+	AFFINITY,
+	AFFINITY_LIST,
+	EFFECTIVE,
+	EFFECTIVE_LIST,
+};
+
 static int show_irq_affinity(int type, struct seq_file *m)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = desc->irq_common_data.affinity;
+	const struct cpumask *mask;
 
+	switch (type) {
+	case AFFINITY:
+	case AFFINITY_LIST:
+		mask = desc->irq_common_data.affinity;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (irqd_is_setaffinity_pending(&desc->irq_data))
-		mask = desc->pending_mask;
+		if (irqd_is_setaffinity_pending(&desc->irq_data))
+			mask = desc->pending_mask;
 #endif
-	if (type)
+		break;
+	case EFFECTIVE:
+	case EFFECTIVE_LIST:
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+		mask = desc->irq_common_data.effective_affinity;
+		break;
+#else
+		return -EINVAL;
+#endif
+	};
+
+	switch (type) {
+	case AFFINITY_LIST:
+	case EFFECTIVE_LIST:
 		seq_printf(m, "%*pbl\n", cpumask_pr_args(mask));
-	else
+		break;
+	case AFFINITY:
+	case EFFECTIVE:
 		seq_printf(m, "%*pb\n", cpumask_pr_args(mask));
+		break;
+	}
 	return 0;
 }
 
@@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
 int no_irq_affinity;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
-	return show_irq_affinity(0, m);
+	return show_irq_affinity(AFFINITY, m);
 }
 
 static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
 {
-	return show_irq_affinity(1, m);
+	return show_irq_affinity(AFFINITY_LIST, m);
 }
 
 
@@ -185,6 +213,44 @@ static const struct file_operations irq_affinity_list_proc_fops = {
 	.write		= irq_affinity_list_proc_write,
 };
 
+#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+static int irq_effective_aff_proc_show(struct seq_file *m, void *v)
+{
+	return show_irq_affinity(EFFECTIVE, m);
+}
+
+static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v)
+{
+	return show_irq_affinity(EFFECTIVE_LIST, m);
+}
+
+static int irq_effective_aff_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode));
+}
+
+static int irq_effective_aff_list_proc_open(struct inode *inode,
+					    struct file *file)
+{
+	return single_open(file, irq_effective_aff_list_proc_show,
+			   PDE_DATA(inode));
+}
+
+static const struct file_operations irq_effective_aff_proc_fops = {
+	.open		= irq_effective_aff_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static const struct file_operations irq_effective_aff_list_proc_fops = {
+	.open		= irq_effective_aff_list_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
 static int default_affinity_show(struct seq_file *m, void *v)
 {
 	seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity));
@@ -364,6 +430,12 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 
 	proc_create_data("node", 0444, desc->dir,
 			 &irq_node_proc_fops, irqp);
+# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	proc_create_data("effective_affinity", 0444, desc->dir,
+			 &irq_effective_aff_proc_fops, irqp);
+	proc_create_data("effective_affinity_list", 0444, desc->dir,
+			 &irq_effective_aff_list_proc_fops, irqp);
+# endif
 #endif
 	proc_create_data("spurious", 0444, desc->dir,
 			 &irq_spurious_proc_fops, (void *)(long)irq);
@@ -383,6 +455,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 	remove_proc_entry("affinity_hint", desc->dir);
 	remove_proc_entry("smp_affinity_list", desc->dir);
 	remove_proc_entry("node", desc->dir);
+# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	remove_proc_entry("effective_affinity", desc->dir);
+	remove_proc_entry("effective_affinity_list", desc->dir);
+# endif
 #endif
 	remove_proc_entry("spurious", desc->dir);
 
-- 
cgit v1.3-14-g43fede


From 415fcf1a2293046e0c1f4ab8558a87bad66652b1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:39 +0200
Subject: genirq/cpuhotplug: Use effective affinity mask

If the architecture supports the effective affinity mask, migrating
interrupts away which are not targeted by the effective mask is
pointless.

They can stay in the user or system supplied affinity mask, but won't be
targeted at any given point as the affinity setter functions need to
validate against the online cpu mask anyway.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.328488490@linutronix.de
---
 kernel/irq/cpuhotplug.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index e09cb91a7c8b..0b093db3336b 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -14,6 +14,14 @@
 
 #include "internals.h"
 
+/* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */
+static inline bool irq_needs_fixup(struct irq_data *d)
+{
+	const struct cpumask *m = irq_data_get_effective_affinity_mask(d);
+
+	return cpumask_test_cpu(smp_processor_id(), m);
+}
+
 static bool migrate_one_irq(struct irq_desc *desc)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -42,9 +50,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	 * Note: Do not check desc->action as this might be a chained
 	 * interrupt.
 	 */
-	affinity = irq_data_get_affinity_mask(d);
-	if (irqd_is_per_cpu(d) || !irqd_is_started(d) ||
-	    !cpumask_test_cpu(smp_processor_id(), affinity)) {
+	if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) {
 		/*
 		 * If an irq move is pending, abort it if the dying CPU is
 		 * the sole target.
@@ -69,6 +75,8 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	 */
 	if (irq_fixup_move_pending(desc, true))
 		affinity = irq_desc_get_pending_mask(desc);
+	else
+		affinity = irq_data_get_affinity_mask(d);
 
 	/* Mask the chip for interrupts which cannot move in process context */
 	if (maskchip && chip->irq_mask)
-- 
cgit v1.3-14-g43fede


From 54fdf6a0875ca380647ac1cc9b5b8f2dbbbfa131 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:47 +0200
Subject: genirq: Introduce IRQD_MANAGED_SHUTDOWN

Affinity managed interrupts should keep their assigned affinity across CPU
hotplug. To avoid magic hackery in device drivers, the core code shall
manage them transparently. This will set these interrupts into a managed
shutdown state when the last CPU of the assigned affinity mask goes
offline. The interrupt will be restarted when one of the CPUs in the
assigned affinity mask comes back online.

Introduce the necessary state flag and the accessor functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235446.954523476@linutronix.de
---
 include/linux/irq.h    |  8 ++++++++
 kernel/irq/internals.h | 10 ++++++++++
 2 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4087ef268ba9..0e37276c5315 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -207,6 +207,8 @@ struct irq_data {
  * IRQD_FORWARDED_TO_VCPU	- The interrupt is forwarded to a VCPU
  * IRQD_AFFINITY_MANAGED	- Affinity is auto-managed by the kernel
  * IRQD_IRQ_STARTED		- Startup state of the interrupt
+ * IRQD_MANAGED_SHUTDOWN	- Interrupt was shutdown due to empty affinity
+ *				  mask. Applies only to affinity managed irqs.
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -225,6 +227,7 @@ enum {
 	IRQD_FORWARDED_TO_VCPU		= (1 << 20),
 	IRQD_AFFINITY_MANAGED		= (1 << 21),
 	IRQD_IRQ_STARTED		= (1 << 22),
+	IRQD_MANAGED_SHUTDOWN		= (1 << 23),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -343,6 +346,11 @@ static inline bool irqd_is_started(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_IRQ_STARTED;
 }
 
+static inline bool irqd_is_managed_shutdown(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 040806f1124c..ca4666b4cd39 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -193,6 +193,16 @@ static inline void irqd_clr_move_pending(struct irq_data *d)
 	__irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
 }
 
+static inline void irqd_set_managed_shutdown(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN;
+}
+
+static inline void irqd_clr_managed_shutdown(struct irq_data *d)
+{
+	__irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN;
+}
+
 static inline void irqd_clear(struct irq_data *d, unsigned int mask)
 {
 	__irqd_to_state(d) &= ~mask;
-- 
cgit v1.3-14-g43fede


From 708d174b6c32bffc5d73793bc7a267bcafeb6558 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:48 +0200
Subject: genirq: Split out irq_startup() code

Split out the inner workings of irq_startup() so it can be reused to handle
managed interrupts gracefully.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235447.033235144@linutronix.de
---
 kernel/irq/chip.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e290d73b88e2..1163089aa245 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -195,6 +195,23 @@ static void irq_state_set_started(struct irq_desc *desc)
 	irqd_set(&desc->irq_data, IRQD_IRQ_STARTED);
 }
 
+static int __irq_startup(struct irq_desc *desc)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+	int ret = 0;
+
+	irq_domain_activate_irq(d);
+	if (d->chip->irq_startup) {
+		ret = d->chip->irq_startup(d);
+		irq_state_clr_disabled(desc);
+		irq_state_clr_masked(desc);
+	} else {
+		irq_enable(desc);
+	}
+	irq_state_set_started(desc);
+	return ret;
+}
+
 int irq_startup(struct irq_desc *desc, bool resend)
 {
 	int ret = 0;
@@ -204,19 +221,9 @@ int irq_startup(struct irq_desc *desc, bool resend)
 	if (irqd_is_started(&desc->irq_data)) {
 		irq_enable(desc);
 	} else {
-		irq_domain_activate_irq(&desc->irq_data);
-		if (desc->irq_data.chip->irq_startup) {
-			ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
-			irq_state_clr_disabled(desc);
-			irq_state_clr_masked(desc);
-		} else {
-			irq_enable(desc);
-		}
-		irq_state_set_started(desc);
-		/* Set default affinity mask once everything is setup */
+		ret = __irq_startup(desc);
 		irq_setup_affinity(desc);
 	}
-
 	if (resend)
 		check_irq_resend(desc);
 
-- 
cgit v1.3-14-g43fede


From 4cde9c6b826834b861a2b58653ab33150f562064 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:49 +0200
Subject: genirq: Add force argument to irq_startup()

In order to handle managed interrupts gracefully on irq_startup() so they
won't lose their assigned affinity, it's necessary to allow startups which
keep the interrupts in managed shutdown state, if none of the assigned CPUs
is online. This allows drivers to request interrupts w/o the CPUs being
online, which avoids online/offline churn in drivers.

Add a force argument which can override that decision and let only
request_irq() and enable_irq() allow the managed shutdown
handling. enable_irq() is required, because the interrupt might be
requested with IRQF_NOAUTOEN and enable_irq() invokes irq_startup() which
would then wreck the assignment again. All other callers force startup
and potentially break the assigned affinity.

No functional change as this only adds the function argument.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235447.112094565@linutronix.de
---
 kernel/irq/autoprobe.c | 4 ++--
 kernel/irq/chip.c      | 4 ++--
 kernel/irq/internals.h | 9 ++++++++-
 kernel/irq/manage.c    | 4 ++--
 4 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 0119b9d467ae..d30a0dd5cc02 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
 			if (desc->irq_data.chip->irq_set_type)
 				desc->irq_data.chip->irq_set_type(&desc->irq_data,
 							 IRQ_TYPE_PROBE);
-			irq_startup(desc, false);
+			irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
 		raw_spin_lock_irq(&desc->lock);
 		if (!desc->action && irq_settings_can_probe(desc)) {
 			desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
-			if (irq_startup(desc, false))
+			if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE))
 				desc->istate |= IRQS_PENDING;
 		}
 		raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 1163089aa245..b7599e952d3b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -212,7 +212,7 @@ static int __irq_startup(struct irq_desc *desc)
 	return ret;
 }
 
-int irq_startup(struct irq_desc *desc, bool resend)
+int irq_startup(struct irq_desc *desc, bool resend, bool force)
 {
 	int ret = 0;
 
@@ -892,7 +892,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
 		desc->action = &chained_action;
-		irq_startup(desc, true);
+		irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
 	}
 }
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ca4666b4cd39..5fd105e252c3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -66,7 +66,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
 extern void __disable_irq(struct irq_desc *desc);
 extern void __enable_irq(struct irq_desc *desc);
 
-extern int irq_startup(struct irq_desc *desc, bool resend);
+#define IRQ_RESEND	true
+#define IRQ_NORESEND	false
+
+#define IRQ_START_FORCE	true
+#define IRQ_START_COND	false
+
+extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
+
 extern void irq_shutdown(struct irq_desc *desc);
 extern void irq_enable(struct irq_desc *desc);
 extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7dcf19397c39..3577c091ac7b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -509,7 +509,7 @@ void __enable_irq(struct irq_desc *desc)
 		 * time. If it was already started up, then irq_startup()
 		 * will invoke irq_enable() under the hood.
 		 */
-		irq_startup(desc, true);
+		irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
 		break;
 	}
 	default:
@@ -1306,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		}
 
 		if (irq_settings_can_autoenable(desc)) {
-			irq_startup(desc, true);
+			irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
 		} else {
 			/*
 			 * Shared interrupts do not go well with disabling
-- 
cgit v1.3-14-g43fede


From 761ea388e8c4e3ac883a94e16bcc8c51fa419d4f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:50 +0200
Subject: genirq: Handle managed irqs gracefully in irq_startup()

Affinity managed interrupts should keep their assigned affinity across CPU
hotplug. To avoid magic hackery in device drivers, the core code shall
manage them transparently and set these interrupts into a managed shutdown
state when the last CPU of the assigned affinity mask goes offline. The
interrupt will be restarted when one of the CPUs in the assigned affinity
mask comes back online.

Add the necessary logic to irq_startup(). If an interrupt is requested and
started up, the code checks whether it is affinity managed and if so, it
checks whether a CPU in the interrupt's affinity mask is online. If not, it
puts the interrupt into managed shutdown state.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235447.189851170@linutronix.de
---
 include/linux/irq.h |  2 +-
 kernel/irq/chip.c   | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 62 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 0e37276c5315..807042b46af1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -346,7 +346,7 @@ static inline bool irqd_is_started(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_IRQ_STARTED;
 }
 
-static inline bool irqd_is_managed_shutdown(struct irq_data *d)
+static inline bool irqd_is_managed_and_shutdown(struct irq_data *d)
 {
 	return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN;
 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7599e952d3b..fc89eeb8a6b4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -195,6 +195,52 @@ static void irq_state_set_started(struct irq_desc *desc)
 	irqd_set(&desc->irq_data, IRQD_IRQ_STARTED);
 }
 
+enum {
+	IRQ_STARTUP_NORMAL,
+	IRQ_STARTUP_MANAGED,
+	IRQ_STARTUP_ABORT,
+};
+
+#ifdef CONFIG_SMP
+static int
+__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+
+	if (!irqd_affinity_is_managed(d))
+		return IRQ_STARTUP_NORMAL;
+
+	irqd_clr_managed_shutdown(d);
+
+	if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * Catch code which fiddles with enable_irq() on a managed
+		 * and potentially shutdown IRQ. Chained interrupt
+		 * installment or irq auto probing should not happen on
+		 * managed irqs either. Emit a warning, break the affinity
+		 * and start it up as a normal interrupt.
+		 */
+		if (WARN_ON_ONCE(force))
+			return IRQ_STARTUP_NORMAL;
+		/*
+		 * The interrupt was requested, but there is no online CPU
+		 * in its affinity mask. Put it into managed shutdown
+		 * state and let the cpu hotplug mechanism start it up once
+		 * a CPU in the mask becomes available.
+		 */
+		irqd_set_managed_shutdown(d);
+		return IRQ_STARTUP_ABORT;
+	}
+	return IRQ_STARTUP_MANAGED;
+}
+#else
+static int
+__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
+{
+	return IRQ_STARTUP_NORMAL;
+}
+#endif
+
 static int __irq_startup(struct irq_desc *desc)
 {
 	struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -214,15 +260,27 @@ static int __irq_startup(struct irq_desc *desc)
 
 int irq_startup(struct irq_desc *desc, bool resend, bool force)
 {
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+	struct cpumask *aff = irq_data_get_affinity_mask(d);
 	int ret = 0;
 
 	desc->depth = 0;
 
-	if (irqd_is_started(&desc->irq_data)) {
+	if (irqd_is_started(d)) {
 		irq_enable(desc);
 	} else {
-		ret = __irq_startup(desc);
-		irq_setup_affinity(desc);
+		switch (__irq_startup_managed(desc, aff, force)) {
+		case IRQ_STARTUP_NORMAL:
+			ret = __irq_startup(desc);
+			irq_setup_affinity(desc);
+			break;
+		case IRQ_STARTUP_MANAGED:
+			ret = __irq_startup(desc);
+			irq_set_affinity_locked(d, aff, false);
+			break;
+		case IRQ_STARTUP_ABORT:
+			return 0;
+		}
 	}
 	if (resend)
 		check_irq_resend(desc);
-- 
cgit v1.3-14-g43fede


From c5cb83bb337c25caae995d992d1cdf9b317f83de Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:51 +0200
Subject: genirq/cpuhotplug: Handle managed IRQs on CPU hotplug

If a CPU goes offline, interrupts affine to the CPU are moved away. If the
outgoing CPU is the last CPU in the affinity mask the migration code breaks
the affinity and sets it to all online CPUs.

This is a problem for affinity managed interrupts as CPU hotplug is often
used for power management purposes. If the affinity is broken, the
interrupt is no longer affine to the CPUs to which it was allocated.

The affinity spreading allows to lay out multi queue devices in a way that
they are assigned to a single CPU or a group of CPUs. If the last CPU goes
offline, then the queue is no longer used, so the interrupt can be
shutdown gracefully and parked until one of the assigned CPUs comes online
again.

Add a graceful shutdown mechanism into the irq affinity breaking code path,
mark the irq as MANAGED_SHUTDOWN and leave the affinity mask unmodified.

In the online path, scan the active interrupts for managed interrupts and
if the interrupt is functional and the newly online CPU is part of the
affinity mask, restart the interrupt if it is marked MANAGED_SHUTDOWN or if
the interrupt is started up, try to add the CPU back to the effective
affinity mask.

Originally-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170619235447.273417334@linutronix.de
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/irq.h        |  5 +++++
 kernel/cpu.c               |  5 +++++
 kernel/irq/cpuhotplug.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0f2a80377520..c15f22c54535 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -124,6 +124,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE_IDLE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
+	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_PERF_ONLINE,
 	CPUHP_AP_PERF_X86_ONLINE,
 	CPUHP_AP_PERF_X86_UNCORE_ONLINE,
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 807042b46af1..19cea6326599 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -500,7 +500,12 @@ extern int irq_set_affinity_locked(struct irq_data *data,
 				   const struct cpumask *cpumask, bool force);
 extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION)
 extern void irq_migrate_all_off_this_cpu(void);
+extern int irq_affinity_online_cpu(unsigned int cpu);
+#else
+# define irq_affinity_online_cpu	NULL
+#endif
 
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void irq_move_irq(struct irq_data *data);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cb5103413bd8..b86b32ebb3b2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1252,6 +1252,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.startup.single		= smpboot_unpark_threads,
 		.teardown.single	= NULL,
 	},
+	[CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
+		.name			= "irq/affinity:online",
+		.startup.single		= irq_affinity_online_cpu,
+		.teardown.single	= NULL,
+	},
 	[CPUHP_AP_PERF_ONLINE] = {
 		.name			= "perf:online",
 		.startup.single		= perf_event_init_cpu,
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 0b093db3336b..b7964e72ded7 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -83,6 +83,15 @@ static bool migrate_one_irq(struct irq_desc *desc)
 		chip->irq_mask(d);
 
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+		/*
+		 * If the interrupt is managed, then shut it down and leave
+		 * the affinity untouched.
+		 */
+		if (irqd_affinity_is_managed(d)) {
+			irqd_set_managed_shutdown(d);
+			irq_shutdown(desc);
+			return false;
+		}
 		affinity = cpu_online_mask;
 		brokeaff = true;
 	}
@@ -129,3 +138,39 @@ void irq_migrate_all_off_this_cpu(void)
 		}
 	}
 }
+
+static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
+{
+	struct irq_data *data = irq_desc_get_irq_data(desc);
+	const struct cpumask *affinity = irq_data_get_affinity_mask(data);
+
+	if (!irqd_affinity_is_managed(data) || !desc->action ||
+	    !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
+		return;
+
+	if (irqd_is_managed_and_shutdown(data))
+		irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+	else
+		irq_set_affinity_locked(data, affinity, false);
+}
+
+/**
+ * irq_affinity_online_cpu - Restore affinity for managed interrupts
+ * @cpu:	Upcoming CPU for which interrupts should be restored
+ */
+int irq_affinity_online_cpu(unsigned int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+
+	irq_lock_sparse();
+	for_each_active_irq(irq) {
+		desc = irq_to_desc(irq);
+		raw_spin_lock_irq(&desc->lock);
+		irq_restore_affinity_of_irq(desc, cpu);
+		raw_spin_unlock_irq(&desc->lock);
+	}
+	irq_unlock_sparse();
+
+	return 0;
+}
-- 
cgit v1.3-14-g43fede


From d52dd44175bd27ad9d8e34a994fb80877c1f6d61 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:52 +0200
Subject: genirq: Introduce IRQD_SINGLE_TARGET flag

Many interrupt chips allow only a single CPU as interrupt target. The core
code has no knowledge about that. That's unfortunate as it could avoid
trying to re-add a newly online CPU to the effective affinity mask.

Add the status flag and the necessary accessors.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235447.352343969@linutronix.de
---
 include/linux/irq.h  | 16 ++++++++++++++++
 kernel/irq/debugfs.c |  1 +
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 19cea6326599..00db35b61e9e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -209,6 +209,7 @@ struct irq_data {
  * IRQD_IRQ_STARTED		- Startup state of the interrupt
  * IRQD_MANAGED_SHUTDOWN	- Interrupt was shutdown due to empty affinity
  *				  mask. Applies only to affinity managed irqs.
+ * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -228,6 +229,7 @@ enum {
 	IRQD_AFFINITY_MANAGED		= (1 << 21),
 	IRQD_IRQ_STARTED		= (1 << 22),
 	IRQD_MANAGED_SHUTDOWN		= (1 << 23),
+	IRQD_SINGLE_TARGET		= (1 << 24),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -276,6 +278,20 @@ static inline bool irqd_is_level_type(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_LEVEL;
 }
 
+/*
+ * Must only be called from irqchip.irq_set_affinity() or low level
+ * hierarchy domain allocation functions.
+ */
+static inline void irqd_set_single_target(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_SINGLE_TARGET;
+}
+
+static inline bool irqd_is_single_target(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_SINGLE_TARGET;
+}
+
 static inline bool irqd_is_wakeup_set(struct irq_data *d)
 {
 	return __irqd_to_state(d) & IRQD_WAKEUP_STATE;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index edbef252d0c4..dbd6e78db213 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -105,6 +105,7 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_PER_CPU),
 	BIT_MASK_DESCR(IRQD_NO_BALANCING),
 
+	BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
 	BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
 	BIT_MASK_DESCR(IRQD_AFFINITY_SET),
 	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
-- 
cgit v1.3-14-g43fede


From 8f31a9845db348f5781df47ce04c79e4cfe90016 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jun 2017 01:37:53 +0200
Subject: genirq/cpuhotplug: Avoid irq affinity setting for single targets

Avoid trying to add a newly online CPU to the effective affinity mask of a
started up interrupt. That interrupt will either stay on the already online
CPU or move around for no value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/20170619235447.431321047@linutronix.de
---
 kernel/irq/cpuhotplug.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index b7964e72ded7..aee8f7ec40af 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -148,9 +148,17 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
 	    !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
 		return;
 
-	if (irqd_is_managed_and_shutdown(data))
+	if (irqd_is_managed_and_shutdown(data)) {
 		irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
-	else
+		return;
+	}
+
+	/*
+	 * If the interrupt can only be directed to a single target
+	 * CPU then it is already assigned to a CPU in the affinity
+	 * mask. No point in trying to move it around.
+	 */
+	if (!irqd_is_single_target(data))
 		irq_set_affinity_locked(data, affinity, false);
 }
 
-- 
cgit v1.3-14-g43fede


From 9a0ef98e186d86fb3c1ff3ec267a76f067005f74 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 20 Jun 2017 01:37:55 +0200
Subject: genirq/affinity: Assign vectors to all present CPUs

Currently the irq vector spread algorithm is restricted to online CPUs,
which ties the IRQ mapping to the currently online devices and doesn't deal
nicely with the fact that CPUs could come and go rapidly due to e.g. power
management.

Instead assign vectors to all present CPUs to avoid this churn.

Build a map of all possible CPUs for a given node, as the architectures
only provide a map of all online CPUs. Do this dynamically on each call
for the vector assignments, which is a bit suboptimal and could be
optimized in the future by providing a mapping from the arch code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linux-nvme@lists.infradead.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170603140403.27379-5-hch@lst.de
---
 kernel/irq/affinity.c | 76 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e2d356dd7581..d2747f9c5707 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -1,4 +1,7 @@
-
+/*
+ * Copyright (C) 2016 Thomas Gleixner.
+ * Copyright (C) 2016-2017 Christoph Hellwig.
+ */
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -35,13 +38,54 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 	}
 }
 
-static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
+static cpumask_var_t *alloc_node_to_present_cpumask(void)
+{
+	cpumask_var_t *masks;
+	int node;
+
+	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
+	if (!masks)
+		return NULL;
+
+	for (node = 0; node < nr_node_ids; node++) {
+		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
+			goto out_unwind;
+	}
+
+	return masks;
+
+out_unwind:
+	while (--node >= 0)
+		free_cpumask_var(masks[node]);
+	kfree(masks);
+	return NULL;
+}
+
+static void free_node_to_present_cpumask(cpumask_var_t *masks)
+{
+	int node;
+
+	for (node = 0; node < nr_node_ids; node++)
+		free_cpumask_var(masks[node]);
+	kfree(masks);
+}
+
+static void build_node_to_present_cpumask(cpumask_var_t *masks)
+{
+	int cpu;
+
+	for_each_present_cpu(cpu)
+		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
+}
+
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
+				const struct cpumask *mask, nodemask_t *nodemsk)
 {
 	int n, nodes = 0;
 
 	/* Calculate the number of nodes in the supplied affinity mask */
-	for_each_online_node(n) {
-		if (cpumask_intersects(mask, cpumask_of_node(n))) {
+	for_each_node(n) {
+		if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
 			node_set(n, *nodemsk);
 			nodes++;
 		}
@@ -64,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	int last_affv = affv + affd->pre_vectors;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 	struct cpumask *masks;
-	cpumask_var_t nmsk;
+	cpumask_var_t nmsk, *node_to_present_cpumask;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
 		return NULL;
@@ -73,13 +117,19 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	if (!masks)
 		goto out;
 
+	node_to_present_cpumask = alloc_node_to_present_cpumask();
+	if (!node_to_present_cpumask)
+		goto out;
+
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
 
 	/* Stabilize the cpumasks */
 	get_online_cpus();
-	nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk);
+	build_node_to_present_cpumask(node_to_present_cpumask);
+	nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
+				     &nodemsk);
 
 	/*
 	 * If the number of nodes in the mask is greater than or equal the
@@ -87,7 +137,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	 */
 	if (affv <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_copy(masks + curvec, cpumask_of_node(n));
+			cpumask_copy(masks + curvec,
+				     node_to_present_cpumask[n]);
 			if (++curvec == last_affv)
 				break;
 		}
@@ -101,7 +152,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
+		cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
 
 		/* Calculate the number of cpus per vector */
 		ncpus = cpumask_weight(nmsk);
@@ -133,6 +184,7 @@ done:
 	/* Fill out vectors at the end that don't need affinity */
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
+	free_node_to_present_cpumask(node_to_present_cpumask);
 out:
 	free_cpumask_var(nmsk);
 	return masks;
@@ -147,12 +199,10 @@ int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
 {
 	int resv = affd->pre_vectors + affd->post_vectors;
 	int vecs = maxvec - resv;
-	int cpus;
+	int ret;
 
-	/* Stabilize the cpumasks */
 	get_online_cpus();
-	cpus = cpumask_weight(cpu_online_mask);
+	ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
 	put_online_cpus();
-
-	return min(cpus, vecs) + resv;
+	return ret;
 }
-- 
cgit v1.3-14-g43fede


From 61d0a000b7746665c7cfcff766532f6f2a922a61 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 22 Jun 2017 11:34:57 +0100
Subject: genirq/irqdomain: Add irq_domain_update_bus_token helper

We can have irq domains that are identified by the same fwnode
(because they are serviced by the same HW), and yet have different
functionality (because they serve different busses, for example).
This is what we use the bus_token field for.

Since we don't use this field when generating the domain name,
all the aliasing domains will get the same name, and the debugfs
file creation fails. Also, bus_token is updated by individual drivers,
and the core code is unaware of that update.

In order to sort this mess, let's introduce a helper that takes care
of updating bus_token, and regenerate the debugfs file.

A separate patch will update all the individual users.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h |  3 +++
 kernel/irq/irqdomain.c    | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 914b0c31d233..222f47af12f4 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -273,6 +273,9 @@ static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode)
 	return fwnode && fwnode->type == FWNODE_IRQCHIP;
 }
 
+extern void irq_domain_update_bus_token(struct irq_domain *domain,
+					enum irq_domain_bus_token bus_token);
+
 static inline
 struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
 					    enum irq_domain_bus_token bus_token)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 75e1f0851c33..f6adeaeb4c16 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -245,6 +245,37 @@ void irq_domain_remove(struct irq_domain *domain)
 }
 EXPORT_SYMBOL_GPL(irq_domain_remove);
 
+void irq_domain_update_bus_token(struct irq_domain *domain,
+				 enum irq_domain_bus_token bus_token)
+{
+	char *name;
+
+	if (domain->bus_token == bus_token)
+		return;
+
+	mutex_lock(&irq_domain_mutex);
+
+	domain->bus_token = bus_token;
+
+	name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token);
+	if (!name) {
+		mutex_unlock(&irq_domain_mutex);
+		return;
+	}
+
+	debugfs_remove_domain_dir(domain);
+
+	if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED)
+		kfree(domain->name);
+	else
+		domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+
+	domain->name = name;
+	debugfs_add_domain_dir(domain);
+
+	mutex_unlock(&irq_domain_mutex);
+}
+
 /**
  * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
  * @of_node: pointer to interrupt controller's device tree node.
-- 
cgit v1.3-14-g43fede


From 6a6544e520abecd484ab8b67fb50d1fc003f3275 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 20 Jun 2017 22:17:44 +0100
Subject: genirq/irqdomain: Remove auto-recursive hierarchy support

It did seem like a good idea at the time, but it never really
caught on, and auto-recursive domains remain unused 3 years after
having been introduced.

Oh well, time for a late spring cleanup.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h |  9 +++-----
 kernel/irq/irqdomain.c    | 55 +++++++++++------------------------------------
 kernel/irq/msi.c          |  2 +-
 3 files changed, 17 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 222f47af12f4..cac77a5c5555 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -180,8 +180,8 @@ enum {
 	/* Irq domain is hierarchical */
 	IRQ_DOMAIN_FLAG_HIERARCHY	= (1 << 0),
 
-	/* Core calls alloc/free recursive through the domain hierarchy. */
-	IRQ_DOMAIN_FLAG_AUTO_RECURSIVE	= (1 << 1),
+	/* Irq domain name was allocated in __irq_domain_add() */
+	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 6),
 
 	/* Irq domain is an IPI domain with virq per cpu */
 	IRQ_DOMAIN_FLAG_IPI_PER_CPU	= (1 << 2),
@@ -195,9 +195,6 @@ enum {
 	/* Irq domain implements MSI remapping */
 	IRQ_DOMAIN_FLAG_MSI_REMAP	= (1 << 5),
 
-	/* Irq domain name was allocated in __irq_domain_add() */
-	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 6),
-
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
@@ -448,7 +445,7 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
 				       NULL);
 }
 
-extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
 					   unsigned int irq_base,
 					   unsigned int nr_irqs, void *arg);
 extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f6adeaeb4c16..14fe862aa2e3 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1342,43 +1342,18 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
 }
 
-static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
-{
-	return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
-}
-
-static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
 					   unsigned int irq_base,
 					   unsigned int nr_irqs)
 {
 	domain->ops->free(domain, irq_base, nr_irqs);
-	if (irq_domain_is_auto_recursive(domain)) {
-		BUG_ON(!domain->parent);
-		irq_domain_free_irqs_recursive(domain->parent, irq_base,
-					       nr_irqs);
-	}
 }
 
-int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
 				    unsigned int irq_base,
 				    unsigned int nr_irqs, void *arg)
 {
-	int ret = 0;
-	struct irq_domain *parent = domain->parent;
-	bool recursive = irq_domain_is_auto_recursive(domain);
-
-	BUG_ON(recursive && !parent);
-	if (recursive)
-		ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
-						      nr_irqs, arg);
-	if (ret < 0)
-		return ret;
-
-	ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
-	if (ret < 0 && recursive)
-		irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
-
-	return ret;
+	return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
 }
 
 /**
@@ -1439,7 +1414,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 	}
 
 	mutex_lock(&irq_domain_mutex);
-	ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
+	ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
 	if (ret < 0) {
 		mutex_unlock(&irq_domain_mutex);
 		goto out_free_irq_data;
@@ -1474,7 +1449,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
 	mutex_lock(&irq_domain_mutex);
 	for (i = 0; i < nr_irqs; i++)
 		irq_domain_remove_irq(virq + i);
-	irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+	irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs);
 	mutex_unlock(&irq_domain_mutex);
 
 	irq_domain_free_irq_data(virq, nr_irqs);
@@ -1494,15 +1469,11 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
 				 unsigned int irq_base, unsigned int nr_irqs,
 				 void *arg)
 {
-	/* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
-	if (irq_domain_is_auto_recursive(domain))
-		return 0;
+	if (!domain->parent)
+		return -ENOSYS;
 
-	domain = domain->parent;
-	if (domain)
-		return irq_domain_alloc_irqs_recursive(domain, irq_base,
-						       nr_irqs, arg);
-	return -ENOSYS;
+	return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base,
+					       nr_irqs, arg);
 }
 EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
 
@@ -1517,10 +1488,10 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
 void irq_domain_free_irqs_parent(struct irq_domain *domain,
 				 unsigned int irq_base, unsigned int nr_irqs)
 {
-	/* irq_domain_free_irqs_recursive() will call parent's free */
-	if (!irq_domain_is_auto_recursive(domain) && domain->parent)
-		irq_domain_free_irqs_recursive(domain->parent, irq_base,
-					       nr_irqs);
+	if (!domain->parent)
+		return;
+
+	irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs);
 }
 EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9e3f1857c6bd..48eadf416c24 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -315,7 +315,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
 
 		ops->set_desc(arg, desc);
 		/* Assumes the domain mutex is held! */
-		ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg);
+		ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
 		if (ret)
 			break;
 
-- 
cgit v1.3-14-g43fede


From feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 22 Jun 2017 17:04:55 -0400
Subject: tracing: Show address when function names are not found

Currently, when a function is not found in kallsyms, instead of simply
showing the function address, it shows nothing at all:

 # echo ':mod:kvm_intel' > /sys/kernel/tracing/set_ftrace_filter
 # echo function > /sys/kernel/tracing/set_ftrace_filter
 # qemu -enable-kvm /home/my-qemu-image
   <Ctrl-C>
 # rmmod kvm_intel
 # cat /sys/kernel/tracing/trace
 qemu-system-x86-2408  [001] d..2   135.013238:  <-kvm_arch_hardware_enable
 qemu-system-x86-2408  [001] ....   135.014574:  <-kvm_arch_vm_ioctl
 qemu-system-x86-2408  [001] ....   135.015420:  <-kvm_vm_ioctl_check_extension
 qemu-system-x86-2408  [001] ....   135.045411:  <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412:  <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412:  <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412:  <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ...1   135.045413:  <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045413:  <-__do_cpuid_ent

When it should show:

 qemu-system-x86-2408  [001] d..2   135.013238: 0xffffffffa02a39f0 <-kvm_arch_hardware_enable
 qemu-system-x86-2408  [001] ....   135.014574: 0xffffffffa02a2ba0 <-kvm_arch_vm_ioctl
 qemu-system-x86-2408  [001] ....   135.015420: 0xffffffffa029e4e0 <-kvm_vm_ioctl_check_extension
 qemu-system-x86-2408  [001] ....   135.045411: 0xffffffffa02a1380 <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412: 0xffffffffa029e160 <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412: 0xffffffffa029e180 <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045412: 0xffffffffa029e520 <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ...1   135.045413: 0xffffffffa02a13b0 <-__do_cpuid_ent
 qemu-system-x86-2408  [001] ....   135.045413: 0xffffffffa02a1380 <-__do_cpuid_ent

instead.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..01ff99969ca7 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
 static void
 seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
 {
-#ifdef CONFIG_KALLSYMS
 	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
 	const char *name;
 
 	kallsyms_lookup(address, NULL, NULL, NULL, str);
 
 	name = kretprobed(str);
 
-	trace_seq_printf(s, fmt, name);
+	if (name && strlen(name)) {
+		trace_seq_printf(s, fmt, name);
+		return;
+	}
 #endif
+	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+	trace_seq_printf(s, fmt, str);
 }
 
 static void
 seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 		     unsigned long address)
 {
-#ifdef CONFIG_KALLSYMS
 	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
 	const char *name;
 
 	sprint_symbol(str, address);
 	name = kretprobed(str);
 
-	trace_seq_printf(s, fmt, name);
+	if (name && strlen(name)) {
+		trace_seq_printf(s, fmt, name);
+		return;
+	}
 #endif
+	snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+	trace_seq_printf(s, fmt, str);
 }
 
 #ifndef CONFIG_64BIT
-- 
cgit v1.3-14-g43fede


From e1d4eeec5aaa28d25f249c0195b0e1d9b9feb7bd Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Wed, 14 Jun 2017 13:19:23 -0400
Subject: sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled

Make CONFIG_CPUSETS=y depend on SMP as this feature makes no sense
on UP. This allows for configuring out cpuset_cpumask_can_shrink()
and task_can_attach() entirely, which shrinks the kernel a bit.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170614171926.8345-2-nicolas.pitre@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 init/Kconfig        | 1 +
 kernel/sched/core.c | 7 +++----
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..c359038ebeed 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1156,6 +1156,7 @@ config CGROUP_HUGETLB
 
 config CPUSETS
 	bool "Cpuset controller"
+	depends on SMP
 	help
 	  This option will let you create and manage CPUSETs which
 	  allow dynamically partitioning a system into sets of CPUs and
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62166da1c359..7faf4b322b63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5451,6 +5451,8 @@ void init_idle(struct task_struct *idle, int cpu)
 #endif
 }
 
+#ifdef CONFIG_SMP
+
 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 			      const struct cpumask *trial)
 {
@@ -5494,7 +5496,6 @@ int task_can_attach(struct task_struct *p,
 		goto out;
 	}
 
-#ifdef CONFIG_SMP
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
 					      cs_cpus_allowed)) {
 		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
@@ -5524,13 +5525,11 @@ int task_can_attach(struct task_struct *p,
 		rcu_read_unlock_sched();
 
 	}
-#endif
+
 out:
 	return ret;
 }
 
-#ifdef CONFIG_SMP
-
 bool sched_smp_initialized __read_mostly;
 
 #ifdef CONFIG_NUMA_BALANCING
-- 
cgit v1.3-14-g43fede


From 06a76fe08d4daaeea01ca0f175ad29f40c781ece Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Wed, 21 Jun 2017 14:22:01 -0400
Subject: sched/deadline: Move DL related code from sched/core.c to
 sched/deadline.c

This helps make sched/core.c smaller and hopefully easier to understand and maintain.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170621182203.30626-2-nicolas.pitre@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 343 +----------------------------------------------
 kernel/sched/deadline.c | 344 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h    |  17 ++-
 3 files changed, 364 insertions(+), 340 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7faf4b322b63..54e1b0700af3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2139,25 +2139,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
 
-/*
- * This function clears the sched_dl_entity static params.
- */
-void __dl_clear_params(struct task_struct *p)
-{
-	struct sched_dl_entity *dl_se = &p->dl;
-
-	dl_se->dl_runtime = 0;
-	dl_se->dl_deadline = 0;
-	dl_se->dl_period = 0;
-	dl_se->flags = 0;
-	dl_se->dl_bw = 0;
-	dl_se->dl_density = 0;
-
-	dl_se->dl_throttled = 0;
-	dl_se->dl_yielded = 0;
-	dl_se->dl_non_contending = 0;
-}
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -2438,101 +2419,6 @@ unsigned long to_ratio(u64 period, u64 runtime)
 	return div64_u64(runtime << BW_SHIFT, period);
 }
 
-#ifdef CONFIG_SMP
-inline struct dl_bw *dl_bw_of(int i)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
-			 "sched RCU must be held");
-	return &cpu_rq(i)->rd->dl_bw;
-}
-
-inline int dl_bw_cpus(int i)
-{
-	struct root_domain *rd = cpu_rq(i)->rd;
-	int cpus = 0;
-
-	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
-			 "sched RCU must be held");
-	for_each_cpu_and(i, rd->span, cpu_active_mask)
-		cpus++;
-
-	return cpus;
-}
-#else
-inline struct dl_bw *dl_bw_of(int i)
-{
-	return &cpu_rq(i)->dl.dl_bw;
-}
-
-inline int dl_bw_cpus(int i)
-{
-	return 1;
-}
-#endif
-
-/*
- * We must be sure that accepting a new task (or allowing changing the
- * parameters of an existing one) is consistent with the bandwidth
- * constraints. If yes, this function also accordingly updates the currently
- * allocated bandwidth to reflect the new situation.
- *
- * This function is called while holding p's rq->lock.
- */
-static int dl_overflow(struct task_struct *p, int policy,
-		       const struct sched_attr *attr)
-{
-
-	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
-	u64 period = attr->sched_period ?: attr->sched_deadline;
-	u64 runtime = attr->sched_runtime;
-	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-	int cpus, err = -1;
-
-	/* !deadline task may carry old deadline bandwidth */
-	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
-		return 0;
-
-	/*
-	 * Either if a task, enters, leave, or stays -deadline but changes
-	 * its parameters, we may need to update accordingly the total
-	 * allocated bandwidth of the container.
-	 */
-	raw_spin_lock(&dl_b->lock);
-	cpus = dl_bw_cpus(task_cpu(p));
-	if (dl_policy(policy) && !task_has_dl_policy(p) &&
-	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
-		if (hrtimer_active(&p->dl.inactive_timer))
-			__dl_clear(dl_b, p->dl.dl_bw, cpus);
-		__dl_add(dl_b, new_bw, cpus);
-		err = 0;
-	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
-		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
-		/*
-		 * XXX this is slightly incorrect: when the task
-		 * utilization decreases, we should delay the total
-		 * utilization change until the task's 0-lag point.
-		 * But this would require to set the task's "inactive
-		 * timer" when the task is not inactive.
-		 */
-		__dl_clear(dl_b, p->dl.dl_bw, cpus);
-		__dl_add(dl_b, new_bw, cpus);
-		dl_change_utilization(p, new_bw);
-		err = 0;
-	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
-		/*
-		 * Do not decrease the total deadline utilization here,
-		 * switched_from_dl() will take care to do it at the correct
-		 * (0-lag) time.
-		 */
-		err = 0;
-	}
-	raw_spin_unlock(&dl_b->lock);
-
-	return err;
-}
-
-extern void init_dl_bw(struct dl_bw *dl_b);
-
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
@@ -4014,27 +3900,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/*
- * This function initializes the sched_dl_entity of a newly becoming
- * SCHED_DEADLINE task.
- *
- * Only the static values are considered here, the actual runtime and the
- * absolute deadline will be properly calculated when the task is enqueued
- * for the first time with its new policy.
- */
-static void
-__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
-{
-	struct sched_dl_entity *dl_se = &p->dl;
-
-	dl_se->dl_runtime = attr->sched_runtime;
-	dl_se->dl_deadline = attr->sched_deadline;
-	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
-	dl_se->flags = attr->sched_flags;
-	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
-}
-
 /*
  * sched_setparam() passes in -1 for its policy, to let the functions
  * it calls know not to change it.
@@ -4088,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 		p->sched_class = &fair_sched_class;
 }
 
-static void
-__getparam_dl(struct task_struct *p, struct sched_attr *attr)
-{
-	struct sched_dl_entity *dl_se = &p->dl;
-
-	attr->sched_priority = p->rt_priority;
-	attr->sched_runtime = dl_se->dl_runtime;
-	attr->sched_deadline = dl_se->dl_deadline;
-	attr->sched_period = dl_se->dl_period;
-	attr->sched_flags = dl_se->flags;
-}
-
-/*
- * This function validates the new parameters of a -deadline task.
- * We ask for the deadline not being zero, and greater or equal
- * than the runtime, as well as the period of being zero or
- * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution of 1us (we
- * check sched_runtime only since it is always the smaller one) and
- * below 2^63 ns (we have to check both sched_deadline and
- * sched_period, as the latter can be zero).
- */
-static bool
-__checkparam_dl(const struct sched_attr *attr)
-{
-	/* deadline != 0 */
-	if (attr->sched_deadline == 0)
-		return false;
-
-	/*
-	 * Since we truncate DL_SCALE bits, make sure we're at least
-	 * that big.
-	 */
-	if (attr->sched_runtime < (1ULL << DL_SCALE))
-		return false;
-
-	/*
-	 * Since we use the MSB for wrap-around and sign issues, make
-	 * sure it's not set (mind that period can be equal to zero).
-	 */
-	if (attr->sched_deadline & (1ULL << 63) ||
-	    attr->sched_period & (1ULL << 63))
-		return false;
-
-	/* runtime <= deadline <= period (if period != 0) */
-	if ((attr->sched_period != 0 &&
-	     attr->sched_period < attr->sched_deadline) ||
-	    attr->sched_deadline < attr->sched_runtime)
-		return false;
-
-	return true;
-}
-
 /*
  * Check the target process has a UID that matches the current process's:
  */
@@ -4157,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
-{
-	struct sched_dl_entity *dl_se = &p->dl;
-
-	if (dl_se->dl_runtime != attr->sched_runtime ||
-		dl_se->dl_deadline != attr->sched_deadline ||
-		dl_se->dl_period != attr->sched_period ||
-		dl_se->flags != attr->sched_flags)
-		return true;
-
-	return false;
-}
-
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user, bool pi)
@@ -4350,7 +4149,7 @@ change:
 	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
 	 * is available.
 	 */
-	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
 		task_rq_unlock(rq, p, &rf);
 		return -EBUSY;
 	}
@@ -5456,23 +5255,12 @@ void init_idle(struct task_struct *idle, int cpu)
 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 			      const struct cpumask *trial)
 {
-	int ret = 1, trial_cpus;
-	struct dl_bw *cur_dl_b;
-	unsigned long flags;
+	int ret = 1;
 
 	if (!cpumask_weight(cur))
 		return ret;
 
-	rcu_read_lock_sched();
-	cur_dl_b = dl_bw_of(cpumask_any(cur));
-	trial_cpus = cpumask_weight(trial);
-
-	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
-	if (cur_dl_b->bw != -1 &&
-	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
-		ret = 0;
-	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
-	rcu_read_unlock_sched();
+	ret = dl_cpuset_cpumask_can_shrink(cur, trial);
 
 	return ret;
 }
@@ -5497,34 +5285,8 @@ int task_can_attach(struct task_struct *p,
 	}
 
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_cpus_allowed)) {
-		unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
-							cs_cpus_allowed);
-		struct dl_bw *dl_b;
-		bool overflow;
-		int cpus;
-		unsigned long flags;
-
-		rcu_read_lock_sched();
-		dl_b = dl_bw_of(dest_cpu);
-		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		cpus = dl_bw_cpus(dest_cpu);
-		overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
-		if (overflow)
-			ret = -EBUSY;
-		else {
-			/*
-			 * We reserve space for this task in the destination
-			 * root_domain, as we can't fail after this point.
-			 * We will free resources in the source root_domain
-			 * later on (see set_cpus_allowed_dl()).
-			 */
-			__dl_add(dl_b, p->dl.dl_bw, cpus);
-		}
-		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-		rcu_read_unlock_sched();
-
-	}
+					      cs_cpus_allowed))
+		ret = dl_task_can_attach(p, cs_cpus_allowed);
 
 out:
 	return ret;
@@ -5792,23 +5554,8 @@ static void cpuset_cpu_active(void)
 
 static int cpuset_cpu_inactive(unsigned int cpu)
 {
-	unsigned long flags;
-	struct dl_bw *dl_b;
-	bool overflow;
-	int cpus;
-
 	if (!cpuhp_tasks_frozen) {
-		rcu_read_lock_sched();
-		dl_b = dl_bw_of(cpu);
-
-		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		cpus = dl_bw_cpus(cpu);
-		overflow = __dl_overflow(dl_b, cpus, 0, 0);
-		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
-		rcu_read_unlock_sched();
-
-		if (overflow)
+		if (dl_cpu_busy(cpu))
 			return -EBUSY;
 		cpuset_update_active_cpus();
 	} else {
@@ -6711,84 +6458,6 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static int sched_dl_global_validate(void)
-{
-	u64 runtime = global_rt_runtime();
-	u64 period = global_rt_period();
-	u64 new_bw = to_ratio(period, runtime);
-	struct dl_bw *dl_b;
-	int cpu, ret = 0;
-	unsigned long flags;
-
-	/*
-	 * Here we want to check the bandwidth not being set to some
-	 * value smaller than the currently allocated bandwidth in
-	 * any of the root_domains.
-	 *
-	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-	 * cycling on root_domains... Discussion on different/better
-	 * solutions is welcome!
-	 */
-	for_each_possible_cpu(cpu) {
-		rcu_read_lock_sched();
-		dl_b = dl_bw_of(cpu);
-
-		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		if (new_bw < dl_b->total_bw)
-			ret = -EBUSY;
-		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
-		rcu_read_unlock_sched();
-
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
-{
-	if (global_rt_runtime() == RUNTIME_INF) {
-		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
-		dl_rq->extra_bw = 1 << BW_SHIFT;
-	} else {
-		dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
-			  global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
-		dl_rq->extra_bw = to_ratio(global_rt_period(),
-						    global_rt_runtime());
-	}
-}
-
-static void sched_dl_do_global(void)
-{
-	u64 new_bw = -1;
-	struct dl_bw *dl_b;
-	int cpu;
-	unsigned long flags;
-
-	def_dl_bandwidth.dl_period = global_rt_period();
-	def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
-	if (global_rt_runtime() != RUNTIME_INF)
-		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
-
-	/*
-	 * FIXME: As above...
-	 */
-	for_each_possible_cpu(cpu) {
-		rcu_read_lock_sched();
-		dl_b = dl_bw_of(cpu);
-
-		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		dl_b->bw = new_bw;
-		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
-		rcu_read_unlock_sched();
-		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
-	}
-}
-
 static int sched_rt_global_validate(void)
 {
 	if (sysctl_sched_rt_period <= 0)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e12f85975857..a84299f44b5d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,7 @@
 #include "sched.h"
 
 #include <linux/slab.h>
+#include <uapi/linux/sched/types.h>
 
 struct dl_bandwidth def_dl_bandwidth;
 
@@ -43,6 +44,38 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
 }
 
+#ifdef CONFIG_SMP
+static inline struct dl_bw *dl_bw_of(int i)
+{
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+	return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	struct root_domain *rd = cpu_rq(i)->rd;
+	int cpus = 0;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+			 "sched RCU must be held");
+	for_each_cpu_and(i, rd->span, cpu_active_mask)
+		cpus++;
+
+	return cpus;
+}
+#else
+static inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	return 1;
+}
+#endif
+
 static inline
 void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
@@ -2318,6 +2351,317 @@ const struct sched_class dl_sched_class = {
 	.update_curr		= update_curr_dl,
 };
 
+int sched_dl_global_validate(void)
+{
+	u64 runtime = global_rt_runtime();
+	u64 period = global_rt_period();
+	u64 new_bw = to_ratio(period, runtime);
+	struct dl_bw *dl_b;
+	int cpu, ret = 0;
+	unsigned long flags;
+
+	/*
+	 * Here we want to check the bandwidth not being set to some
+	 * value smaller than the currently allocated bandwidth in
+	 * any of the root_domains.
+	 *
+	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+	 * cycling on root_domains... Discussion on different/better
+	 * solutions is welcome!
+	 */
+	for_each_possible_cpu(cpu) {
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		if (new_bw < dl_b->total_bw)
+			ret = -EBUSY;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+		rcu_read_unlock_sched();
+
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
+{
+	if (global_rt_runtime() == RUNTIME_INF) {
+		dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+		dl_rq->extra_bw = 1 << BW_SHIFT;
+	} else {
+		dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
+			  global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+		dl_rq->extra_bw = to_ratio(global_rt_period(),
+						    global_rt_runtime());
+	}
+}
+
+void sched_dl_do_global(void)
+{
+	u64 new_bw = -1;
+	struct dl_bw *dl_b;
+	int cpu;
+	unsigned long flags;
+
+	def_dl_bandwidth.dl_period = global_rt_period();
+	def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+	if (global_rt_runtime() != RUNTIME_INF)
+		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+	/*
+	 * FIXME: As above...
+	 */
+	for_each_possible_cpu(cpu) {
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		dl_b->bw = new_bw;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+		rcu_read_unlock_sched();
+		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+	}
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+int sched_dl_overflow(struct task_struct *p, int policy,
+		      const struct sched_attr *attr)
+{
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	u64 period = attr->sched_period ?: attr->sched_deadline;
+	u64 runtime = attr->sched_runtime;
+	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+	int cpus, err = -1;
+
+	/* !deadline task may carry old deadline bandwidth */
+	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
+		return 0;
+
+	/*
+	 * Either if a task, enters, leave, or stays -deadline but changes
+	 * its parameters, we may need to update accordingly the total
+	 * allocated bandwidth of the container.
+	 */
+	raw_spin_lock(&dl_b->lock);
+	cpus = dl_bw_cpus(task_cpu(p));
+	if (dl_policy(policy) && !task_has_dl_policy(p) &&
+	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		if (hrtimer_active(&p->dl.inactive_timer))
+			__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_add(dl_b, new_bw, cpus);
+		err = 0;
+	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
+		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		/*
+		 * XXX this is slightly incorrect: when the task
+		 * utilization decreases, we should delay the total
+		 * utilization change until the task's 0-lag point.
+		 * But this would require to set the task's "inactive
+		 * timer" when the task is not inactive.
+		 */
+		__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_add(dl_b, new_bw, cpus);
+		dl_change_utilization(p, new_bw);
+		err = 0;
+	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+		/*
+		 * Do not decrease the total deadline utilization here,
+		 * switched_from_dl() will take care to do it at the correct
+		 * (0-lag) time.
+		 */
+		err = 0;
+	}
+	raw_spin_unlock(&dl_b->lock);
+
+	return err;
+}
+
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
+void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	dl_se->dl_runtime = attr->sched_runtime;
+	dl_se->dl_deadline = attr->sched_deadline;
+	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+	dl_se->flags = attr->sched_flags;
+	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+}
+
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	attr->sched_priority = p->rt_priority;
+	attr->sched_runtime = dl_se->dl_runtime;
+	attr->sched_deadline = dl_se->dl_deadline;
+	attr->sched_period = dl_se->dl_period;
+	attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
+ */
+bool __checkparam_dl(const struct sched_attr *attr)
+{
+	/* deadline != 0 */
+	if (attr->sched_deadline == 0)
+		return false;
+
+	/*
+	 * Since we truncate DL_SCALE bits, make sure we're at least
+	 * that big.
+	 */
+	if (attr->sched_runtime < (1ULL << DL_SCALE))
+		return false;
+
+	/*
+	 * Since we use the MSB for wrap-around and sign issues, make
+	 * sure it's not set (mind that period can be equal to zero).
+	 */
+	if (attr->sched_deadline & (1ULL << 63) ||
+	    attr->sched_period & (1ULL << 63))
+		return false;
+
+	/* runtime <= deadline <= period (if period != 0) */
+	if ((attr->sched_period != 0 &&
+	     attr->sched_period < attr->sched_deadline) ||
+	    attr->sched_deadline < attr->sched_runtime)
+		return false;
+
+	return true;
+}
+
+/*
+ * This function clears the sched_dl_entity static params.
+ */
+void __dl_clear_params(struct task_struct *p)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	dl_se->dl_runtime = 0;
+	dl_se->dl_deadline = 0;
+	dl_se->dl_period = 0;
+	dl_se->flags = 0;
+	dl_se->dl_bw = 0;
+	dl_se->dl_density = 0;
+
+	dl_se->dl_throttled = 0;
+	dl_se->dl_yielded = 0;
+	dl_se->dl_non_contending = 0;
+}
+
+bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (dl_se->dl_runtime != attr->sched_runtime ||
+	    dl_se->dl_deadline != attr->sched_deadline ||
+	    dl_se->dl_period != attr->sched_period ||
+	    dl_se->flags != attr->sched_flags)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_SMP
+int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
+{
+	unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+							cs_cpus_allowed);
+	struct dl_bw *dl_b;
+	bool overflow;
+	int cpus, ret;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	dl_b = dl_bw_of(dest_cpu);
+	raw_spin_lock_irqsave(&dl_b->lock, flags);
+	cpus = dl_bw_cpus(dest_cpu);
+	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+	if (overflow)
+		ret = -EBUSY;
+	else {
+		/*
+		 * We reserve space for this task in the destination
+		 * root_domain, as we can't fail after this point.
+		 * We will free resources in the source root_domain
+		 * later on (see set_cpus_allowed_dl()).
+		 */
+		__dl_add(dl_b, p->dl.dl_bw, cpus);
+		ret = 0;
+	}
+	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	rcu_read_unlock_sched();
+	return ret;
+}
+
+int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+				 const struct cpumask *trial)
+{
+	int ret = 1, trial_cpus;
+	struct dl_bw *cur_dl_b;
+	unsigned long flags;
+
+	rcu_read_lock_sched();
+	cur_dl_b = dl_bw_of(cpumask_any(cur));
+	trial_cpus = cpumask_weight(trial);
+
+	raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+	if (cur_dl_b->bw != -1 &&
+	    cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+		ret = 0;
+	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+	rcu_read_unlock_sched();
+	return ret;
+}
+
+bool dl_cpu_busy(unsigned int cpu)
+{
+	unsigned long flags;
+	struct dl_bw *dl_b;
+	bool overflow;
+	int cpus;
+
+	rcu_read_lock_sched();
+	dl_b = dl_bw_of(cpu);
+	raw_spin_lock_irqsave(&dl_b->lock, flags);
+	cpus = dl_bw_cpus(cpu);
+	overflow = __dl_overflow(dl_b, cpus, 0, 0);
+	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	rcu_read_unlock_sched();
+	return overflow;
+}
+#endif
+
 #ifdef CONFIG_SCHED_DEBUG
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0329d10bdb8..d4eb3f67529d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -218,9 +218,6 @@ static inline int dl_bandwidth_enabled(void)
 	return sysctl_sched_rt_runtime >= 0;
 }
 
-extern struct dl_bw *dl_bw_of(int i);
-extern int dl_bw_cpus(int i);
-
 struct dl_bw {
 	raw_spinlock_t lock;
 	u64 bw, total_bw;
@@ -251,6 +248,20 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
 
 void dl_change_utilization(struct task_struct *p, u64 new_bw);
 extern void init_dl_bw(struct dl_bw *dl_b);
+extern int sched_dl_global_validate(void);
+extern void sched_dl_do_global(void);
+extern int sched_dl_overflow(struct task_struct *p, int policy,
+			     const struct sched_attr *attr);
+extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern bool __checkparam_dl(const struct sched_attr *attr);
+extern void __dl_clear_params(struct task_struct *p);
+extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
+extern int dl_task_can_attach(struct task_struct *p,
+			      const struct cpumask *cs_cpus_allowed);
+extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
+					const struct cpumask *trial);
+extern bool dl_cpu_busy(unsigned int cpu);
 
 #ifdef CONFIG_CGROUP_SCHED
 
-- 
cgit v1.3-14-g43fede


From 8887cd99038bf242fb47f2d07fa0cf9371efa643 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Wed, 21 Jun 2017 14:22:02 -0400
Subject: sched/rt: Move RT related code from sched/core.c to sched/rt.c

This helps make sched/core.c smaller and hopefully easier to understand and maintain.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170621182203.30626-3-nicolas.pitre@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 315 ---------------------------------------------------
 kernel/sched/rt.c    | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |   5 +
 3 files changed, 315 insertions(+), 315 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54e1b0700af3..5186797908dc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6224,321 +6224,6 @@ void sched_move_task(struct task_struct *tsk)
 
 	task_rq_unlock(rq, tsk, &rf);
 }
-#endif /* CONFIG_CGROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
-{
-	struct task_struct *g, *p;
-
-	/*
-	 * Autogroups do not have RT tasks; see autogroup_create().
-	 */
-	if (task_group_is_autogroup(tg))
-		return 0;
-
-	for_each_process_thread(g, p) {
-		if (rt_task(p) && task_group(p) == tg)
-			return 1;
-	}
-
-	return 0;
-}
-
-struct rt_schedulable_data {
-	struct task_group *tg;
-	u64 rt_period;
-	u64 rt_runtime;
-};
-
-static int tg_rt_schedulable(struct task_group *tg, void *data)
-{
-	struct rt_schedulable_data *d = data;
-	struct task_group *child;
-	unsigned long total, sum = 0;
-	u64 period, runtime;
-
-	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	runtime = tg->rt_bandwidth.rt_runtime;
-
-	if (tg == d->tg) {
-		period = d->rt_period;
-		runtime = d->rt_runtime;
-	}
-
-	/*
-	 * Cannot have more runtime than the period.
-	 */
-	if (runtime > period && runtime != RUNTIME_INF)
-		return -EINVAL;
-
-	/*
-	 * Ensure we don't starve existing RT tasks.
-	 */
-	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
-		return -EBUSY;
-
-	total = to_ratio(period, runtime);
-
-	/*
-	 * Nobody can have more than the global setting allows.
-	 */
-	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
-		return -EINVAL;
-
-	/*
-	 * The sum of our children's runtime should not exceed our own.
-	 */
-	list_for_each_entry_rcu(child, &tg->children, siblings) {
-		period = ktime_to_ns(child->rt_bandwidth.rt_period);
-		runtime = child->rt_bandwidth.rt_runtime;
-
-		if (child == d->tg) {
-			period = d->rt_period;
-			runtime = d->rt_runtime;
-		}
-
-		sum += to_ratio(period, runtime);
-	}
-
-	if (sum > total)
-		return -EINVAL;
-
-	return 0;
-}
-
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	int ret;
-
-	struct rt_schedulable_data data = {
-		.tg = tg,
-		.rt_period = period,
-		.rt_runtime = runtime,
-	};
-
-	rcu_read_lock();
-	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
-	rcu_read_unlock();
-
-	return ret;
-}
-
-static int tg_set_rt_bandwidth(struct task_group *tg,
-		u64 rt_period, u64 rt_runtime)
-{
-	int i, err = 0;
-
-	/*
-	 * Disallowing the root group RT runtime is BAD, it would disallow the
-	 * kernel creating (and or operating) RT threads.
-	 */
-	if (tg == &root_task_group && rt_runtime == 0)
-		return -EINVAL;
-
-	/* No period doesn't make any sense. */
-	if (rt_period == 0)
-		return -EINVAL;
-
-	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
-	err = __rt_schedulable(tg, rt_period, rt_runtime);
-	if (err)
-		goto unlock;
-
-	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
-	tg->rt_bandwidth.rt_runtime = rt_runtime;
-
-	for_each_possible_cpu(i) {
-		struct rt_rq *rt_rq = tg->rt_rq[i];
-
-		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_runtime = rt_runtime;
-		raw_spin_unlock(&rt_rq->rt_runtime_lock);
-	}
-	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
-unlock:
-	read_unlock(&tasklist_lock);
-	mutex_unlock(&rt_constraints_mutex);
-
-	return err;
-}
-
-static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
-{
-	u64 rt_runtime, rt_period;
-
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
-	if (rt_runtime_us < 0)
-		rt_runtime = RUNTIME_INF;
-
-	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_runtime(struct task_group *tg)
-{
-	u64 rt_runtime_us;
-
-	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
-		return -1;
-
-	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
-	do_div(rt_runtime_us, NSEC_PER_USEC);
-	return rt_runtime_us;
-}
-
-static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
-{
-	u64 rt_runtime, rt_period;
-
-	rt_period = rt_period_us * NSEC_PER_USEC;
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
-
-	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
-}
-
-static long sched_group_rt_period(struct task_group *tg)
-{
-	u64 rt_period_us;
-
-	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	do_div(rt_period_us, NSEC_PER_USEC);
-	return rt_period_us;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static int sched_rt_global_constraints(void)
-{
-	int ret = 0;
-
-	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
-	ret = __rt_schedulable(NULL, 0, 0);
-	read_unlock(&tasklist_lock);
-	mutex_unlock(&rt_constraints_mutex);
-
-	return ret;
-}
-
-static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
-{
-	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
-		return 0;
-
-	return 1;
-}
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-static int sched_rt_global_constraints(void)
-{
-	unsigned long flags;
-	int i;
-
-	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
-	for_each_possible_cpu(i) {
-		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
-		raw_spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_runtime = global_rt_runtime();
-		raw_spin_unlock(&rt_rq->rt_runtime_lock);
-	}
-	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
-	return 0;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-static int sched_rt_global_validate(void)
-{
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
-	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
-		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
-		return -EINVAL;
-
-	return 0;
-}
-
-static void sched_rt_do_global(void)
-{
-	def_rt_bandwidth.rt_runtime = global_rt_runtime();
-	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
-}
-
-int sched_rt_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
-{
-	int old_period, old_runtime;
-	static DEFINE_MUTEX(mutex);
-	int ret;
-
-	mutex_lock(&mutex);
-	old_period = sysctl_sched_rt_period;
-	old_runtime = sysctl_sched_rt_runtime;
-
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-	if (!ret && write) {
-		ret = sched_rt_global_validate();
-		if (ret)
-			goto undo;
-
-		ret = sched_dl_global_validate();
-		if (ret)
-			goto undo;
-
-		ret = sched_rt_global_constraints();
-		if (ret)
-			goto undo;
-
-		sched_rt_do_global();
-		sched_dl_do_global();
-	}
-	if (0) {
-undo:
-		sysctl_sched_rt_period = old_period;
-		sysctl_sched_rt_runtime = old_runtime;
-	}
-	mutex_unlock(&mutex);
-
-	return ret;
-}
-
-int sched_rr_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
-{
-	int ret;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	/*
-	 * Make sure that internally we keep jiffies.
-	 * Also, writing zero resets the timeslice to default:
-	 */
-	if (!ret && write) {
-		sched_rr_timeslice =
-			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
-			msecs_to_jiffies(sysctl_sched_rr_timeslice);
-	}
-	mutex_unlock(&mutex);
-	return ret;
-}
-
-#ifdef CONFIG_CGROUP_SCHED
 
 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
 {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 581d5c7a5264..45caf937ef90 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2449,6 +2449,316 @@ const struct sched_class rt_sched_class = {
 	.update_curr		= update_curr_rt,
 };
 
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+
+	/*
+	 * Autogroups do not have RT tasks; see autogroup_create().
+	 */
+	if (task_group_is_autogroup(tg))
+		return 0;
+
+	for_each_process_thread(g, p) {
+		if (rt_task(p) && task_group(p) == tg)
+			return 1;
+	}
+
+	return 0;
+}
+
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
+
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
+
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
+	}
+
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
+
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
+
+	total = to_ratio(period, runtime);
+
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
+	}
+
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+{
+	int ret;
+
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_set_rt_bandwidth(struct task_group *tg,
+		u64 rt_period, u64 rt_runtime)
+{
+	int i, err = 0;
+
+	/*
+	 * Disallowing the root group RT runtime is BAD, it would disallow the
+	 * kernel creating (and or operating) RT threads.
+	 */
+	if (tg == &root_task_group && rt_runtime == 0)
+		return -EINVAL;
+
+	/* No period doesn't make any sense. */
+	if (rt_period == 0)
+		return -EINVAL;
+
+	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
+		goto unlock;
+
+	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
+	tg->rt_bandwidth.rt_runtime = rt_runtime;
+
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = tg->rt_rq[i];
+
+		raw_spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = rt_runtime;
+		raw_spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
+unlock:
+	read_unlock(&tasklist_lock);
+	mutex_unlock(&rt_constraints_mutex);
+
+	return err;
+}
+
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+{
+	u64 rt_runtime, rt_period;
+
+	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+	if (rt_runtime_us < 0)
+		rt_runtime = RUNTIME_INF;
+
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_runtime(struct task_group *tg)
+{
+	u64 rt_runtime_us;
+
+	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+		return -1;
+
+	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+	do_div(rt_runtime_us, NSEC_PER_USEC);
+	return rt_runtime_us;
+}
+
+int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
+{
+	u64 rt_runtime, rt_period;
+
+	rt_period = rt_period_us * NSEC_PER_USEC;
+	rt_runtime = tg->rt_bandwidth.rt_runtime;
+
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+}
+
+long sched_group_rt_period(struct task_group *tg)
+{
+	u64 rt_period_us;
+
+	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	do_div(rt_period_us, NSEC_PER_USEC);
+	return rt_period_us;
+}
+
+static int sched_rt_global_constraints(void)
+{
+	int ret = 0;
+
+	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(NULL, 0, 0);
+	read_unlock(&tasklist_lock);
+	mutex_unlock(&rt_constraints_mutex);
+
+	return ret;
+}
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+		return 0;
+
+	return 1;
+}
+
+#else /* !CONFIG_RT_GROUP_SCHED */
+static int sched_rt_global_constraints(void)
+{
+	unsigned long flags;
+	int i;
+
+	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
+	for_each_possible_cpu(i) {
+		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
+
+		raw_spin_lock(&rt_rq->rt_runtime_lock);
+		rt_rq->rt_runtime = global_rt_runtime();
+		raw_spin_unlock(&rt_rq->rt_runtime_lock);
+	}
+	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
+
+	return 0;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static int sched_rt_global_validate(void)
+{
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
+	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+	def_rt_bandwidth.rt_runtime = global_rt_runtime();
+	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
+int sched_rt_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_period, old_runtime;
+	static DEFINE_MUTEX(mutex);
+	int ret;
+
+	mutex_lock(&mutex);
+	old_period = sysctl_sched_rt_period;
+	old_runtime = sysctl_sched_rt_runtime;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (!ret && write) {
+		ret = sched_rt_global_validate();
+		if (ret)
+			goto undo;
+
+		ret = sched_dl_global_validate();
+		if (ret)
+			goto undo;
+
+		ret = sched_rt_global_constraints();
+		if (ret)
+			goto undo;
+
+		sched_rt_do_global();
+		sched_dl_do_global();
+	}
+	if (0) {
+undo:
+		sysctl_sched_rt_period = old_period;
+		sysctl_sched_rt_runtime = old_runtime;
+	}
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+int sched_rr_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	/*
+	 * Make sure that internally we keep jiffies.
+	 * Also, writing zero resets the timeslice to default:
+	 */
+	if (!ret && write) {
+		sched_rr_timeslice =
+			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+			msecs_to_jiffies(sysctl_sched_rr_timeslice);
+	}
+	mutex_unlock(&mutex);
+	return ret;
+}
+
 #ifdef CONFIG_SCHED_DEBUG
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d4eb3f67529d..eeef1a3086d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -383,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent
 extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 		struct sched_rt_entity *rt_se, int cpu,
 		struct sched_rt_entity *parent);
+extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us);
+extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
+extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 
 extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_online_group(struct task_group *tg,
-- 
cgit v1.3-14-g43fede


From 239946314e57711d7da546b67964d0b387a3ee42 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 22 Jun 2017 15:07:39 -0700
Subject: bpf: possibly avoid extra masking for narrower load in verifier

Commit 31fd85816dbe ("bpf: permits narrower load from bpf program
context fields") permits narrower load for certain ctx fields.
The commit, however, always generates masking, even if
the prog-specific ctx conversion already produces a result of
narrower size.

For example, for __sk_buff->protocol, the ctx conversion
loads the data into register with 2-byte load.
A narrower 2-byte load should not generate masking.
For __sk_buff->vlan_present, the conversion function
sets the result to either 0 or 1, essentially a byte.
A narrower 2-byte or 1-byte load should not generate masking.

To avoid unnecessary masking, prog-specific *_is_valid_access
now passes converted_op_size back to the verifier, which indicates
the valid data width after perceived future conversion.
Based on this information, the verifier is able to avoid
unnecessary masking.

Since we want more information back from prog-specific
*_is_valid_access checking, all of them are packed into
one data structure for more clarity.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          | 11 +++++-
 include/linux/bpf_verifier.h |  3 +-
 kernel/bpf/verifier.c        | 29 ++++++++++----
 kernel/trace/bpf_trace.c     | 17 +++++---
 net/core/filter.c            | 92 +++++++++++++++++++++++++-------------------
 5 files changed, 97 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1bcbf0a71f75..deca4e7f2845 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -149,6 +149,15 @@ enum bpf_reg_type {
 
 struct bpf_prog;
 
+/* The information passed from prog-specific *_is_valid_access
+ * back to the verifier.
+ */
+struct bpf_insn_access_aux {
+	enum bpf_reg_type reg_type;
+	int ctx_field_size;
+	int converted_op_size;
+};
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -157,7 +166,7 @@ struct bpf_verifier_ops {
 	 * with 'type' (read or write) is allowed
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
-				enum bpf_reg_type *reg_type, int *ctx_field_size);
+				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 189741c0da85..621076f56251 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -73,7 +73,8 @@ struct bpf_insn_aux_data {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
 		struct bpf_map *map_ptr;	/* pointer for call insn into lookup_elem */
 	};
-	int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */
+	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
+	int converted_op_size; /* the valid value width after perceived conversion */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 44b97d958fb7..74ea96ea391b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -761,22 +761,34 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
-	int ctx_field_size = 0;
+	struct bpf_insn_access_aux info = { .reg_type = *reg_type };
 
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
 		return 0;
 
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) {
-		/* a non zero ctx_field_size indicates:
+	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+		/* a non zero info.ctx_field_size indicates:
 		 * . For this field, the prog type specific ctx conversion algorithm
 		 *   only supports whole field access.
 		 * . This ctx access is a candiate for later verifier transformation
 		 *   to load the whole field and then apply a mask to get correct result.
+		 * a non zero info.converted_op_size indicates perceived actual converted
+		 * value width in convert_ctx_access.
 		 */
-		if (ctx_field_size)
-			env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size;
+		if ((info.ctx_field_size && !info.converted_op_size) ||
+		    (!info.ctx_field_size &&  info.converted_op_size)) {
+			verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n",
+				env->prog->type, off, size);
+			return -EACCES;
+		}
+
+		if (info.ctx_field_size) {
+			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+			env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size;
+		}
+		*reg_type = info.reg_type;
 
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
@@ -3388,7 +3400,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0;
+	int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3431,7 +3443,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		off = insn->off;
 		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
-		is_narrower_load = (type == BPF_READ && size < ctx_field_size);
+		converted_op_size = env->insn_aux_data[i + delta].converted_op_size;
+		is_narrower_load = type == BPF_READ && size < ctx_field_size;
 
 		/* If the read access is a narrower load of the field,
 		 * convert to a 4/8-byte load, to minimum program type specific
@@ -3453,7 +3466,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
-		if (is_narrower_load) {
+		if (is_narrower_load && size < converted_op_size) {
 			if (ctx_field_size <= 4)
 				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
 							(1 << size * 8) - 1);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9d3ec8253131..97c46b440cd6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 
 /* bpf+kprobe programs can access fields of 'struct pt_regs' */
 static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-					enum bpf_reg_type *reg_type, int *ctx_field_size)
+					struct bpf_insn_access_aux *info)
 {
 	if (off < 0 || off >= sizeof(struct pt_regs))
 		return false;
@@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 }
 
 static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-				    enum bpf_reg_type *reg_type, int *ctx_field_size)
+				    struct bpf_insn_access_aux *info)
 {
 	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
 		return false;
@@ -581,7 +581,7 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
 };
 
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
-				    enum bpf_reg_type *reg_type, int *ctx_field_size)
+				    struct bpf_insn_access_aux *info)
 {
 	int sample_period_off;
 
@@ -595,12 +595,17 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	/* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */
 	sample_period_off = offsetof(struct bpf_perf_event_data, sample_period);
 	if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) {
-		*ctx_field_size = 8;
+		int allowed;
+
 #ifdef __LITTLE_ENDIAN
-		return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0;
+		allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0;
 #else
-		return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0;
+		allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0;
 #endif
+		if (!allowed)
+			return false;
+		info->ctx_field_size = 8;
+		info->converted_op_size = 8;
 	} else {
 		if (size != sizeof(long))
 			return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index 60ed6f343a63..4b788007415f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2856,8 +2856,37 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info)
+{
+	info->ctx_field_size = 4;
+	switch (off) {
+	case offsetof(struct __sk_buff, pkt_type) ...
+	     offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, vlan_present) ...
+	     offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1:
+		info->converted_op_size = 1;
+		break;
+	case offsetof(struct __sk_buff, queue_mapping) ...
+	     offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, protocol) ...
+	     offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, vlan_tci) ...
+	     offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, vlan_proto) ...
+	     offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, tc_index) ...
+	     offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1:
+	case offsetof(struct __sk_buff, tc_classid) ...
+	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
+		info->converted_op_size = 2;
+		break;
+	default:
+		info->converted_op_size = 4;
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type,
-			      int *ctx_field_size)
+			      struct bpf_insn_access_aux *info)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
@@ -2875,24 +2904,32 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
 		break;
 	case offsetof(struct __sk_buff, data) ...
 	     offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
+		if (size != sizeof(__u32))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
 	case offsetof(struct __sk_buff, data_end) ...
 	     offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
 		if (size != sizeof(__u32))
 			return false;
+		info->reg_type = PTR_TO_PACKET_END;
 		break;
 	default:
-		/* permit narrower load for not cb/data/data_end fields */
-		*ctx_field_size = 4;
 		if (type == BPF_WRITE) {
 			if (size != sizeof(__u32))
 				return false;
 		} else {
-			if (size != sizeof(__u32))
+			int allowed;
+
+			/* permit narrower load for not cb/data/data_end fields */
 #ifdef __LITTLE_ENDIAN
-				return (off & 0x3) == 0 && (size == 1 || size == 2);
+			allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0;
 #else
-				return (off & 0x3) + size == 4 && (size == 1 || size == 2);
+			allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0;
 #endif
+			if (!allowed)
+				return false;
+			__set_access_aux_info(off, info);
 		}
 	}
 
@@ -2901,8 +2938,7 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
 
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
-				      enum bpf_reg_type *reg_type,
-				      int *ctx_field_size)
+				      struct bpf_insn_access_aux *info)
 {
 	switch (off) {
 	case offsetof(struct __sk_buff, tc_classid) ...
@@ -2924,13 +2960,12 @@ static bool sk_filter_is_valid_access(int off, int size,
 		}
 	}
 
-	return __is_valid_access(off, size, type, ctx_field_size);
+	return __is_valid_access(off, size, type, info);
 }
 
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
-				enum bpf_reg_type *reg_type,
-				int *ctx_field_size)
+				struct bpf_insn_access_aux *info)
 {
 	switch (off) {
 	case offsetof(struct __sk_buff, tc_classid) ...
@@ -2950,22 +2985,12 @@ static bool lwt_is_valid_access(int off, int size,
 		}
 	}
 
-	switch (off) {
-	case offsetof(struct __sk_buff, data):
-		*reg_type = PTR_TO_PACKET;
-		break;
-	case offsetof(struct __sk_buff, data_end):
-		*reg_type = PTR_TO_PACKET_END;
-		break;
-	}
-
-	return __is_valid_access(off, size, type, ctx_field_size);
+	return __is_valid_access(off, size, type, info);
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
 					enum bpf_access_type type,
-					enum bpf_reg_type *reg_type,
-					int *ctx_field_size)
+					struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -3028,8 +3053,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
-				       enum bpf_reg_type *reg_type,
-				       int *ctx_field_size)
+				       struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -3045,16 +3069,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		}
 	}
 
-	switch (off) {
-	case offsetof(struct __sk_buff, data):
-		*reg_type = PTR_TO_PACKET;
-		break;
-	case offsetof(struct __sk_buff, data_end):
-		*reg_type = PTR_TO_PACKET_END;
-		break;
-	}
-
-	return __is_valid_access(off, size, type, ctx_field_size);
+	return __is_valid_access(off, size, type, info);
 }
 
 static bool __is_valid_xdp_access(int off, int size)
@@ -3071,18 +3086,17 @@ static bool __is_valid_xdp_access(int off, int size)
 
 static bool xdp_is_valid_access(int off, int size,
 				enum bpf_access_type type,
-				enum bpf_reg_type *reg_type,
-				int *ctx_field_size)
+				struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE)
 		return false;
 
 	switch (off) {
 	case offsetof(struct xdp_md, data):
-		*reg_type = PTR_TO_PACKET;
+		info->reg_type = PTR_TO_PACKET;
 		break;
 	case offsetof(struct xdp_md, data_end):
-		*reg_type = PTR_TO_PACKET_END;
+		info->reg_type = PTR_TO_PACKET_END;
 		break;
 	}
 
-- 
cgit v1.3-14-g43fede


From 739294fb03f590401bbd7faa6d31a507e3ffada5 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 23 Jun 2017 12:55:27 -0400
Subject: sched/numa: Override part of migrate_degrades_locality() when idle
 balancing

Several tests in the NAS benchmark seem to run a lot slower with
NUMA balancing enabled, than with NUMA balancing disabled. The
slower run time corresponds with increased idle time.

Overriding the final test of migrate_degrades_locality (but still
doing the other NUMA tests first) seems to improve performance
of those benchmarks.

Reported-by: Jirka Hladky <jhladky@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-2-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 694c258b8771..6e0c0524131e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (dst_nid == p->numa_preferred_nid)
 		return 0;
 
+	/* Leaving a core idle is often worse than degrading locality. */
+	if (env->idle != CPU_NOT_IDLE)
+		return -1;
+
 	if (numa_group) {
 		src_faults = group_faults(p, src_nid);
 		dst_faults = group_faults(p, dst_nid);
-- 
cgit v1.3-14-g43fede


From 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 23 Jun 2017 12:55:28 -0400
Subject: sched/fair: Simplify wake_affine() for the single socket case

When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling()
will do its thing regardless of the return value of wake_affine().

Just return true and don't look at all the other things.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-3-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e0c0524131e..fe1901686fa5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5419,6 +5419,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
+	/*
+	 * Common case: CPUs are in the same socket, and select_idle_sibling()
+	 * will do its thing regardless of what we return:
+	 */
+	if (cpus_share_cache(prev_cpu, this_cpu))
+		return true;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (affine_sd) {
 		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
+		if (cpu == prev_cpu)
+			goto pick_cpu;
+
+		if (wake_affine(affine_sd, p, prev_cpu, sync))
 			new_cpu = cpu;
 	}
 
 	if (!sd) {
+ pick_cpu:
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-- 
cgit v1.3-14-g43fede


From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 23 Jun 2017 12:55:29 -0400
Subject: sched/numa: Implement NUMA node level wake_affine()

Since select_idle_sibling() can place a task anywhere on a socket,
comparing loads between individual CPU cores makes no real sense
for deciding whether to do an affine wakeup across sockets, either.

Instead, compare the load between the sockets in a similar way the
load balancer and the numa balancing code do.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe1901686fa5..79ac078caf5d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 		}
 	}
 }
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ */
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	struct numa_stats prev_load, this_load;
+	s64 this_eff_load, prev_eff_load;
+
+	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
+	update_numa_stats(&this_load, cpu_to_node(this_cpu));
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		if (this_load.load > current_load)
+			this_load.load -= current_load;
+		else
+			this_load.load = 0;
+	}
+
+	/*
+	 * In low-load situations, where this_cpu's node is idle due to the
+	 * sync cause above having dropped this_load.load to 0, move the task.
+	 * Moving to an idle socket will not create a bad imbalance.
+	 *
+	 * Otherwise check if the nodes are near enough in load to allow this
+	 * task to be woken on this_cpu's node.
+	 */
+	if (this_load.load > 0) {
+		unsigned long task_load = task_h_load(p);
+
+		this_eff_load = 100;
+		this_eff_load *= prev_load.compute_capacity;
+
+		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+		prev_eff_load *= this_load.compute_capacity;
+
+		this_eff_load *= this_load.load + task_load;
+		prev_eff_load *= prev_load.load - task_load;
+
+		return this_eff_load <= prev_eff_load;
+	}
+
+	return true;
+}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
+
+static inline bool numa_wake_affine(struct sched_domain *sd,
+				    struct task_struct *p, int this_cpu,
+				    int prev_cpu, int sync)
+{
+	return true;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p)
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
-	s64 this_load, load;
-	s64 this_eff_load, prev_eff_load;
-	int idx, this_cpu;
-	struct task_group *tg;
-	unsigned long weight;
-	int balanced;
-
-	idx	  = sd->wake_idx;
-	this_cpu  = smp_processor_id();
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+	int this_cpu = smp_processor_id();
+	bool affine = false;
 
 	/*
 	 * Common case: CPUs are in the same socket, and select_idle_sibling()
 	 * will do its thing regardless of what we return:
 	 */
 	if (cpus_share_cache(prev_cpu, this_cpu))
-		return true;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		tg = task_group(current);
-		weight = current->se.avg.load_avg;
-
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
-	}
-
-	tg = task_group(p);
-	weight = p->se.avg.load_avg;
-
-	/*
-	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped this_load to 0, we'll
-	 * always have an imbalance, but there's really nothing you can do
-	 * about that, so that's good too.
-	 *
-	 * Otherwise check if either cpus are near enough in load to allow this
-	 * task to be woken on this_cpu.
-	 */
-	this_eff_load = 100;
-	this_eff_load *= capacity_of(prev_cpu);
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= capacity_of(this_cpu);
-
-	if (this_load > 0) {
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
-
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
-	}
-
-	balanced = this_eff_load <= prev_eff_load;
+		affine = true;
+	else
+		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+	if (affine) {
+		schedstat_inc(sd->ttwu_move_affine);
+		schedstat_inc(p->se.statistics.nr_wakeups_affine);
+	}
 
-	if (!balanced)
-		return 0;
-
-	schedstat_inc(sd->ttwu_move_affine);
-	schedstat_inc(p->se.statistics.nr_wakeups_affine);
-
-	return 1;
+	return affine;
 }
 
 static inline int task_util(struct task_struct *p);
-- 
cgit v1.3-14-g43fede


From 815abf5af45f04f759f12f3172afd15226fd7f71 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 23 Jun 2017 12:55:30 -0400
Subject: sched/fair: Remove effective_load()

The effective_load() function was only used by the NUMA balancing
code, and not by the regular load balancing code. Now that the
NUMA balancing code no longer uses it either, get rid of it.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jhladky@redhat.com
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170623165530.22514-5-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 124 +---------------------------------------------------
 1 file changed, 1 insertion(+), 123 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79ac078caf5d..6f4f155adf5f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long capacity_of(int cpu);
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
  * differential update where we store the last value we propagated. This in
  * turn allows skipping updates if the differential is 'small'.
  *
- * Updating tg's load_avg is necessary before update_cfs_share() (which is
- * done) and effective_load() (which is not done because it is too costly).
+ * Updating tg's load_avg is necessary before update_cfs_share().
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
 {
@@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * effective_load() calculates the load change as seen from the root_task_group
- *
- * Adding load to a group doesn't make a group heavier, but can cause movement
- * of group shares between cpus. Assuming the shares were perfectly aligned one
- * can calculate the shift in shares.
- *
- * Calculate the effective load difference if @wl is added (subtracted) to @tg
- * on this @cpu and results in a total addition (subtraction) of @wg to the
- * total group weight.
- *
- * Given a runqueue weight distribution (rw_i) we can compute a shares
- * distribution (s_i) using:
- *
- *   s_i = rw_i / \Sum rw_j						(1)
- *
- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
- * shares distribution (s_i):
- *
- *   rw_i = {   2,   4,   1,   0 }
- *   s_i  = { 2/7, 4/7, 1/7,   0 }
- *
- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
- * task used to run on and the CPU the waker is running on), we need to
- * compute the effect of waking a task on either CPU and, in case of a sync
- * wakeup, compute the effect of the current task going to sleep.
- *
- * So for a change of @wl to the local @cpu with an overall group weight change
- * of @wl we can compute the new shares distribution (s'_i) using:
- *
- *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
- *
- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
- * differences in waking a task to CPU 0. The additional task changes the
- * weight and shares distributions like:
- *
- *   rw'_i = {   3,   4,   1,   0 }
- *   s'_i  = { 3/8, 4/8, 1/8,   0 }
- *
- * We can then compute the difference in effective weight by using:
- *
- *   dw_i = S * (s'_i - s_i)						(3)
- *
- * Where 'S' is the group weight as seen by its parent.
- *
- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
- * 4/7) times the weight of the group.
- */
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
-	struct sched_entity *se = tg->se[cpu];
-
-	if (!tg->parent)	/* the trivial, non-cgroup case */
-		return wl;
-
-	for_each_sched_entity(se) {
-		struct cfs_rq *cfs_rq = se->my_q;
-		long W, w = cfs_rq_load_avg(cfs_rq);
-
-		tg = cfs_rq->tg;
-
-		/*
-		 * W = @wg + \Sum rw_j
-		 */
-		W = wg + atomic_long_read(&tg->load_avg);
-
-		/* Ensure \Sum rw_j >= rw_i */
-		W -= cfs_rq->tg_load_avg_contrib;
-		W += w;
-
-		/*
-		 * w = rw_i + @wl
-		 */
-		w += wl;
-
-		/*
-		 * wl = S * s'_i; see (2)
-		 */
-		if (W > 0 && w < W)
-			wl = (w * (long)scale_load_down(tg->shares)) / W;
-		else
-			wl = scale_load_down(tg->shares);
-
-		/*
-		 * Per the above, wl is the new se->load.weight value; since
-		 * those are clipped to [MIN_SHARES, ...) do so now. See
-		 * calc_cfs_shares().
-		 */
-		if (wl < MIN_SHARES)
-			wl = MIN_SHARES;
-
-		/*
-		 * wl = dw_i = S * (s'_i - s_i); see (3)
-		 */
-		wl -= se->avg.load_avg;
-
-		/*
-		 * Recursively apply this logic to all parent groups to compute
-		 * the final effective load change on the root group. Since
-		 * only the @tg group gets extra weight, all parent groups can
-		 * only redistribute existing shares. @wl is the shift in shares
-		 * resulting from this level per the above.
-		 */
-		wg = 0;
-	}
-
-	return wl;
-}
-#else
-
-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
-{
-	return wl;
-}
-
-#endif
-
 static void record_wakee(struct task_struct *p)
 {
 	/*
-- 
cgit v1.3-14-g43fede


From c2ce34c0a0e5187195ecade872be950d2611ba68 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 24 Jun 2017 11:05:59 +0200
Subject: genirq/debugfs: Remove pointless NULL pointer check

debugfs_remove() has its own NULL pointer check. Remove the conditional
and make irq_remove_debugfs_entry() an inline helper.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/debugfs.c   | 7 -------
 kernel/irq/internals.h | 7 ++++++-
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index dbd6e78db213..4d384edc0c64 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -3,7 +3,6 @@
  *
  * This file is licensed under the GPL V2.
  */
-#include <linux/debugfs.h>
 #include <linux/irqdomain.h>
 #include <linux/irq.h>
 
@@ -191,12 +190,6 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
 						 &dfs_irq_ops);
 }
 
-void irq_remove_debugfs_entry(struct irq_desc *desc)
-{
-	if (desc->debugfs_file)
-		debugfs_remove(desc->debugfs_file);
-}
-
 static int __init irq_debugfs_init(void)
 {
 	struct dentry *root_dir;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5fd105e252c3..a573e0771baf 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -318,8 +318,13 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+#include <linux/debugfs.h>
+
 void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
-void irq_remove_debugfs_entry(struct irq_desc *desc);
+static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
+{
+	debugfs_remove(desc->debugfs_file);
+}
 # ifdef CONFIG_IRQ_DOMAIN
 void irq_domain_debugfs_init(struct dentry *root);
 # else
-- 
cgit v1.3-14-g43fede


From b2d3d61adb7b73cfe5f82404f7a130a76fc64232 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 23 Jun 2017 16:11:07 +0200
Subject: genirq/timings: Add infrastructure to track the interrupt timings

The interrupt framework gives a lot of information about each interrupt. It
does not keep track of when those interrupts occur though, which is a
prerequisite for estimating the next interrupt arrival for power management
purposes.

Add a mechanism to record the timestamp for each interrupt occurrence in a
per-CPU circular buffer to help with the prediction of the next occurrence
using a statistical model.

Each CPU can store up to IRQ_TIMINGS_SIZE events <irq, timestamp>, the
current value of IRQ_TIMINGS_SIZE is 32.

Each event is encoded into a single u64, where the high 48 bits are used
for the timestamp and the low 16 bits are for the irq number.

A static key is introduced so when the irq prediction is switched off at
runtime, the overhead is near to zero.

As a result, most of the code lives in internals.h so it can be inlined, and
very little in the new file timings.c. The latter will contain more in the next
patch, which will provide the statistical model for the next event prediction.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: "Rafael J . Wysocki" <rafael@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/1498227072-5980-1-git-send-email-daniel.lezcano@linaro.org
---
 include/linux/interrupt.h |  5 +++
 kernel/irq/Kconfig        |  3 ++
 kernel/irq/Makefile       |  1 +
 kernel/irq/handle.c       |  2 ++
 kernel/irq/internals.h    | 90 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c       |  3 ++
 kernel/irq/timings.c      | 30 ++++++++++++++++
 7 files changed, 134 insertions(+)
 create mode 100644 kernel/irq/timings.c

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a6fba4804672..9f617238a2f7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -703,6 +703,11 @@ static inline void init_irq_proc(void)
 }
 #endif
 
+#ifdef CONFIG_IRQ_TIMINGS
+void irq_timings_enable(void);
+void irq_timings_disable(void);
+#endif
+
 struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
 int arch_show_interrupts(struct seq_file *p, int prec);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index fcbb1d6d51cb..27c4e774071c 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -85,6 +85,9 @@ config GENERIC_MSI_IRQ_DOMAIN
 config HANDLE_DOMAIN_IRQ
 	bool
 
+config IRQ_TIMINGS
+	bool
+
 config IRQ_DOMAIN_DEBUG
 	bool "Expose hardware/virtual IRQ mapping via debugfs"
 	depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index c61fc9c2d1f7..e4aef7351f2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
 
 obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-$(CONFIG_IRQ_TIMINGS) += timings.o
 obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d3f24905852c..eb4d3e8945b8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -138,6 +138,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
 	unsigned int irq = desc->irq_data.irq;
 	struct irqaction *action;
 
+	record_irq_time(desc);
+
 	for_each_action_of_desc(desc, action) {
 		irqreturn_t res;
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a573e0771baf..b95b74920433 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -8,6 +8,7 @@
 #include <linux/irqdesc.h>
 #include <linux/kernel_stat.h>
 #include <linux/pm_runtime.h>
+#include <linux/sched/clock.h>
 
 #ifdef CONFIG_SPARSE_IRQ
 # define IRQ_BITMAP_BITS	(NR_IRQS + 8196)
@@ -57,6 +58,7 @@ enum {
 	IRQS_WAITING		= 0x00000080,
 	IRQS_PENDING		= 0x00000200,
 	IRQS_SUSPENDED		= 0x00000800,
+	IRQS_TIMINGS		= 0x00001000,
 };
 
 #include "debug.h"
@@ -255,6 +257,94 @@ static inline void
 irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
 #endif
 
+#ifdef CONFIG_IRQ_TIMINGS
+
+#define IRQ_TIMINGS_SHIFT	5
+#define IRQ_TIMINGS_SIZE	(1 << IRQ_TIMINGS_SHIFT)
+#define IRQ_TIMINGS_MASK	(IRQ_TIMINGS_SIZE - 1)
+
+/**
+ * struct irq_timings - irq timings storing structure
+ * @values: a circular buffer of u64 encoded <timestamp,irq> values
+ * @count: the number of elements in the array
+ */
+struct irq_timings {
+	u64	values[IRQ_TIMINGS_SIZE];
+	int	count;
+};
+
+DECLARE_PER_CPU(struct irq_timings, irq_timings);
+
+static inline void irq_remove_timings(struct irq_desc *desc)
+{
+	desc->istate &= ~IRQS_TIMINGS;
+}
+
+static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
+{
+	/*
+	 * We don't need the measurement because the idle code already
+	 * knows the next expiry event.
+	 */
+	if (act->flags & __IRQF_TIMER)
+		return;
+
+	desc->istate |= IRQS_TIMINGS;
+}
+
+extern void irq_timings_enable(void);
+extern void irq_timings_disable(void);
+
+DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);
+
+/*
+ * The interrupt number and the timestamp are encoded into a single
+ * u64 variable to optimize the size.
+ * 48 bit time stamp and 16 bit IRQ number is way sufficient.
+ *  Who cares an IRQ after 78 hours of idle time?
+ */
+static inline u64 irq_timing_encode(u64 timestamp, int irq)
+{
+	return (timestamp << 16) | irq;
+}
+
+static inline int irq_timing_decode(u64 value, u64 *timestamp)
+{
+	*timestamp = value >> 16;
+	return value & U16_MAX;
+}
+
+/*
+ * The function record_irq_time is only called in one place in the
+ * interrupts handler. We want this function always inline so the code
+ * inside is embedded in the function and the static key branching
+ * code can act at the higher level. Without the explicit
+ * __always_inline we can end up with a function call and a small
+ * overhead in the hotpath for nothing.
+ */
+static __always_inline void record_irq_time(struct irq_desc *desc)
+{
+	if (!static_branch_likely(&irq_timing_enabled))
+		return;
+
+	if (desc->istate & IRQS_TIMINGS) {
+		struct irq_timings *timings = this_cpu_ptr(&irq_timings);
+
+		timings->values[timings->count & IRQ_TIMINGS_MASK] =
+			irq_timing_encode(local_clock(),
+					  irq_desc_get_irq(desc));
+
+		timings->count++;
+	}
+}
+#else
+static inline void irq_remove_timings(struct irq_desc *desc) {}
+static inline void irq_setup_timings(struct irq_desc *desc,
+				     struct irqaction *act) {};
+static inline void record_irq_time(struct irq_desc *desc) {}
+#endif /* CONFIG_IRQ_TIMINGS */
+
+
 #ifdef CONFIG_GENERIC_IRQ_CHIP
 void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
 			   int num_ct, unsigned int irq_base,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3577c091ac7b..5c11c1730ba5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1348,6 +1348,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
+	irq_setup_timings(desc, new);
+
 	/*
 	 * Strictly no need to wake it up, but hung_task complains
 	 * when no hard interrupt wakes the thread up.
@@ -1474,6 +1476,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		irq_settings_clr_disable_unlazy(desc);
 		irq_shutdown(desc);
 		irq_release_resources(desc);
+		irq_remove_timings(desc);
 	}
 
 #ifdef CONFIG_SMP
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
new file mode 100644
index 000000000000..56cf6870fa26
--- /dev/null
+++ b/kernel/irq/timings.c
@@ -0,0 +1,30 @@
+/*
+ * linux/kernel/irq/timings.c
+ *
+ * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/percpu.h>
+#include <linux/static_key.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+
+#include "internals.h"
+
+DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
+
+DEFINE_PER_CPU(struct irq_timings, irq_timings);
+
+void irq_timings_enable(void)
+{
+	static_branch_enable(&irq_timing_enabled);
+}
+
+void irq_timings_disable(void)
+{
+	static_branch_disable(&irq_timing_enabled);
+}
-- 
cgit v1.3-14-g43fede


From e1c921495534002d727b15a76a2f8c20b6b108b5 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 23 Jun 2017 16:11:08 +0200
Subject: genirq/timings: Add infrastructure for estimating the next interrupt
 arrival time

An interrupt typically shows a burst of activity at periodic intervals,
followed by one or two peaks of longer interval.

As the time intervals are periodic, statistically speaking they follow a normal
distribution and each interrupt can be tracked individually.

Add a mechanism to compute the statistics on all interrupts, except the
timers which are deterministic from a prediction point of view, as their
expiry time is known.

The goal is to extract the periodicity of each interrupt and add it to the
last timestamp, so the next event can be predicted to a certain extent.

Taking the earliest prediction gives the expected wakeup on the system
(assuming a timer won't expire before).

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Hannes Reinecke <hare@suse.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: "Rafael J . Wysocki" <rafael@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Link: http://lkml.kernel.org/r/1498227072-5980-2-git-send-email-daniel.lezcano@linaro.org
---
 include/linux/interrupt.h |   1 +
 kernel/irq/internals.h    |  19 +++
 kernel/irq/timings.c      | 339 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 359 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9f617238a2f7..37f8e354f564 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -706,6 +706,7 @@ static inline void init_irq_proc(void)
 #ifdef CONFIG_IRQ_TIMINGS
 void irq_timings_enable(void);
 void irq_timings_disable(void);
+u64 irq_timings_next_event(u64 now);
 #endif
 
 struct seq_file;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b95b74920433..9da14d125df4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -275,13 +275,21 @@ struct irq_timings {
 
 DECLARE_PER_CPU(struct irq_timings, irq_timings);
 
+extern void irq_timings_free(int irq);
+extern int irq_timings_alloc(int irq);
+
 static inline void irq_remove_timings(struct irq_desc *desc)
 {
 	desc->istate &= ~IRQS_TIMINGS;
+
+	irq_timings_free(irq_desc_get_irq(desc));
 }
 
 static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
 {
+	int irq = irq_desc_get_irq(desc);
+	int ret;
+
 	/*
 	 * We don't need the measurement because the idle code already
 	 * knows the next expiry event.
@@ -289,6 +297,17 @@ static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *ac
 	if (act->flags & __IRQF_TIMER)
 		return;
 
+	/*
+	 * In case the timing allocation fails, we just want to warn,
+	 * not fail, so letting the system boot anyway.
+	 */
+	ret = irq_timings_alloc(irq);
+	if (ret) {
+		pr_warn("Failed to allocate irq timing stats for irq%d (%d)",
+			irq, ret);
+		return;
+	}
+
 	desc->istate |= IRQS_TIMINGS;
 }
 
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 56cf6870fa26..c8c1d073fbf1 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -8,10 +8,16 @@
  * published by the Free Software Foundation.
  *
  */
+#include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/static_key.h>
 #include <linux/interrupt.h>
+#include <linux/idr.h>
 #include <linux/irq.h>
+#include <linux/math64.h>
+
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
@@ -19,6 +25,18 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);
 
 DEFINE_PER_CPU(struct irq_timings, irq_timings);
 
+struct irqt_stat {
+	u64	next_evt;
+	u64	last_ts;
+	u64	variance;
+	u32	avg;
+	u32	nr_samples;
+	int	anomalies;
+	int	valid;
+};
+
+static DEFINE_IDR(irqt_stats);
+
 void irq_timings_enable(void)
 {
 	static_branch_enable(&irq_timing_enabled);
@@ -28,3 +46,324 @@ void irq_timings_disable(void)
 {
 	static_branch_disable(&irq_timing_enabled);
 }
+
+/**
+ * irqs_update - update the irq timing statistics with a new timestamp
+ *
+ * @irqs: an irqt_stat struct pointer
+ * @ts: the new timestamp
+ *
+ * The statistics are computed online, in other words, the code is
+ * designed to compute the statistics on a stream of values rather
+ * than doing multiple passes on the values to compute the average,
+ * then the variance. The integer division introduces a loss of
+ * precision but with an acceptable error margin regarding the results
+ * we would have with the double floating precision: we are dealing
+ * with nanosec, so big numbers, consequently the mantissa is
+ * negligible, especially when converting the time in usec
+ * afterwards.
+ *
+ * The computation happens at idle time. When the CPU is not idle, the
+ * interrupts' timestamps are stored in the circular buffer, when the
+ * CPU goes idle and this routine is called, all the buffer's values
+ * are injected in the statistical model continuing to extend the
+ * statistics from the previous busy-idle cycle.
+ *
+ * The observations showed a device will trigger a burst of periodic
+ * interrupts followed by one or two peaks of longer time, for
+ * instance when a SD card device flushes its cache, then the periodic
+ * intervals occur again. A one second inactivity period resets the
+ * stats, that gives us the certitude the statistical values won't
+ * exceed 1x10^9, thus the computation won't overflow.
+ *
+ * Basically, the purpose of the algorithm is to watch the periodic
+ * interrupts and eliminate the peaks.
+ *
+ * An interrupt is considered periodically stable if the intervals
+ * between its occurrences follow the normal distribution, thus the values
+ * comply with:
+ *
+ *      avg - 3 x stddev < value < avg + 3 x stddev
+ *
+ * Which can be simplified to:
+ *
+ *      -3 x stddev < value - avg < 3 x stddev
+ *
+ *      abs(value - avg) < 3 x stddev
+ *
+ * In order to save a costly square root computation, we use the
+ * variance. For the record, stddev = sqrt(variance). The equation
+ * above becomes:
+ *
+ *      abs(value - avg) < 3 x sqrt(variance)
+ *
+ * And finally we square it:
+ *
+ *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2
+ *
+ *      (value - avg) x (value - avg) < 9 x variance
+ *
+ * Statistically speaking, any value outside this interval is
+ * considered an anomaly and is discarded. However, a normal
+ * distribution appears when the number of samples is 30 (it is the
+ * rule of thumb in statistics, cf. "30 samples" on Internet). When
+ * there are three consecutive anomalies, the statistics are reset.
+ *
+ */
+static void irqs_update(struct irqt_stat *irqs, u64 ts)
+{
+	u64 old_ts = irqs->last_ts;
+	u64 variance = 0;
+	u64 interval;
+	s64 diff;
+
+	/*
+	 * The timestamps are absolute time values, we need to compute
+	 * the timing interval between two interrupts.
+	 */
+	irqs->last_ts = ts;
+
+	/*
+	 * The interval type is u64 in order to deal with the same
+	 * type in our computation, which prevents confusing issues
+	 * with overflow, sign and division.
+	 */
+	interval = ts - old_ts;
+
+	/*
+	 * The interrupt triggered more than one second apart, that
+	 * ends the sequence as predictable for our purpose. In this
+	 * case, assume we have the beginning of a sequence and the
+	 * timestamp is the first value. As it is impossible to
+	 * predict anything at this point, return.
+	 *
+	 * Note the first timestamp of the sequence will always fall
+	 * in this test because the old_ts is zero. That is what we
+	 * want as we need another timestamp to compute an interval.
+	 */
+	if (interval >= NSEC_PER_SEC) {
+		memset(irqs, 0, sizeof(*irqs));
+		irqs->last_ts = ts;
+		return;
+	}
+
+	/*
+	 * Pre-compute the delta with the average as the result is
+	 * used several times in this function.
+	 */
+	diff = interval - irqs->avg;
+
+	/*
+	 * Increment the number of samples.
+	 */
+	irqs->nr_samples++;
+
+	/*
+	 * Online variance divided by the number of elements if there
+	 * is more than one sample.  Normally the formula is division
+	 * by nr_samples - 1 but we assume the number of elements will be
+	 * more than 32 and dividing by 32 instead of 31 is precise
+	 * enough.
+	 */
+	if (likely(irqs->nr_samples > 1))
+		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;
+
+	/*
+	 * The rule of thumb in statistics for the normal distribution
+	 * is having at least 30 samples in order to have the model to
+	 * apply. Values outside the interval are considered as an
+	 * anomaly.
+	 */
+	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) {
+		/*
+		 * After three consecutive anomalies, we reset the
+		 * stats as it is no longer stable enough.
+		 */
+		if (irqs->anomalies++ >= 3) {
+			memset(irqs, 0, sizeof(*irqs));
+			irqs->last_ts = ts;
+			return;
+		}
+	} else {
+		/*
+		 * The anomalies must be consecutive, so at this
+		 * point, we reset the anomalies counter.
+		 */
+		irqs->anomalies = 0;
+	}
+
+	/*
+	 * The interrupt is considered stable enough to try to predict
+	 * the next event on it.
+	 */
+	irqs->valid = 1;
+
+	/*
+	 * Online average algorithm:
+	 *
+	 *  new_average = average + ((value - average) / count)
+	 *
+	 * The variance computation depends on the new average
+	 * to be computed here first.
+	 *
+	 */
+	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT);
+
+	/*
+	 * Online variance algorithm:
+	 *
+	 *  new_variance = variance + (value - average) x (value - new_average)
+	 *
+	 * Warning: irqs->avg is updated with the line above, hence
+	 * 'interval - irqs->avg' is no longer equal to 'diff'
+	 */
+	irqs->variance = irqs->variance + (diff * (interval - irqs->avg));
+
+	/*
+	 * Update the next event
+	 */
+	irqs->next_evt = ts + irqs->avg;
+}
+
+/**
+ * irq_timings_next_event - Return when the next event is supposed to arrive
+ *
+ * During the last busy cycle, the number of interrupts is incremented
+ * and stored in the irq_timings structure. This information is
+ * necessary to:
+ *
+ * - know if the index in the table wrapped up:
+ *
+ *      If more than the array size interrupts happened during the
+ *      last busy/idle cycle, the index wrapped up and we have to
+ *      begin with the next element in the array which is the last one
+ *      in the sequence, otherwise it is at index 0.
+ *
+ * - have an indication of the interrupts activity on this CPU
+ *   (eg. irq/sec)
+ *
+ * The values are 'consumed' after inserting in the statistical model,
+ * thus the count is reinitialized.
+ *
+ * The array of values **must** be browsed in the time direction, the
+ * timestamp must increase between an element and the next one.
+ *
+ * Returns a nanosec time based estimation of the earliest interrupt,
+ * U64_MAX otherwise.
+ */
+u64 irq_timings_next_event(u64 now)
+{
+	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+	struct irqt_stat *irqs;
+	struct irqt_stat __percpu *s;
+	u64 ts, next_evt = U64_MAX;
+	int i, irq = 0;
+
+	/*
+	 * This function must be called with the local irq disabled in
+	 * order to prevent the timings circular buffer to be updated
+	 * while we are reading it.
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+
+	/*
+	 * Number of elements in the circular buffer: If it happens it
+	 * was flushed before, then the number of elements could be
+	 * smaller than IRQ_TIMINGS_SIZE, so the count is used,
+	 * otherwise the array size is used as we wrapped. The index
+	 * begins from zero when we did not wrap. That could be done
+	 * in a nicer way with the proper circular array structure
+	 * type but with the cost of extra computation in the
+	 * interrupt handler hot path. We choose efficiency.
+	 *
+	 * Inject measured irq/timestamp to the statistical model
+	 * while decrementing the counter because we consume the data
+	 * from our circular buffer.
+	 */
+	for (i = irqts->count & IRQ_TIMINGS_MASK,
+		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
+	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+
+		irq = irq_timing_decode(irqts->values[i], &ts);
+
+		s = idr_find(&irqt_stats, irq);
+		if (s) {
+			irqs = this_cpu_ptr(s);
+			irqs_update(irqs, ts);
+		}
+	}
+
+	/*
+	 * Look in the list of interrupts' statistics, the earliest
+	 * next event.
+	 */
+	idr_for_each_entry(&irqt_stats, s, i) {
+
+		irqs = this_cpu_ptr(s);
+
+		if (!irqs->valid)
+			continue;
+
+		if (irqs->next_evt <= now) {
+			irq = i;
+			next_evt = now;
+
+			/*
+			 * This interrupt mustn't be used in the future
+			 * until new events occur and update the
+			 * statistics.
+			 */
+			irqs->valid = 0;
+			break;
+		}
+
+		if (irqs->next_evt < next_evt) {
+			irq = i;
+			next_evt = irqs->next_evt;
+		}
+	}
+
+	return next_evt;
+}
+
+void irq_timings_free(int irq)
+{
+	struct irqt_stat __percpu *s;
+
+	s = idr_find(&irqt_stats, irq);
+	if (s) {
+		free_percpu(s);
+		idr_remove(&irqt_stats, irq);
+	}
+}
+
+int irq_timings_alloc(int irq)
+{
+	struct irqt_stat __percpu *s;
+	int id;
+
+	/*
+	 * Some platforms can have the same private interrupt per cpu,
+	 * so this function may be called several times with the
+	 * same interrupt number. Just bail out in case the per cpu
+	 * stat structure is already allocated.
+	 */
+	s = idr_find(&irqt_stats, irq);
+	if (s)
+		return 0;
+
+	s = alloc_percpu(*s);
+	if (!s)
+		return -ENOMEM;
+
+	idr_preload(GFP_KERNEL);
+	id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
+	idr_preload_end();
+
+	if (id < 0) {
+		free_percpu(s);
+		return id;
+	}
+
+	return 0;
+}
-- 
cgit v1.3-14-g43fede


From f59dd9c886acb3abb188e8e94a99436560976835 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:02 -0700
Subject: time: add get_timespec64 and put_timespec64

Add helper functions to convert between struct timespec64 and
struct timespec at userspace boundaries.

This is a preparatory patch to use timespec64 as the basic type
internally in the kernel as timespec is not y2038 safe on 32 bit systems.
The patch helps the cause by containing all data conversions at the
userspace boundaries within these functions.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/compat.h |  2 ++
 include/linux/time.h   |  5 +++++
 kernel/compat.c        | 44 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/time/time.c     | 28 ++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 425563c7647b..3eb04016ffa9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -164,6 +164,8 @@ extern int compat_get_timespec(struct timespec *, const void __user *);
 extern int compat_put_timespec(const struct timespec *, void __user *);
 extern int compat_get_timeval(struct timeval *, const void __user *);
 extern int compat_put_timeval(const struct timeval *, void __user *);
+extern int compat_get_timespec64(struct timespec64 *, const void __user *);
+extern int compat_put_timespec64(const struct timespec64 *, void __user *);
 
 /*
  * This function convert a timespec if necessary and returns a *user
diff --git a/include/linux/time.h b/include/linux/time.h
index c0543f5f25de..36afb579495f 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -8,6 +8,11 @@
 
 extern struct timezone sys_tz;
 
+int get_timespec64(struct timespec64 *ts,
+		const struct timespec __user *uts);
+int put_timespec64(const struct timespec64 *ts,
+		struct timespec __user *uts);
+
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
 static inline int timespec_equal(const struct timespec *a,
diff --git a/kernel/compat.c b/kernel/compat.c
index ebd8bdc3fd68..73f26ba44a8a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
 			__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
 }
 
+static int __compat_get_timespec64(struct timespec64 *ts64,
+				   const struct compat_timespec __user *cts)
+{
+	struct compat_timespec ts;
+	int ret;
+
+	ret = copy_from_user(&ts, cts, sizeof(ts));
+	if (ret)
+		return -EFAULT;
+
+	ts64->tv_sec = ts.tv_sec;
+	ts64->tv_nsec = ts.tv_nsec;
+
+	return 0;
+}
+
+static int __compat_put_timespec64(const struct timespec64 *ts64,
+				   struct compat_timespec __user *cts)
+{
+	struct compat_timespec ts = {
+		.tv_sec = ts64->tv_sec,
+		.tv_nsec = ts64->tv_nsec
+	};
+	return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+	else
+		return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+	if (COMPAT_USE_64BIT_TIME)
+		return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+	else
+		return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
+
 int compat_get_timeval(struct timeval *tv, const void __user *utv)
 {
 	if (COMPAT_USE_64BIT_TIME)
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7c89e437c4d7..adb9853ca6b0 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -890,3 +890,31 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 
 	return res;
 }
+
+int get_timespec64(struct timespec64 *ts,
+		   const struct timespec __user *uts)
+{
+	struct timespec kts;
+	int ret;
+
+	ret = copy_from_user(&kts, uts, sizeof(kts));
+	if (ret)
+		return -EFAULT;
+
+	ts->tv_sec = kts.tv_sec;
+	ts->tv_nsec = kts.tv_nsec;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_timespec64);
+
+int put_timespec64(const struct timespec64 *ts,
+		   struct timespec __user *uts)
+{
+	struct timespec kts = {
+		.tv_sec = ts->tv_sec,
+		.tv_nsec = ts->tv_nsec
+	};
+	return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_timespec64);
-- 
cgit v1.3-14-g43fede


From d5b7ffbfbdacc29e4db035f90665951668fa9c58 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:03 -0700
Subject: time: introduce {get,put}_itimerspec64

As we change the user space type for the timerfd and posix timer
functions to newer data types, we need some form of conversion
helpers to avoid duplicating that logic.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/compat.h       |  4 ++++
 include/linux/posix-timers.h |  1 -
 include/linux/time.h         | 13 +++++++++++++
 kernel/compat.c              | 21 +++++++++++++++++++++
 kernel/time/time.c           | 30 ++++++++++++++++++++++++++++++
 5 files changed, 68 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 3eb04016ffa9..2ed54020ace0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -166,6 +166,10 @@ extern int compat_get_timeval(struct timeval *, const void __user *);
 extern int compat_put_timeval(const struct timeval *, void __user *);
 extern int compat_get_timespec64(struct timespec64 *, const void __user *);
 extern int compat_put_timespec64(const struct timespec64 *, void __user *);
+extern int get_compat_itimerspec64(struct itimerspec64 *its,
+			const struct compat_itimerspec __user *uits);
+extern int put_compat_itimerspec64(const struct itimerspec64 *its,
+			struct compat_itimerspec __user *uits);
 
 /*
  * This function convert a timespec if necessary and returns a *user
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 29f1b7f09ced..62839fd04dce 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -113,5 +113,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
 void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);
 
 void posixtimer_rearm(struct siginfo *info);
-
 #endif
diff --git a/include/linux/time.h b/include/linux/time.h
index 36afb579495f..f9858d7e6361 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -12,6 +12,10 @@ int get_timespec64(struct timespec64 *ts,
 		const struct timespec __user *uts);
 int put_timespec64(const struct timespec64 *ts,
 		struct timespec __user *uts);
+int get_itimerspec64(struct itimerspec64 *it,
+			const struct itimerspec __user *uit);
+int put_itimerspec64(const struct itimerspec64 *it,
+			struct itimerspec __user *uit);
 
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
@@ -275,4 +279,13 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns)
 	a->tv_nsec = ns;
 }
 
+static inline bool itimerspec64_valid(const struct itimerspec64 *its)
+{
+	if (!timespec64_valid(&(its->it_interval)) ||
+		!timespec64_valid(&(its->it_value)))
+		return false;
+
+	return true;
+}
+
 #endif
diff --git a/kernel/compat.c b/kernel/compat.c
index 73f26ba44a8a..a350deda503a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -586,6 +586,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 	return 0;
 }
 
+int get_compat_itimerspec64(struct itimerspec64 *its,
+			const struct compat_itimerspec __user *uits)
+{
+
+	if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
+	    __compat_get_timespec64(&its->it_value, &uits->it_value))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
+
+int put_compat_itimerspec64(const struct itimerspec64 *its,
+			struct compat_itimerspec __user *uits)
+{
+	if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
+	    __compat_put_timespec64(&its->it_value, &uits->it_value))
+		return -EFAULT;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
+
 /*
  * We currently only need the following fields from the sigevent
  * structure: sigev_value, sigev_signo, sig_notify and (sometimes
diff --git a/kernel/time/time.c b/kernel/time/time.c
index adb9853ca6b0..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -918,3 +918,33 @@ int put_timespec64(const struct timespec64 *ts,
 	return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
 }
 EXPORT_SYMBOL_GPL(put_timespec64);
+
+int get_itimerspec64(struct itimerspec64 *it,
+			const struct itimerspec __user *uit)
+{
+	int ret;
+
+	ret = get_timespec64(&it->it_interval, &uit->it_interval);
+	if (ret)
+		return ret;
+
+	ret = get_timespec64(&it->it_value, &uit->it_value);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(get_itimerspec64);
+
+int put_itimerspec64(const struct itimerspec64 *it,
+			struct itimerspec __user *uit)
+{
+	int ret;
+
+	ret = put_timespec64(&it->it_interval, &uit->it_interval);
+	if (ret)
+		return ret;
+
+	ret = put_timespec64(&it->it_value, &uit->it_value);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(put_itimerspec64);
-- 
cgit v1.3-14-g43fede


From 63a766a1780f9581e8885bdb64270a594a84f81a Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:04 -0700
Subject: posix-stubs: Conditionally include COMPAT_SYS_NI defines

These apis only need to be defined if CONFIG_COMPAT is
enabled.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/time/posix-stubs.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 38f3b20efa29..65878221cbfb 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,12 +41,6 @@ SYS_NI(setitimer);
 #ifdef __ARCH_WANT_SYS_ALARM
 SYS_NI(alarm);
 #endif
-COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
 
 /*
  * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
@@ -138,6 +132,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 }
 
 #ifdef CONFIG_COMPAT
+COMPAT_SYS_NI(timer_create);
+COMPAT_SYS_NI(clock_adjtime);
+COMPAT_SYS_NI(timer_settime);
+COMPAT_SYS_NI(timer_gettime);
+COMPAT_SYS_NI(getitimer);
+COMPAT_SYS_NI(setitimer);
+
 COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
-- 
cgit v1.3-14-g43fede


From d829b8fb2431595422289cfc210f0a955a8bec74 Mon Sep 17 00:00:00 2001
From: Jeffy Chen <jeffy.chen@rock-chips.com>
Date: Mon, 26 Jun 2017 19:33:33 +0800
Subject: genirq: Set irq masked state when initializing irq_desc

The irq default state is set to disabled when allocating irq desc, but the
masked state flag is not set. This is inconsistent vs. the state tracking
logic which is used to prevent unnecessary calls to hardware level irq chip
functions.

Set the masked state flag as well.

Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: tfiga@chromium.org
Cc: briannorris@chromium.org
Cc: dianders@chromium.org
Link: http://lkml.kernel.org/r/1498476814-12563-1-git-send-email-jeffy.chen@rock-chips.com
---
 kernel/irq/irqdesc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 35a95fadcfda..948b50e78549 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -116,6 +116,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 	desc->irq_data.chip_data = NULL;
 	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
 	desc->handle_irq = handle_bad_irq;
 	desc->depth = 1;
 	desc->irq_count = 0;
-- 
cgit v1.3-14-g43fede


From bf22ff45bed664aefb5c4e43029057a199b7070c Mon Sep 17 00:00:00 2001
From: Jeffy Chen <jeffy.chen@rock-chips.com>
Date: Mon, 26 Jun 2017 19:33:34 +0800
Subject: genirq: Avoid unnecessary low level irq function calls

Check irq state in enable/disable/unmask/mask_irq to avoid unnecessary
low level irq function calls.

This has two advantages:
    - Conditionals are faster than hardware access

    - Solves issues with the underlying refcounting of the pinctrl
      infrastructure

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: tfiga@chromium.org
Cc: briannorris@chromium.org
Cc: dianders@chromium.org
Link: http://lkml.kernel.org/r/1498476814-12563-2-git-send-email-jeffy.chen@rock-chips.com
---
 kernel/irq/chip.c | 53 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 33 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fc89eeb8a6b4..2e30d925a40d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,22 +314,32 @@ void irq_shutdown(struct irq_desc *desc)
 
 void irq_enable(struct irq_desc *desc)
 {
-	irq_state_clr_disabled(desc);
-	if (desc->irq_data.chip->irq_enable)
-		desc->irq_data.chip->irq_enable(&desc->irq_data);
-	else
-		desc->irq_data.chip->irq_unmask(&desc->irq_data);
-	irq_state_clr_masked(desc);
+	if (!irqd_irq_disabled(&desc->irq_data)) {
+		unmask_irq(desc);
+	} else {
+		irq_state_clr_disabled(desc);
+		if (desc->irq_data.chip->irq_enable) {
+			desc->irq_data.chip->irq_enable(&desc->irq_data);
+			irq_state_clr_masked(desc);
+		} else {
+			unmask_irq(desc);
+		}
+	}
 }
 
 static void __irq_disable(struct irq_desc *desc, bool mask)
 {
-	irq_state_set_disabled(desc);
-	if (desc->irq_data.chip->irq_disable) {
-		desc->irq_data.chip->irq_disable(&desc->irq_data);
-		irq_state_set_masked(desc);
-	} else if (mask) {
-		mask_irq(desc);
+	if (irqd_irq_disabled(&desc->irq_data)) {
+		if (mask)
+			mask_irq(desc);
+	} else {
+		irq_state_set_disabled(desc);
+		if (desc->irq_data.chip->irq_disable) {
+			desc->irq_data.chip->irq_disable(&desc->irq_data);
+			irq_state_set_masked(desc);
+		} else if (mask) {
+			mask_irq(desc);
+		}
 	}
 }
 
@@ -378,18 +388,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
 
 static inline void mask_ack_irq(struct irq_desc *desc)
 {
-	if (desc->irq_data.chip->irq_mask_ack)
+	if (desc->irq_data.chip->irq_mask_ack) {
 		desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
-	else {
-		desc->irq_data.chip->irq_mask(&desc->irq_data);
+		irq_state_set_masked(desc);
+	} else {
+		mask_irq(desc);
 		if (desc->irq_data.chip->irq_ack)
 			desc->irq_data.chip->irq_ack(&desc->irq_data);
 	}
-	irq_state_set_masked(desc);
 }
 
 void mask_irq(struct irq_desc *desc)
 {
+	if (irqd_irq_masked(&desc->irq_data))
+		return;
+
 	if (desc->irq_data.chip->irq_mask) {
 		desc->irq_data.chip->irq_mask(&desc->irq_data);
 		irq_state_set_masked(desc);
@@ -398,6 +411,9 @@ void mask_irq(struct irq_desc *desc)
 
 void unmask_irq(struct irq_desc *desc)
 {
+	if (!irqd_irq_masked(&desc->irq_data))
+		return;
+
 	if (desc->irq_data.chip->irq_unmask) {
 		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 		irq_state_clr_masked(desc);
@@ -411,10 +427,7 @@ void unmask_threaded_irq(struct irq_desc *desc)
 	if (chip->flags & IRQCHIP_EOI_THREADED)
 		chip->irq_eoi(&desc->irq_data);
 
-	if (chip->irq_unmask) {
-		chip->irq_unmask(&desc->irq_data);
-		irq_state_clr_masked(desc);
-	}
+	unmask_irq(desc);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 1ba5c08b58a0c21fca222f1bf2fde184aa26103f Mon Sep 17 00:00:00 2001
From: Corentin Labbe <clabbe.montjoie@gmail.com>
Date: Tue, 6 Jun 2017 14:17:39 +0200
Subject: kernel/module.c: suppress warning about unused nowarn variable

This patch fixes the following warning:
kernel/module.c: In function 'add_usage_links':
kernel/module.c:1653:6: warning: variable 'nowarn' set but not used [-Wunused-but-set-variable]

[jeyu: folded in first patch since it only swapped the function order
so that del_usage_links can be called from add_usage_links]
Signed-off-by: Corentin Labbe <clabbe.montjoie@gmail.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 3803449ca219..f546d574f436 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1666,31 +1666,36 @@ static inline void remove_notes_attrs(struct module *mod)
 }
 #endif /* CONFIG_KALLSYMS */
 
-static void add_usage_links(struct module *mod)
+static void del_usage_links(struct module *mod)
 {
 #ifdef CONFIG_MODULE_UNLOAD
 	struct module_use *use;
-	int nowarn;
 
 	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list) {
-		nowarn = sysfs_create_link(use->target->holders_dir,
-					   &mod->mkobj.kobj, mod->name);
-	}
+	list_for_each_entry(use, &mod->target_list, target_list)
+		sysfs_remove_link(use->target->holders_dir, mod->name);
 	mutex_unlock(&module_mutex);
 #endif
 }
 
-static void del_usage_links(struct module *mod)
+static int add_usage_links(struct module *mod)
 {
+	int ret = 0;
 #ifdef CONFIG_MODULE_UNLOAD
 	struct module_use *use;
 
 	mutex_lock(&module_mutex);
-	list_for_each_entry(use, &mod->target_list, target_list)
-		sysfs_remove_link(use->target->holders_dir, mod->name);
+	list_for_each_entry(use, &mod->target_list, target_list) {
+		ret = sysfs_create_link(use->target->holders_dir,
+					&mod->mkobj.kobj, mod->name);
+		if (ret)
+			break;
+	}
 	mutex_unlock(&module_mutex);
+	if (ret)
+		del_usage_links(mod);
 #endif
+	return ret;
 }
 
 static int module_add_modinfo_attrs(struct module *mod)
@@ -1801,13 +1806,18 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out_unreg_param;
 
-	add_usage_links(mod);
+	err = add_usage_links(mod);
+	if (err)
+		goto out_unreg_modinfo_attrs;
+
 	add_sect_attrs(mod, info);
 	add_notes_attrs(mod, info);
 
 	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
 	return 0;
 
+out_unreg_modinfo_attrs:
+	module_remove_modinfo_attrs(mod);
 out_unreg_param:
 	module_param_sysfs_remove(mod);
 out_unreg_holders:
-- 
cgit v1.3-14-g43fede


From 673feb9d76ab3eddde7acfd94b206e321cfc90b9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 23 Jun 2017 15:26:26 -0400
Subject: ftrace: Add :mod: caching infrastructure to trace_array

This is the start of the infrastructure work to allow for tracing module
functions before it is loaded.

Currently the following command:

  # echo :mod:some-mod > set_ftrace_filter

will enable tracing of all functions within the module "some-mod" if it is
loaded. What we want, is if the module is not loaded, that line will be
saved. When the module is loaded, then the "some-mod" will have that line
executed on it, so that the functions within it start being traced.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace.h  |  12 +++++
 2 files changed, 148 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e5841dc14b5..1867edec6269 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1293,6 +1293,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
 	FTRACE_WARN_ON(hash->count);
 }
 
+static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
+{
+	list_del(&ftrace_mod->list);
+	kfree(ftrace_mod->module);
+	kfree(ftrace_mod->func);
+	kfree(ftrace_mod);
+}
+
+static void clear_ftrace_mod_list(struct list_head *head)
+{
+	struct ftrace_mod_load *p, *n;
+
+	/* stack tracer isn't supported yet */
+	if (!head)
+		return;
+
+	mutex_lock(&ftrace_lock);
+	list_for_each_entry_safe(p, n, head, list)
+		free_ftrace_mod(p);
+	mutex_unlock(&ftrace_lock);
+}
+
 static void free_ftrace_hash(struct ftrace_hash *hash)
 {
 	if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1368,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
 	return hash;
 }
 
+/* Save a @func filter for @module on the mod_trace/mod_notrace list of @tr */
+static int ftrace_add_mod(struct trace_array *tr,
+			  const char *func, const char *module, int enable)
+{
+	struct ftrace_mod_load *ftrace_mod;
+	struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
+
+	ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
+	if (!ftrace_mod)
+		return -ENOMEM;
+
+	/* free_ftrace_mod() does list_del(); keep it safe on the error path */
+	INIT_LIST_HEAD(&ftrace_mod->list);
+	ftrace_mod->func = kstrdup(func, GFP_KERNEL);
+	ftrace_mod->module = kstrdup(module, GFP_KERNEL);
+	ftrace_mod->enable = enable;
+
+	if (!ftrace_mod->func || !ftrace_mod->module)
+		goto out_free;
+
+	list_add(&ftrace_mod->list, mod_head);
+
+	return 0;
+
+ out_free:
+	free_ftrace_mod(ftrace_mod);
+	return -ENOMEM;
+}
+
 static struct ftrace_hash *
 alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
 {
@@ -3457,6 +3508,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 {
 	struct ftrace_iterator *iter;
 	struct ftrace_hash *hash;
+	struct list_head *mod_head;
+	struct trace_array *tr = ops->private;
 	int ret = 0;
 
 	ftrace_ops_init(ops);
@@ -3478,18 +3531,23 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 
 	mutex_lock(&ops->func_hash->regex_lock);
 
-	if (flag & FTRACE_ITER_NOTRACE)
+	if (flag & FTRACE_ITER_NOTRACE) {
 		hash = ops->func_hash->notrace_hash;
-	else
+		mod_head = tr ? &tr->mod_notrace : NULL;
+	} else {
 		hash = ops->func_hash->filter_hash;
+		mod_head = tr ? &tr->mod_trace : NULL;
+	}
 
 	if (file->f_mode & FMODE_WRITE) {
 		const int size_bits = FTRACE_HASH_DEFAULT_BITS;
 
-		if (file->f_flags & O_TRUNC)
+		if (file->f_flags & O_TRUNC) {
 			iter->hash = alloc_ftrace_hash(size_bits);
-		else
+			clear_ftrace_mod_list(mod_head);
+		} else {
 			iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+		}
 
 		if (!iter->hash) {
 			trace_parser_put(&iter->parser);
@@ -3761,6 +3819,68 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
 	return ret;
 }
 
+static bool module_exists(const char *module)
+{
+	/* All modules have the symbol __this_module */
+	const char this_mod[] = "__this_module";
+	const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
+	char modname[modname_size + 1];
+	unsigned long val;
+	int n;
+
+	n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+
+	if (n > modname_size)
+		return false;
+
+	val = module_kallsyms_lookup_name(modname);
+	return val != 0;
+}
+
+static int cache_mod(struct trace_array *tr,
+		     const char *func, char *module, int enable)
+{
+	struct ftrace_mod_load *ftrace_mod, *n;
+	struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
+	int ret;
+
+	mutex_lock(&ftrace_lock);
+
+	/* We do not cache inverse filters */
+	if (func[0] == '!') {
+		func++;
+		ret = -EINVAL;
+
+		/* Look to remove this hash */
+		list_for_each_entry_safe(ftrace_mod, n, head, list) {
+			if (strcmp(ftrace_mod->module, module) != 0)
+				continue;
+
+			/* no func matches all */
+			if (!func || strcmp(func, "*") == 0 ||
+			    (ftrace_mod->func &&
+			     strcmp(ftrace_mod->func, func) == 0)) {
+				ret = 0;
+				free_ftrace_mod(ftrace_mod);
+				continue;
+			}
+		}
+		goto out;
+	}
+
+	ret = -EINVAL;
+	/* We only care about modules that have not been loaded yet */
+	if (module_exists(module))
+		goto out;
+
+	/* Save this string off, and execute it when the module is loaded */
+	ret = ftrace_add_mod(tr, func, module, enable);
+ out:
+	mutex_unlock(&ftrace_lock);
+
+	return ret;
+}
+
 /*
  * We register the module command as a template to show others how
  * to register the a command as well.
@@ -3768,10 +3888,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
 
 static int
 ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
-		    char *func, char *cmd, char *module, int enable)
+		    char *func_orig, char *cmd, char *module, int enable)
 {
+	char *func;
 	int ret;
 
+	/* match_records() modifies func, and we need the original */
+	func = kstrdup(func_orig, GFP_KERNEL);
+	if (!func)
+		return -ENOMEM;
+
 	/*
 	 * cmd == 'mod' because we only registered this func
 	 * for the 'mod' ftrace_func_command.
@@ -3780,8 +3906,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
 	 * parameter.
 	 */
 	ret = match_records(hash, func, strlen(func), module);
+	kfree(func);
+
 	if (!ret)
-		return -EINVAL;
+		return cache_mod(tr, func_orig, module, enable);
 	if (ret < 0)
 		return ret;
 	return 0;
@@ -5570,6 +5698,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 void ftrace_init_trace_array(struct trace_array *tr)
 {
 	INIT_LIST_HEAD(&tr->func_probes);
+	INIT_LIST_HEAD(&tr->mod_trace);
+	INIT_LIST_HEAD(&tr->mod_notrace);
 }
 #else
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 69a3ab3ee4f5..d63550cdbdfa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
 	struct ftrace_ops	*ops;
 	struct trace_pid_list	__rcu *function_pids;
 #ifdef CONFIG_DYNAMIC_FTRACE
+	/* All of these are protected by the ftrace_lock */
 	struct list_head	func_probes;
+	struct list_head	mod_trace;
+	struct list_head	mod_notrace;
 #endif
 	/* function tracing enabled */
 	int			function_enabled;
@@ -761,6 +764,15 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
 
 extern char trace_find_mark(unsigned long long duration);
 
+struct ftrace_hash;
+
+struct ftrace_mod_load {
+	struct list_head	list;
+	char			*func;
+	char			*module;
+	int			 enable;
+};
+
 struct ftrace_hash {
 	unsigned long		size_bits;
 	struct hlist_head	*buckets;
-- 
cgit v1.3-14-g43fede


From 5985ea8bd5d1b820b909af49fbc2767a990080a6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 23 Jun 2017 16:05:11 -0400
Subject: ftrace: Have the cached module list show in set_ftrace_filter

When writing a module filter into set_ftrace_filter for a module that is
not yet loaded, it is cached, and will be executed when the module is loaded
(although that is not implemented yet at this commit). Display the list of
cached modules to be traced.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |   3 +-
 kernel/trace/ftrace.c  | 112 +++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 102 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1b6992e994e6..9fb9a67dc9d4 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -446,7 +446,8 @@ enum {
 	FTRACE_ITER_PRINTALL	= (1 << 2),
 	FTRACE_ITER_DO_PROBES	= (1 << 3),
 	FTRACE_ITER_PROBE	= (1 << 4),
-	FTRACE_ITER_ENABLED	= (1 << 5),
+	FTRACE_ITER_MOD		= (1 << 5),
+	FTRACE_ITER_ENABLED	= (1 << 6),
 };
 
 void arch_ftrace_update_code(int command);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1867edec6269..bfdbce78064b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3112,6 +3112,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
 struct ftrace_iterator {
 	loff_t				pos;
 	loff_t				func_pos;
+	loff_t				mod_pos;
 	struct ftrace_page		*pg;
 	struct dyn_ftrace		*func;
 	struct ftrace_func_probe	*probe;
@@ -3119,6 +3120,8 @@ struct ftrace_iterator {
 	struct trace_parser		parser;
 	struct ftrace_hash		*hash;
 	struct ftrace_ops		*ops;
+	struct trace_array		*tr;
+	struct list_head		*mod_list;
 	int				pidx;
 	int				idx;
 	unsigned			flags;
@@ -3203,13 +3206,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
 	if (!(iter->flags & FTRACE_ITER_DO_PROBES))
 		return NULL;
 
-	if (iter->func_pos > *pos)
+	if (iter->mod_pos > *pos)
 		return NULL;
 
 	iter->probe = NULL;
 	iter->probe_entry = NULL;
 	iter->pidx = 0;
-	for (l = 0; l <= (*pos - iter->func_pos); ) {
+	for (l = 0; l <= (*pos - iter->mod_pos); ) {
 		p = t_probe_next(m, &l);
 		if (!p)
 			break;
@@ -3247,6 +3250,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
 	return 0;
 }
 
+static void *
+t_mod_next(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct trace_array *tr = iter->tr;
+
+	(*pos)++;
+	iter->pos = *pos;
+
+	iter->mod_list = iter->mod_list->next;
+
+	if (iter->mod_list == &tr->mod_trace ||
+	    iter->mod_list == &tr->mod_notrace) {
+		iter->flags &= ~FTRACE_ITER_MOD;
+		return NULL;
+	}
+
+	iter->mod_pos = *pos;
+
+	return iter;
+}
+
+static void *t_mod_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+	loff_t l;
+
+	if (iter->func_pos > *pos)
+		return NULL;
+
+	iter->mod_pos = iter->func_pos;
+
+	/* probes are only available if tr is set */
+	if (!iter->tr)
+		return NULL;
+
+	for (l = 0; l <= (*pos - iter->func_pos); ) {
+		p = t_mod_next(m, &l);
+		if (!p)
+			break;
+	}
+	if (!p) {
+		iter->flags &= ~FTRACE_ITER_MOD;
+		return t_probe_start(m, pos);
+	}
+
+	/* Only set this if we have an item */
+	iter->flags |= FTRACE_ITER_MOD;
+
+	return iter;
+}
+
+static int
+t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
+{
+	struct ftrace_mod_load *ftrace_mod;
+	struct trace_array *tr = iter->tr;
+
+	if (WARN_ON_ONCE(!iter->mod_list) ||
+			 iter->mod_list == &tr->mod_trace ||
+			 iter->mod_list == &tr->mod_notrace)
+		return -EIO;
+
+	ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
+
+	if (ftrace_mod->func)
+		seq_printf(m, "%s", ftrace_mod->func);
+	else
+		seq_putc(m, '*');
+
+	seq_printf(m, ":mod:%s\n", ftrace_mod->module);
+
+	return 0;
+}
+
 static void *
 t_func_next(struct seq_file *m, loff_t *pos)
 {
@@ -3288,7 +3367,7 @@ static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	loff_t l = *pos; /* t_hash_start() must use original pos */
+	loff_t l = *pos; /* t_probe_start() must use original pos */
 	void *ret;
 
 	if (unlikely(ftrace_disabled))
@@ -3297,16 +3376,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_PROBE)
 		return t_probe_next(m, pos);
 
+	if (iter->flags & FTRACE_ITER_MOD)
+		return t_mod_next(m, pos);
+
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		/* next must increment pos, and t_probe_start does not */
 		(*pos)++;
-		return t_probe_start(m, &l);
+		return t_mod_start(m, &l);
 	}
 
 	ret = t_func_next(m, pos);
 
 	if (!ret)
-		return t_probe_start(m, &l);
+		return t_mod_start(m, &l);
 
 	return ret;
 }
@@ -3315,7 +3397,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
 {
 	iter->pos = 0;
 	iter->func_pos = 0;
-	iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
+	iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
 }
 
 static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3344,15 +3426,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	    ftrace_hash_empty(iter->hash)) {
 		iter->func_pos = 1; /* Account for the message */
 		if (*pos > 0)
-			return t_probe_start(m, pos);
+			return t_mod_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
 		/* reset in case of seek/pread */
 		iter->flags &= ~FTRACE_ITER_PROBE;
 		return iter;
 	}
 
-	if (iter->flags & FTRACE_ITER_PROBE)
-		return t_probe_start(m, pos);
+	if (iter->flags & FTRACE_ITER_MOD)
+		return t_mod_start(m, pos);
 
 	/*
 	 * Unfortunately, we need to restart at ftrace_pages_start
@@ -3368,7 +3450,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	}
 
 	if (!p)
-		return t_probe_start(m, pos);
+		return t_mod_start(m, pos);
 
 	return iter;
 }
@@ -3402,6 +3484,9 @@ static int t_show(struct seq_file *m, void *v)
 	if (iter->flags & FTRACE_ITER_PROBE)
 		return t_probe_show(m, iter);
 
+	if (iter->flags & FTRACE_ITER_MOD)
+		return t_mod_show(m, iter);
+
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		if (iter->flags & FTRACE_ITER_NOTRACE)
 			seq_puts(m, "#### no functions disabled ####\n");
@@ -3528,17 +3613,20 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
 
 	iter->ops = ops;
 	iter->flags = flag;
+	iter->tr = tr;
 
 	mutex_lock(&ops->func_hash->regex_lock);
 
 	if (flag & FTRACE_ITER_NOTRACE) {
 		hash = ops->func_hash->notrace_hash;
-		mod_head = tr ? &tr->mod_trace : NULL;
+		mod_head = tr ? &tr->mod_notrace : NULL;
 	} else {
 		hash = ops->func_hash->filter_hash;
-		mod_head = tr ? &tr->mod_notrace : NULL;
+		mod_head = tr ? &tr->mod_trace : NULL;
 	}
 
+	iter->mod_list = mod_head;
+
 	if (file->f_mode & FMODE_WRITE) {
 		const int size_bits = FTRACE_HASH_DEFAULT_BITS;
 
-- 
cgit v1.3-14-g43fede


From d7fbf8df7ca0a5c7e85db79f7005f99cb461c525 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 26 Jun 2017 10:57:21 -0400
Subject: ftrace: Implement cached modules tracing on module load

If a module is cached in the set_ftrace_filter, and that module is loaded,
then enable tracing on that module as if the cached module text was written
into set_ftrace_filter just as the module is loaded.

  # echo ":mod:kvm_intel" > /sys/kernel/tracing/set_ftrace_filter
  # cat /sys/kernel/tracing/set_ftrace_filter
 #### all functions enabled ####
 :mod:kvm_intel
  # modprobe kvm_intel
  # cat /sys/kernel/tracing/set_ftrace_filter
 vmx_get_rflags [kvm_intel]
 vmx_get_pkru [kvm_intel]
 vmx_get_interrupt_shadow [kvm_intel]
 vmx_rdtscp_supported [kvm_intel]
 vmx_invpcid_supported [kvm_intel]
 [..]

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bfdbce78064b..f1ccf8be9df7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3969,6 +3969,97 @@ static int cache_mod(struct trace_array *tr,
 	return ret;
 }
 
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+		 int reset, int enable);
+
+static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
+			     char *mod, bool enable)
+{
+	struct ftrace_mod_load *ftrace_mod, *n;
+	struct ftrace_hash **orig_hash, *new_hash;
+	LIST_HEAD(process_mods);
+	char *func;
+	int ret;
+
+	mutex_lock(&ops->func_hash->regex_lock);
+
+	if (enable)
+		orig_hash = &ops->func_hash->filter_hash;
+	else
+		orig_hash = &ops->func_hash->notrace_hash;
+
+	new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
+					      *orig_hash);
+	if (!new_hash)
+		return; /* Warn? */
+
+	mutex_lock(&ftrace_lock);
+
+	list_for_each_entry_safe(ftrace_mod, n, head, list) {
+
+		if (strcmp(ftrace_mod->module, mod) != 0)
+			continue;
+
+		if (ftrace_mod->func)
+			func = kstrdup(ftrace_mod->func, GFP_KERNEL);
+		else
+			func = kstrdup("*", GFP_KERNEL);
+
+		if (!func) /* warn? */
+			continue;
+
+		list_del(&ftrace_mod->list);
+		list_add(&ftrace_mod->list, &process_mods);
+
+		/* Use the newly allocated func, as it may be "*" */
+		kfree(ftrace_mod->func);
+		ftrace_mod->func = func;
+	}
+
+	mutex_unlock(&ftrace_lock);
+
+	list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
+
+		func = ftrace_mod->func;
+
+		/* Grabs ftrace_lock, which is why we have this extra step */
+		match_records(new_hash, func, strlen(func), mod);
+		free_ftrace_mod(ftrace_mod);
+	}
+
+	mutex_lock(&ftrace_lock);
+
+	ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+					      new_hash, enable);
+	mutex_unlock(&ftrace_lock);
+
+	mutex_unlock(&ops->func_hash->regex_lock);
+
+	free_ftrace_hash(new_hash);
+}
+
+static void process_cached_mods(const char *mod_name)
+{
+	struct trace_array *tr;
+	char *mod;
+
+	mod = kstrdup(mod_name, GFP_KERNEL);
+	if (!mod)
+		return;
+
+	mutex_lock(&trace_types_lock);
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (!list_empty(&tr->mod_trace))
+			process_mod_list(&tr->mod_trace, tr->ops, mod, true);
+		if (!list_empty(&tr->mod_notrace))
+			process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
+	}
+	mutex_unlock(&trace_types_lock);
+
+	kfree(mod);
+}
+
 /*
  * We register the module command as a template to show others how
  * to register the a command as well.
@@ -5682,6 +5773,8 @@ void ftrace_module_enable(struct module *mod)
 
  out_unlock:
 	mutex_unlock(&ftrace_lock);
+
+	process_cached_mods(mod->name);
 }
 
 void ftrace_module_init(struct module *mod)
-- 
cgit v1.3-14-g43fede


From 8c08f0d5c6fb10ff93ffb1cbf416f4f1c3a52a80 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Mon, 26 Jun 2017 11:47:31 -0400
Subject: ftrace: Have cached module filters be an active filter

When a module filter is added to set_ftrace_filter, if the module is not
loaded, it is cached. This should be considered an active filter, and
function tracing should be filtered by this. That is, if a cached module
filter is the only filter set, then no function tracing should be happening,
as all the functions available will be filtered out.

This makes sense, as the reason to add a cached module filter, is to trace
the module when you load it. There shouldn't be any other tracing happening
until then.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/ftrace.c  | 20 +++++++++++++++-----
 kernel/trace/trace.h   |  7 ++++++-
 3 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9fb9a67dc9d4..5857390ac35a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -120,6 +120,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  *            this ops will fail to register or set_filter_ip.
  * PID     - Is affected by set_ftrace_pid (allows filtering on those pids)
  * RCU     - Set when the ops can only be called when RCU is watching.
+ * TRACE_ARRAY - The ops->private points to a trace_array descriptor.
  */
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
@@ -138,6 +139,7 @@ enum {
 	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
 	FTRACE_OPS_FL_PID			= 1 << 14,
 	FTRACE_OPS_FL_RCU			= 1 << 15,
+	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 16,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ccf8be9df7..914539e3e301 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1410,6 +1410,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
 	if (!new_hash)
 		return NULL;
 
+	if (hash)
+		new_hash->flags = hash->flags;
+
 	/* Empty hash? */
 	if (ftrace_hash_empty(hash))
 		return new_hash;
@@ -1454,7 +1457,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
 	/*
 	 * If the new source is empty, just return the empty_hash.
 	 */
-	if (!src->count)
+	if (ftrace_hash_empty(src))
 		return EMPTY_HASH;
 
 	/*
@@ -1471,6 +1474,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
 	if (!new_hash)
 		return NULL;
 
+	new_hash->flags = src->flags;
+
 	size = 1 << src->size_bits;
 	for (i = 0; i < size; i++) {
 		hhd = &src->buckets[i];
@@ -1701,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 	struct dyn_ftrace *rec;
 	bool update = false;
 	int count = 0;
-	int all = 0;
+	int all = false;
 
 	/* Only update if the ops has been registered */
 	if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1722,7 +1727,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		hash = ops->func_hash->filter_hash;
 		other_hash = ops->func_hash->notrace_hash;
 		if (ftrace_hash_empty(hash))
-			all = 1;
+			all = true;
 	} else {
 		inc = !inc;
 		hash = ops->func_hash->notrace_hash;
@@ -4028,6 +4033,9 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 		free_ftrace_mod(ftrace_mod);
 	}
 
+	if (enable && list_empty(head))
+		new_hash->flags &= ~FTRACE_HASH_FL_MOD;
+
 	mutex_lock(&ftrace_lock);
 
 	ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
@@ -5035,9 +5043,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 	if (file->f_mode & FMODE_WRITE) {
 		filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
 
-		if (filter_hash)
+		if (filter_hash) {
 			orig_hash = &iter->ops->func_hash->filter_hash;
-		else
+			if (!list_empty(&iter->tr->mod_trace))
+				iter->hash->flags |= FTRACE_HASH_FL_MOD;
+		} else
 			orig_hash = &iter->ops->func_hash->notrace_hash;
 
 		mutex_lock(&ftrace_lock);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d63550cdbdfa..13823951e42b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -773,10 +773,15 @@ struct ftrace_mod_load {
 	int			 enable;
 };
 
+enum {
+	FTRACE_HASH_FL_MOD	= (1 << 0),
+};
+
 struct ftrace_hash {
 	unsigned long		size_bits;
 	struct hlist_head	*buckets;
 	unsigned long		count;
+	unsigned long		flags;
 	struct rcu_head		rcu;
 };
 
@@ -785,7 +790,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
 
 static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
 {
-	return !hash || !hash->count;
+	return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
 }
 
 /* Standard output formatting function used for function return traces */
-- 
cgit v1.3-14-g43fede


From 131b63515932d18a3b1a60db3958f3c0dd5462bc Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Feb 2017 09:24:24 -0800
Subject: seccomp: Clean up core dump logic

This just cleans up the core dumping logic to avoid the braces around
the RET_KILL case.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 65f61077ad50..fce83885b7ef 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		return 0;
 
 	case SECCOMP_RET_KILL:
-	default: {
-		siginfo_t info;
+	default:
 		audit_seccomp(this_syscall, SIGSYS, action);
 		/* Dump core only if this is the last remaining thread. */
 		if (get_nr_threads(current) == 1) {
+			siginfo_t info;
+
 			/* Show the original registers in the dump. */
 			syscall_rollback(current, task_pt_regs(current));
 			/* Trigger a manual coredump since do_exit skips it. */
@@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		}
 		do_exit(SIGSYS);
 	}
-	}
 
 	unreachable();
 
-- 
cgit v1.3-14-g43fede


From 0b5fa2290637a3235898d18dc0e7a136783f1bd2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 26 Jun 2017 09:24:00 -0700
Subject: seccomp: Switch from atomic_t to refcount_t

This switches the seccomp usage tracking from atomic_t to refcount_t to
gain refcount overflow protections.

Cc: Elena Reshetova <elena.reshetova@intel.com>
Cc: David Windsor <dwindsor@gmail.com>
Cc: Hans Liljestrand <hans.liljestrand@aalto.fi>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/seccomp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index fce83885b7ef..98b59b5db90b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,7 +13,7 @@
  *        of Berkeley Packet Filters/Linux Socket Filters.
  */
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 #include <linux/audit.h>
 #include <linux/compat.h>
 #include <linux/coredump.h>
@@ -56,7 +56,7 @@
  * to a task_struct (other than @usage).
  */
 struct seccomp_filter {
-	atomic_t usage;
+	refcount_t usage;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
 };
@@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 		return ERR_PTR(ret);
 	}
 
-	atomic_set(&sfilter->usage, 1);
+	refcount_set(&sfilter->usage, 1);
 
 	return sfilter;
 }
@@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk)
 	if (!orig)
 		return;
 	/* Reference count is bounded by the number of total processes. */
-	atomic_inc(&orig->usage);
+	refcount_inc(&orig->usage);
 }
 
 static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk)
 {
 	struct seccomp_filter *orig = tsk->seccomp.filter;
 	/* Clean up single-reference branches iteratively. */
-	while (orig && atomic_dec_and_test(&orig->usage)) {
+	while (orig && refcount_dec_and_test(&orig->usage)) {
 		struct seccomp_filter *freeme = orig;
 		orig = orig->prev;
 		seccomp_filter_free(freeme);
-- 
cgit v1.3-14-g43fede


From 49368a47f6dc1b256e3b83813da5c9b0731fe268 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Sat, 3 Jun 2017 20:52:32 +1000
Subject: PM / hibernate: Use CONFIG_HAVE_SET_MEMORY for include condition

Kbuild reported a build failure when CONFIG_STRICT_KERNEL_RWX was
enabled on powerpc. We don't yet have ARCH_HAS_SET_MEMORY and ppc32
saw a build failure.

I've only done a basic compile test with a config that has
hibernation enabled.

Fixes: 50327ddfbc92 (kernel/power/snapshot.c: use set_memory.h header)
Reported-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fa46606f3356..71730d672290 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -36,13 +36,13 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
-#ifdef CONFIG_STRICT_KERNEL_RWX
+#ifdef CONFIG_ARCH_HAS_SET_MEMORY
 #include <asm/set_memory.h>
 #endif
 
 #include "power.h"
 
-#ifdef CONFIG_STRICT_KERNEL_RWX
+#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY)
 static bool hibernate_restore_protection;
 static bool hibernate_restore_protection_active;
 
@@ -77,7 +77,7 @@ static inline void hibernate_restore_protection_begin(void) {}
 static inline void hibernate_restore_protection_end(void) {}
 static inline void hibernate_restore_protect_page(void *page_address) {}
 static inline void hibernate_restore_unprotect_page(void *page_address) {}
-#endif /* CONFIG_STRICT_KERNEL_RWX */
+#endif /* CONFIG_STRICT_KERNEL_RWX  && CONFIG_ARCH_HAS_SET_MEMORY */
 
 static int swsusp_page_is_free(struct page *);
 static void swsusp_set_page_forbidden(struct page *);
-- 
cgit v1.3-14-g43fede


From eba74c294467d55c697e2199c37dfaf8126fe396 Mon Sep 17 00:00:00 2001
From: BaoJun Luo <baojun.luo@samsung.com>
Date: Tue, 27 Jun 2017 02:10:44 +0200
Subject: PM / hibernate: Drop redundant parameter of swsusp_alloc()

The first parameter of swsusp_alloc is not used, so drop it.

Signed-off-by: BaoJun Luo <baojun.luo@samsung.com>
[ rjw: Subject & changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/snapshot.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 71730d672290..b7708e319941 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1929,8 +1929,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
  * also be located in the high memory, because of the way in which
  * copy_data_pages() works.
  */
-static int swsusp_alloc(struct memory_bitmap *orig_bm,
-			struct memory_bitmap *copy_bm,
+static int swsusp_alloc(struct memory_bitmap *copy_bm,
 			unsigned int nr_pages, unsigned int nr_highmem)
 {
 	if (nr_highmem > 0) {
@@ -1976,7 +1975,7 @@ asmlinkage __visible int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
 		printk(KERN_ERR "PM: Memory allocation failed\n");
 		return -ENOMEM;
 	}
-- 
cgit v1.3-14-g43fede


From 6a9c981b1e9657ca5866d10aa38b8a4fe1159138 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 27 Jun 2017 11:02:49 -0400
Subject: ftrace: Remove unused function ftrace_arch_read_dyn_info()

ftrace_arch_read_dyn_info() was used so that archs could add its own debug
information into the dyn_ftrace_total_info in the tracefs file system. That
file is for debugging usage of dynamic ftrace. No arch uses that function
anymore, so just get rid of it.

This also allows for tracing_read_dyn_info() to be cleaned up a bit.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 19ac2088d10a..14318ce92b13 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6737,33 +6737,18 @@ static const struct file_operations tracing_stats_fops = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-int __weak ftrace_arch_read_dyn_info(char *buf, int size)
-{
-	return 0;
-}
-
 static ssize_t
 tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
-	static char ftrace_dyn_info_buffer[1024];
-	static DEFINE_MUTEX(dyn_info_mutex);
 	unsigned long *p = filp->private_data;
-	char *buf = ftrace_dyn_info_buffer;
-	int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+	char buf[64]; /* Not too big for a shallow stack */
 	int r;
 
-	mutex_lock(&dyn_info_mutex);
-	r = sprintf(buf, "%ld ", *p);
-
-	r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+	r = scnprintf(buf, 63, "%ld", *p);
 	buf[r++] = '\n';
 
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	mutex_unlock(&dyn_info_mutex);
-
-	return r;
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
 static const struct file_operations tracing_dyn_info_fops = {
-- 
cgit v1.3-14-g43fede


From 83dd14933e33a45e9b366c572e15505982b46845 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 27 Jun 2017 11:04:40 -0400
Subject: ftrace: Decrement count for dyn_ftrace_total_info file

The dyn_ftrace_total_info file is used to show how many functions have been
converted into nops and can be used by ftrace. The problem is that it does
not get decremented when functions are removed (init boot code being freed,
and modules being freed). That means the number is very inaccurate every time
functions are removed from the ftrace tables. Decrement it when functions
are removed.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 914539e3e301..7509ef9810bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5705,6 +5705,7 @@ void ftrace_release_mod(struct module *mod)
 			if (pg == ftrace_pages)
 				ftrace_pages = next_to_ftrace_page(last_pg);
 
+			ftrace_update_tot_cnt -= pg->index;
 			*last_pg = pg->next;
 			order = get_count_order(pg->size / ENTRIES_PER_PAGE);
 			free_pages((unsigned long)pg->records, order);
-- 
cgit v1.3-14-g43fede


From d914ba37d7145acb9fd3bb23075c2d56e5a44eb6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Mon, 26 Jun 2017 19:01:55 -0700
Subject: tracing: Add support for recording tgid of tasks

In order to support recording of tgid, the following changes are made:

* Introduce a new API (tracing_record_taskinfo) to additionally record the tgid
  along with the task's comm at the same time. This has the benefit of not
  setting trace_cmdline_save before all the information for a task is saved.
* Add a new API tracing_record_taskinfo_sched_switch to record task information
  for 2 tasks at a time (previous and next) and use it from sched_switch probe.
* Preserve the old API (tracing_record_cmdline) and create it as a wrapper
  around the new one so that existing callers aren't affected.
* Reuse the existing sched_switch and sched_wakeup probes to record tgid
  information and add a new option 'record-tgid' to enable recording of tgid

When the record-tgid option isn't enabled to begin with, we take care to make
sure that there isn't any memory or runtime overhead.

Link: http://lkml.kernel.org/r/20170627020155.5139-1-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Tested-by: Michael Sartain <mikesart@gmail.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h      |  13 ++++-
 kernel/trace/trace.c              | 105 ++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.h              |   7 +++
 kernel/trace/trace_events.c       |  42 ++++++++++++++-
 kernel/trace/trace_sched_switch.c |  72 +++++++++++++++++++++-----
 5 files changed, 213 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index a556805eff8a..f73cedfa2e0b 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -151,7 +151,15 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				int type, unsigned long len,
 				unsigned long flags, int pc);
 
-void tracing_record_cmdline(struct task_struct *tsk);
+#define TRACE_RECORD_CMDLINE	BIT(0)
+#define TRACE_RECORD_TGID	BIT(1)
+
+void tracing_record_taskinfo(struct task_struct *task, int flags);
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+					  struct task_struct *next, int flags);
+
+void tracing_record_cmdline(struct task_struct *task);
+void tracing_record_tgid(struct task_struct *task);
 
 int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...);
 
@@ -290,6 +298,7 @@ struct trace_subsystem_dir;
 enum {
 	EVENT_FILE_FL_ENABLED_BIT,
 	EVENT_FILE_FL_RECORDED_CMD_BIT,
+	EVENT_FILE_FL_RECORDED_TGID_BIT,
 	EVENT_FILE_FL_FILTERED_BIT,
 	EVENT_FILE_FL_NO_SET_FILTER_BIT,
 	EVENT_FILE_FL_SOFT_MODE_BIT,
@@ -303,6 +312,7 @@ enum {
  * Event file flags:
  *  ENABLED	  - The event is enabled
  *  RECORDED_CMD  - The comms should be recorded at sched_switch
+ *  RECORDED_TGID - The tgids should be recorded at sched_switch
  *  FILTERED	  - The event has a filter attached
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
  *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
@@ -315,6 +325,7 @@ enum {
 enum {
 	EVENT_FILE_FL_ENABLED		= (1 << EVENT_FILE_FL_ENABLED_BIT),
 	EVENT_FILE_FL_RECORDED_CMD	= (1 << EVENT_FILE_FL_RECORDED_CMD_BIT),
+	EVENT_FILE_FL_RECORDED_TGID	= (1 << EVENT_FILE_FL_RECORDED_TGID_BIT),
 	EVENT_FILE_FL_FILTERED		= (1 << EVENT_FILE_FL_FILTERED_BIT),
 	EVENT_FILE_FL_NO_SET_FILTER	= (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
 	EVENT_FILE_FL_SOFT_MODE		= (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 14318ce92b13..ab9db750dd29 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
  * tracing is active, only save the comm when a trace event
  * occurred.
  */
-static DEFINE_PER_CPU(bool, trace_cmdline_save);
+static DEFINE_PER_CPU(bool, trace_taskinfo_save);
 
 /*
  * Kill all tracing for good (never come back).
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
 static __always_inline void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
-	__this_cpu_write(trace_cmdline_save, true);
+	__this_cpu_write(trace_taskinfo_save, true);
 
 	/* If this is the temp buffer, we need to commit fully */
 	if (this_cpu_read(trace_buffered_event) == event) {
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
 	}
 }
 
+static int *tgid_map;
+
 #define SAVED_CMDLINES_DEFAULT 128
 #define NO_CMDLINE_MAP UINT_MAX
 static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
 static struct saved_cmdlines_buffer *savedcmd;
 
 /* temporary disable recording */
-static atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_taskinfo_disabled __read_mostly;
 
 static inline char *get_saved_cmdlines(int idx)
 {
@@ -1990,16 +1992,87 @@ void trace_find_cmdline(int pid, char comm[])
 	preempt_enable();
 }
 
-void tracing_record_cmdline(struct task_struct *tsk)
+int trace_find_tgid(int pid)
+{
+	if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
+		return 0;
+
+	return tgid_map[pid];
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
 {
-	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+	if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT))
+		return 0;
+
+	tgid_map[tsk->pid] = tsk->tgid;
+	return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+	if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+		return true;
+	if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
+		return true;
+	if (!__this_cpu_read(trace_taskinfo_save))
+		return true;
+	return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task  - task to record
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ *        - TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+	if (tracing_record_taskinfo_skip(flags))
+		return;
+	if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task))
+		return;
+	if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task))
 		return;
 
-	if (!__this_cpu_read(trace_cmdline_save))
+	__this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev - previous task during sched_switch
+ * @next - next task during sched_switch
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ *          TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+					  struct task_struct *next, int flags)
+{
+	if (tracing_record_taskinfo_skip(flags))
 		return;
 
-	if (trace_save_cmdline(tsk))
-		__this_cpu_write(trace_cmdline_save, false);
+	if ((flags & TRACE_RECORD_CMDLINE) &&
+	    (!trace_save_cmdline(prev) || !trace_save_cmdline(next)))
+		return;
+
+	if ((flags & TRACE_RECORD_TGID) &&
+	    (!trace_save_tgid(prev) || !trace_save_tgid(next)))
+		return;
+
+	__this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+	tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+	tracing_record_taskinfo(task, TRACE_RECORD_TGID);
 }
 
 /*
@@ -3144,7 +3217,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 #endif
 
 	if (!iter->snapshot)
-		atomic_inc(&trace_record_cmdline_disabled);
+		atomic_inc(&trace_record_taskinfo_disabled);
 
 	if (*pos != iter->pos) {
 		iter->ent = NULL;
@@ -3189,7 +3262,7 @@ static void s_stop(struct seq_file *m, void *p)
 #endif
 
 	if (!iter->snapshot)
-		atomic_dec(&trace_record_cmdline_disabled);
+		atomic_dec(&trace_record_taskinfo_disabled);
 
 	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
@@ -4236,6 +4309,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
 
+	if (mask == TRACE_ITER_RECORD_TGID) {
+		if (!tgid_map)
+			tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
+					   GFP_KERNEL);
+		if (!tgid_map) {
+			tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+			return -ENOMEM;
+		}
+
+		trace_event_enable_tgid_record(enabled);
+	}
+
 	if (mask == TRACE_ITER_EVENT_FORK)
 		trace_event_follow_fork(tr, enabled);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 13823951e42b..6ade1c55cc3a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -640,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
+void tracing_start_tgid_record(void);
+void tracing_stop_tgid_record(void);
+
 int register_tracer(struct tracer *type);
 int is_tracing_stopped(void);
 
@@ -700,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
 extern u64 ftrace_now(int cpu);
 
 extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
 extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -1124,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(CONTEXT_INFO,		"context-info"),   /* Print pid/cpu/time */ \
 		C(LATENCY_FMT,		"latency-format"),	\
 		C(RECORD_CMD,		"record-cmd"),		\
+		C(RECORD_TGID,		"record-tgid"),		\
 		C(OVERWRITE,		"overwrite"),		\
 		C(STOP_ON_FREE,		"disable_on_free"),	\
 		C(IRQ_INFO,		"irq-info"),		\
@@ -1440,6 +1445,8 @@ struct ftrace_event_field *
 trace_find_event_field(struct trace_event_call *call, char *name);
 
 extern void trace_event_enable_cmd_record(bool enable);
+extern void trace_event_enable_tgid_record(bool enable);
+
 extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
 extern int event_trace_del_tracer(struct trace_array *tr);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 83dfd0dbbbfe..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
 	mutex_unlock(&event_mutex);
 }
 
+void trace_event_enable_tgid_record(bool enable)
+{
+	struct trace_event_file *file;
+	struct trace_array *tr;
+
+	mutex_lock(&event_mutex);
+	do_for_each_event_file(tr, file) {
+		if (!(file->flags & EVENT_FILE_FL_ENABLED))
+			continue;
+
+		if (enable) {
+			tracing_start_tgid_record();
+			set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+		} else {
+			tracing_stop_tgid_record();
+			clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
+				  &file->flags);
+		}
+	} while_for_each_event_file();
+	mutex_unlock(&event_mutex);
+}
+
 static int __ftrace_event_enable_disable(struct trace_event_file *file,
 					 int enable, int soft_disable)
 {
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 				tracing_stop_cmdline_record();
 				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
+
+			if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
+				tracing_stop_tgid_record();
+				clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+			}
+
 			call->class->reg(call, TRACE_REG_UNREGISTER, file);
 		}
 		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		}
 
 		if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
+			bool cmd = false, tgid = false;
 
 			/* Keep the event disabled, when going to SOFT_MODE. */
 			if (soft_disable)
 				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 
 			if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+				cmd = true;
 				tracing_start_cmdline_record();
 				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
+
+			if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+				tgid = true;
+				tracing_start_tgid_record();
+				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+			}
+
 			ret = call->class->reg(call, TRACE_REG_REGISTER, file);
 			if (ret) {
-				tracing_stop_cmdline_record();
+				if (cmd)
+					tracing_stop_cmdline_record();
+				if (tgid)
+					tracing_stop_tgid_record();
 				pr_info("event trace: Could not enable event "
 					"%s\n", trace_event_name(call));
 				break;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
 
 #include "trace.h"
 
-static int			sched_ref;
+#define RECORD_CMDLINE	1
+#define RECORD_TGID	2
+
+static int		sched_cmdline_ref;
+static int		sched_tgid_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
 		   struct task_struct *prev, struct task_struct *next)
 {
-	if (unlikely(!sched_ref))
-		return;
+	int flags;
+
+	flags = (RECORD_TGID * !!sched_tgid_ref) +
+		(RECORD_CMDLINE * !!sched_cmdline_ref);
 
-	tracing_record_cmdline(prev);
-	tracing_record_cmdline(next);
+	if (!flags)
+		return;
+	tracing_record_taskinfo_sched_switch(prev, next, flags);
 }
 
 static void
 probe_sched_wakeup(void *ignore, struct task_struct *wakee)
 {
-	if (unlikely(!sched_ref))
-		return;
+	int flags;
+
+	flags = (RECORD_TGID * !!sched_tgid_ref) +
+		(RECORD_CMDLINE * !!sched_cmdline_ref);
 
-	tracing_record_cmdline(current);
+	if (!flags)
+		return;
+	tracing_record_taskinfo(current, flags);
 }
 
 static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
 	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
 }
 
-static void tracing_start_sched_switch(void)
+static void tracing_start_sched_switch(int ops)
 {
+	bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
 	mutex_lock(&sched_register_mutex);
-	if (!(sched_ref++))
+
+	switch (ops) {
+	case RECORD_CMDLINE:
+		sched_cmdline_ref++;
+		break;
+
+	case RECORD_TGID:
+		sched_tgid_ref++;
+		break;
+	}
+
+	if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
 		tracing_sched_register();
 	mutex_unlock(&sched_register_mutex);
 }
 
-static void tracing_stop_sched_switch(void)
+static void tracing_stop_sched_switch(int ops)
 {
 	mutex_lock(&sched_register_mutex);
-	if (!(--sched_ref))
+
+	switch (ops) {
+	case RECORD_CMDLINE:
+		sched_cmdline_ref--;
+		break;
+
+	case RECORD_TGID:
+		sched_tgid_ref--;
+		break;
+	}
+
+	if (!sched_cmdline_ref && !sched_tgid_ref)
 		tracing_sched_unregister();
 	mutex_unlock(&sched_register_mutex);
 }
 
 void tracing_start_cmdline_record(void)
 {
-	tracing_start_sched_switch();
+	tracing_start_sched_switch(RECORD_CMDLINE);
 }
 
 void tracing_stop_cmdline_record(void)
 {
-	tracing_stop_sched_switch();
+	tracing_stop_sched_switch(RECORD_CMDLINE);
+}
+
+void tracing_start_tgid_record(void)
+{
+	tracing_start_sched_switch(RECORD_TGID);
+}
+
+void tracing_stop_tgid_record(void)
+{
+	tracing_stop_sched_switch(RECORD_TGID);
 }
-- 
cgit v1.3-14-g43fede


From 441dae8f2f2975c68101a84bc3f528ec95ecf7c3 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Sun, 25 Jun 2017 22:38:43 -0700
Subject: tracing: Add support for display of tgid in trace output

Earlier patches introduced ability to record the tgid using the 'record-tgid'
option. Here we read the tgid and output it if the option is enabled.

Link: http://lkml.kernel.org/r/20170626053844.5746-3-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Tested-by: Michael Sartain <mikesart@gmail.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 36 ++++++++++++++++++++++--------------
 kernel/trace/trace_output.c |  9 +++++++++
 2 files changed, 31 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ab9db750dd29..c579dea4a0eb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3319,23 +3319,29 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
 	seq_puts(m, "#\n");
 }
 
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+				   unsigned int flags)
 {
+	bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
 	print_event_info(buf, m);
-	seq_puts(m, "#           TASK-PID   CPU#      TIMESTAMP  FUNCTION\n"
-		    "#              | |       |          |         |\n");
+
+	seq_printf(m, "#           TASK-PID   CPU#   %s  TIMESTAMP  FUNCTION\n", tgid ? "TGID     " : "");
+	seq_printf(m, "#              | |       |    %s     |         |\n",	 tgid ? "  |      " : "");
 }
 
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+				       unsigned int flags)
 {
-	print_event_info(buf, m);
-	seq_puts(m, "#                              _-----=> irqs-off\n"
-		    "#                             / _----=> need-resched\n"
-		    "#                            | / _---=> hardirq/softirq\n"
-		    "#                            || / _--=> preempt-depth\n"
-		    "#                            ||| /     delay\n"
-		    "#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION\n"
-		    "#              | |       |   ||||       |         |\n");
+	bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
+	seq_printf(m, "#                          %s  _-----=> irqs-off\n",	    tgid ? "          " : "");
+	seq_printf(m, "#                          %s / _----=> need-resched\n",	    tgid ? "          " : "");
+	seq_printf(m, "#                          %s| / _---=> hardirq/softirq\n",  tgid ? "          " : "");
+	seq_printf(m, "#                          %s|| / _--=> preempt-depth\n",    tgid ? "          " : "");
+	seq_printf(m, "#                          %s||| /     delay\n",		    tgid ? "          " : "");
+	seq_printf(m, "#           TASK-PID   CPU#%s||||    TIMESTAMP  FUNCTION\n", tgid ? "   TGID   " : "");
+	seq_printf(m, "#              | |       | %s||||       |         |\n",	    tgid ? "     |    " : "");
 }
 
 void
@@ -3651,9 +3657,11 @@ void trace_default_header(struct seq_file *m)
 	} else {
 		if (!(trace_flags & TRACE_ITER_VERBOSE)) {
 			if (trace_flags & TRACE_ITER_IRQ_INFO)
-				print_func_help_header_irq(iter->trace_buffer, m);
+				print_func_help_header_irq(iter->trace_buffer,
+							   m, trace_flags);
 			else
-				print_func_help_header(iter->trace_buffer, m);
+				print_func_help_header(iter->trace_buffer, m,
+						       trace_flags);
 		}
 	}
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 01ff99969ca7..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -597,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
 	trace_seq_printf(s, "%16s-%-5d [%03d] ",
 			       comm, entry->pid, iter->cpu);
 
+	if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+		unsigned int tgid = trace_find_tgid(entry->pid);
+
+		if (!tgid)
+			trace_seq_printf(s, "(-----) ");
+		else
+			trace_seq_printf(s, "(%5d) ", tgid);
+	}
+
 	if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
 		trace_print_lat_fmt(s, entry);
 
-- 
cgit v1.3-14-g43fede


From 93437353daeff31bd5b11810daa4d2d509d1a64e Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Fri, 26 May 2017 14:12:25 -0700
Subject: module: use list_for_each_entry_rcu() on find_module_all()

The module list has been using RCU in a lot of other calls
for a while now, we just overlooked changing this one over to
use RCU.

Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index f546d574f436..afc6ede7bcdf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -603,7 +603,7 @@ static struct module *find_module_all(const char *name, size_t len,
 
 	module_assert_mutex_or_preempt();
 
-	list_for_each_entry(mod, &modules, list) {
+	list_for_each_entry_rcu(mod, &modules, list) {
 		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
 			continue;
 		if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
-- 
cgit v1.3-14-g43fede


From 165d1cc0074b2f938586274776d029b9bce914c4 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Fri, 23 Jun 2017 12:19:12 -0700
Subject: kmod: reduce atomic operations on kmod_concurrent and simplify

When checking if we want to allow a kmod thread to kick off we increment,
then read to see if we should enable a thread. If we were over the allowed
limit we decrement. Splitting the increment far apart from the decrement
means there could be a time where two increments happen, potentially
giving a false failure on a thread which should have been allowed.

CPU1			CPU2
atomic_inc()
			atomic_inc()
atomic_read()
			atomic_read()
atomic_dec()
			atomic_dec()

In this case a read on CPU1 gets the atomic_inc()'s and we could negate
it from getting a kmod thread. We could try to prevent this with a lock
or preemption but that is overkill. We can fix this by reducing the number of
atomic operations. We do this by inverting the logic of the enabler:
instead of incrementing kmod_concurrent as we get new kmod users, define the
variable kmod_concurrent_max as the max number of currently allowed kmod
users and, as we get new kmod users, just decrement it if it's still positive.
This combines the dec and read in one atomic operation.

In this case we no longer get the same false failure:

CPU1			CPU2
atomic_dec_if_positive()
			atomic_dec_if_positive()
atomic_inc()
			atomic_inc()

The number of threads is computed at init, and since the current computation
of kmod_concurrent includes the thread count we can avoid setting
kmod_concurrent_max later in boot through an init call by simply sticking to
50 as the kmod_concurrent_max. The assumption here is a system with modules
must at least have ~16 MiB of RAM.

Suggested-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/kmod.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..ff68198fe83b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
 
 #include <trace/events/module.h>
 
-extern int max_threads;
-
 #define CAP_BSET	(void *)1
 #define CAP_PI		(void *)2
 
@@ -56,6 +54,20 @@ static DEFINE_SPINLOCK(umh_sysctl_lock);
 static DECLARE_RWSEM(umhelper_sem);
 
 #ifdef CONFIG_MODULES
+/*
+ * Assuming:
+ *
+ * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ *		       (u64) THREAD_SIZE * 8UL);
+ *
+ * If you need less than 50 threads would mean we're dealing with systems
+ * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
+ * and this would only be an upper limit, after which the OOM killer
+ * would take effect. Systems like these are very unlikely if modules are
+ * enabled.
+ */
+#define MAX_KMOD_CONCURRENT 50
+static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
 
 /*
 	modprobe_path is set via /proc/sys.
@@ -127,10 +139,7 @@ int __request_module(bool wait, const char *fmt, ...)
 {
 	va_list args;
 	char module_name[MODULE_NAME_LEN];
-	unsigned int max_modprobes;
 	int ret;
-	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
-#define MAX_KMOD_CONCURRENT 50	/* Completely arbitrary value - KAO */
 	static int kmod_loop_msg;
 
 	/*
@@ -154,21 +163,7 @@ int __request_module(bool wait, const char *fmt, ...)
 	if (ret)
 		return ret;
 
-	/* If modprobe needs a service that is in a module, we get a recursive
-	 * loop.  Limit the number of running kmod threads to max_threads/2 or
-	 * MAX_KMOD_CONCURRENT, whichever is the smaller.  A cleaner method
-	 * would be to run the parents of this process, counting how many times
-	 * kmod was invoked.  That would mean accessing the internals of the
-	 * process tables to get the command line, proc_pid_cmdline is static
-	 * and it is not worth changing the proc code just to handle this case. 
-	 * KAO.
-	 *
-	 * "trace the ppid" is simple, but will fail if someone's
-	 * parent exits.  I think this is as good as it gets. --RR
-	 */
-	max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
-	atomic_inc(&kmod_concurrent);
-	if (atomic_read(&kmod_concurrent) > max_modprobes) {
+	if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
 		/* We may be blaming an innocent here, but unlikely */
 		if (kmod_loop_msg < 5) {
 			printk(KERN_ERR
@@ -176,7 +171,6 @@ int __request_module(bool wait, const char *fmt, ...)
 			       module_name);
 			kmod_loop_msg++;
 		}
-		atomic_dec(&kmod_concurrent);
 		return -ENOMEM;
 	}
 
@@ -184,10 +178,12 @@ int __request_module(bool wait, const char *fmt, ...)
 
 	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 
-	atomic_dec(&kmod_concurrent);
+	atomic_inc(&kmod_concurrent_max);
+
 	return ret;
 }
 EXPORT_SYMBOL(__request_module);
+
 #endif /* CONFIG_MODULES */
 
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-- 
cgit v1.3-14-g43fede


From 3b58a3c72f484393c65995a551902945f5a18c70 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 28 Jun 2017 09:09:38 -0400
Subject: ftrace: Unlock hash mutex on failed allocation in process_mod_list()

If the new_hash fails to allocate, then unlock the hash mutex on error.

Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7509ef9810bf..2c79630cd267 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3997,7 +3997,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 	new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
 					      *orig_hash);
 	if (!new_hash)
-		return; /* Warn? */
+		goto out; /* warn? */
 
 	mutex_lock(&ftrace_lock);
 
@@ -4042,6 +4042,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 					      new_hash, enable);
 	mutex_unlock(&ftrace_lock);
 
+ out:
 	mutex_unlock(&ops->func_hash->regex_lock);
 
 	free_ftrace_hash(new_hash);
-- 
cgit v1.3-14-g43fede


From 4ec78467858739c0119569c0610676aa50dfa8fb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 28 Jun 2017 11:57:03 -0400
Subject: ftrace: Decrement count for dyn_ftrace_total_info for init functions

Init boot up functions may be traced, but they are also freed when the
kernel finishes booting. These are removed from the ftrace tables, and the
debug variable for dyn_ftrace_total_info needs to reflect that as well.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2c79630cd267..e392f750a1cf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5825,6 +5825,7 @@ void __init ftrace_free_init_mem(void)
 		if (!rec)
 			continue;
 		pg->index--;
+		ftrace_update_tot_cnt--;
 		if (!pg->index) {
 			*last_pg = pg->next;
 			order = get_count_order(pg->size / ENTRIES_PER_PAGE);
-- 
cgit v1.3-14-g43fede


From 824ecbe01c5d833b8c8a371c209e3ac3a76cd18a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 25 Jun 2017 00:27:59 -0400
Subject: cgroup: restructure cgroup_procs_write_permission()

Restructure cgroup_procs_write_permission() to make extending
permission logic easier.

This patch doesn't cause any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 57 +++++++++++++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index dbfd7028b1c6..d48069ee84c2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2363,27 +2363,12 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct cgroup *dst_cgrp,
 					 struct kernfs_open_file *of)
 {
-	int ret = 0;
-
-	if (cgroup_on_dfl(dst_cgrp)) {
-		struct super_block *sb = of->file->f_path.dentry->d_sb;
-		struct cgroup *cgrp;
-		struct inode *inode;
-
-		spin_lock_irq(&css_set_lock);
-		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		spin_unlock_irq(&css_set_lock);
-
-		while (!cgroup_is_descendant(dst_cgrp, cgrp))
-			cgrp = cgroup_parent(cgrp);
+	struct super_block *sb = of->file->f_path.dentry->d_sb;
+	struct cgroup *src_cgrp, *com_cgrp;
+	struct inode *inode;
+	int ret;
 
-		ret = -ENOMEM;
-		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
-		if (inode) {
-			ret = inode_permission(inode, MAY_WRITE);
-			iput(inode);
-		}
-	} else {
+	if (!cgroup_on_dfl(dst_cgrp)) {
 		const struct cred *cred = current_cred();
 		const struct cred *tcred = get_task_cred(task);
 
@@ -2391,14 +2376,38 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		 * even if we're attaching all tasks in the thread group,
 		 * we only need to check permissions on one of them.
 		 */
-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid))
+		if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+		    uid_eq(cred->euid, tcred->uid) ||
+		    uid_eq(cred->euid, tcred->suid))
+			ret = 0;
+		else
 			ret = -EACCES;
+
 		put_cred(tcred);
+		return ret;
 	}
 
-	return ret;
+	/* find the source cgroup */
+	spin_lock_irq(&css_set_lock);
+	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+	spin_unlock_irq(&css_set_lock);
+
+	/* and the common ancestor */
+	com_cgrp = src_cgrp;
+	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+		com_cgrp = cgroup_parent(com_cgrp);
+
+	/* %current should be authorized to migrate to the common ancestor */
+	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+	if (!inode)
+		return -ENOMEM;
+
+	ret = inode_permission(inode, MAY_WRITE);
+	iput(inode);
+	if (ret)
+		return ret;
+
+	return 0;
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 5136f6365ce3eace5a926e10f16ed2a233db5ba9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 27 Jun 2017 14:30:28 -0400
Subject: cgroup: implement "nsdelegate" mount option

Currently, cgroup only supports delegation to !root users and cgroup
namespaces don't get any special treatments.  This limits the
usefulness of cgroup namespaces as they by themselves can't be safe
delegation boundaries.  A process inside a cgroup can change the
resource control knobs of the parent in the namespace root and may
move processes in and out of the namespace if cgroups outside its
namespace are visible somehow.

This patch adds a new mount option "nsdelegate" which makes cgroup
namespaces delegation boundaries.  If set, cgroup behaves as if write
permission based delegation took place at namespace boundaries -
writes to the resource control knobs from the namespace root are
denied and migrations crossing the namespace boundary aren't allowed
from inside the namespace.

This allows cgroup namespace to function as a delegation boundary by
itself.

v2: Silently ignore nsdelegate specified on !init mounts.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Aravind Anbudurai <aru7@fb.com>
Cc: Serge Hallyn <serge@hallyn.com>
Cc: Eric Biederman <ebiederm@xmission.com>
---
 Documentation/cgroup-v2.txt | 61 +++++++++++++++++++++----------
 include/linux/cgroup-defs.h |  9 +++++
 kernel/cgroup/cgroup.c      | 88 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 135 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 0260ed053efd..558c3a739baf 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 disabling controllers in v1 and make them always available in v2.
 
+cgroup v2 currently supports the following mount options.
+
+  nsdelegate
+
+	Consider cgroup namespaces as delegation boundaries.  This
+	option is system wide and can only be set on mount or modified
+	through remount from the init namespace.  The mount option is
+	ignored on non-init namespace mounts.  Please refer to the
+	Delegation section for details.
+
 
 2-2. Organizing Processes
 
@@ -308,19 +318,27 @@ file.
 
 2-5-1. Model of Delegation
 
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" and
-"cgroup.subtree_control" files to the user.  Note that resource
-control interface files in a given directory control the distribution
-of the parent's resources and thus must not be delegated along with
-the directory.
-
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it received from the parent.  The limits and other settings of all
-resource controllers are hierarchical and regardless of what happens
-in the delegated sub-hierarchy, nothing can escape the resource
-restrictions imposed by the parent.
+A cgroup can be delegated in two ways.  First, to a less privileged
+user by granting write access of the directory and its "cgroup.procs"
+and "cgroup.subtree_control" files to the user.  Second, if the
+"nsdelegate" mount option is set, automatically to a cgroup namespace
+on namespace creation.
+
+Because the resource control interface files in a given directory
+control the distribution of the parent's resources, the delegatee
+shouldn't be allowed to write to them.  For the first method, this is
+achieved by not granting access to these files.  For the second, the
+kernel rejects writes to all files other than "cgroup.procs" and
+"cgroup.subtree_control" on a namespace root from inside the
+namespace.
+
+The end results are equivalent for both delegation types.  Once
+delegated, the user can build sub-hierarchy under the directory,
+organize processes inside it as it sees fit and further distribute the
+resources it received from the parent.  The limits and other settings
+of all resource controllers are hierarchical and regardless of what
+happens in the delegated sub-hierarchy, nothing can escape the
+resource restrictions imposed by the parent.
 
 Currently, cgroup doesn't impose any restrictions on the number of
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -330,10 +348,12 @@ this may be limited explicitly in the future.
 2-5-2. Delegation Containment
 
 A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.  For
-a process with a non-root euid to migrate a target process into a
-cgroup by writing its PID to the "cgroup.procs" file, the following
-conditions must be met.
+can't be moved into or out of the sub-hierarchy by the delegatee.
+
+For delegations to a less privileged user, this is achieved by
+requiring the following conditions for a process with a non-root euid
+to migrate a target process into a cgroup by writing its PID to the
+"cgroup.procs" file.
 
 - The writer must have write access to the "cgroup.procs" file.
 
@@ -360,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
 not have write access to its "cgroup.procs" files and thus the write
 will be denied with -EACCES.
 
+For delegations to namespaces, containment is achieved by requiring
+that both the source and destination cgroups are reachable from the
+namespace of the process which is attempting the migration.  If either
+is not reachable, the migration is rejected with -ENOENT.
+
 
 2-6. Guidelines
 
@@ -1414,7 +1439,7 @@ D. Deprecated v1 Core Features
 
 - Multiple hierarchies including named ones are not supported.
 
-- All mount options and remounting are not supported.
+- None of the v1 mount options are supported.
 
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3bc4196bf217..09f4c7df1478 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -67,12 +67,21 @@ enum {
 enum {
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+
+	/*
+	 * Consider namespaces as delegation boundaries.  If this flag is
+	 * set, controller specific interface files in a namespace root
+	 * aren't writeable from inside the namespace.
+	 */
+	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
 };
 
 /* cftype->flags */
 enum {
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
+	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */
+
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d48069ee84c2..620794a20a33 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 }
 
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+	char *token;
+
+	*root_flags = 0;
+
+	if (!data)
+		return 0;
+
+	while ((token = strsep(&data, ",")) != NULL) {
+		if (!strcmp(token, "nsdelegate")) {
+			*root_flags |= CGRP_ROOT_NS_DELEGATE;
+			continue;
+		}
+
+		pr_err("cgroup2: unknown option \"%s\"\n", token);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+		if (root_flags & CGRP_ROOT_NS_DELEGATE)
+			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+	}
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+		seq_puts(seq, ",nsdelegate");
+	return 0;
+}
+
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
-	pr_err("remount is not allowed\n");
-	return -EINVAL;
+	unsigned int root_flags;
+	int ret;
+
+	ret = parse_cgroup_root_flags(data, &root_flags);
+	if (ret)
+		return ret;
+
+	apply_cgroup_root_flags(root_flags);
+	return 0;
 }
 
 /*
@@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct dentry *dentry;
+	int ret;
 
 	get_cgroup_ns(ns);
 
@@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cgroup_enable_task_cg_lists();
 
 	if (fs_type == &cgroup2_fs_type) {
-		if (data) {
-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+		unsigned int root_flags;
+
+		ret = parse_cgroup_root_flags(data, &root_flags);
+		if (ret) {
 			put_cgroup_ns(ns);
-			return ERR_PTR(-EINVAL);
+			return ERR_PTR(ret);
 		}
+
 		cgrp_dfl_visible = true;
 		cgroup_get_live(&cgrp_dfl_root.cgrp);
 
 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
 					 CGROUP2_SUPER_MAGIC, ns);
+		if (!IS_ERR(dentry))
+			apply_cgroup_root_flags(root_flags);
 	} else {
 		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
 				       CGROUP_SUPER_MAGIC, ns);
@@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct kernfs_open_file *of)
 {
 	struct super_block *sb = of->file->f_path.dentry->d_sb;
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
 	struct cgroup *src_cgrp, *com_cgrp;
 	struct inode *inode;
 	int ret;
@@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 	if (ret)
 		return ret;
 
+	/*
+	 * If namespaces are delegation boundaries, %current must be able
+	 * to see both source and destination cgroups from its namespace.
+	 */
+	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+	    (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+	     !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+		return -ENOENT;
+
 	return 0;
 }
 
@@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cftype *cft = of->kn->priv;
 	struct cgroup_subsys_state *css;
 	int ret;
 
+	/*
+	 * If namespaces are delegation boundaries, disallow writes to
+	 * files in a non-init namespace root from inside the namespace
+	 * except for the files explicitly marked delegatable -
+	 * cgroup.procs and cgroup.subtree_control.
+	 */
+	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+		return -EPERM;
+
 	if (cft->write)
 		return cft->write(of, buf, nbytes, off);
 
@@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 static struct cftype cgroup_base_files[] = {
 	{
 		.name = "cgroup.procs",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.file_offset = offsetof(struct cgroup, procs_file),
 		.release = cgroup_procs_release,
 		.seq_start = cgroup_procs_start,
@@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cgroup.subtree_control",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.seq_show = cgroup_subtree_control_show,
 		.write = cgroup_subtree_control_write,
 	},
@@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
 }
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.show_options		= cgroup_show_options,
 	.remount_fs		= cgroup_remount,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
-- 
cgit v1.3-14-g43fede


From 2287d8664fe7345ead891017eccd879fc605305e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 27 Jun 2017 18:15:38 +0200
Subject: timers: Make the cpu base lock raw

The timers cpu base lock could not be converted to a raw spinlock because
the lock held time was non-deterministic due to cascading and long lasting
timer wheel traversals.

The rework of the timer wheel to the new non-cascading model also removed
the wheel traversals, so the lock held times are now deterministic. This
allows the lock to be made raw, which unbreaks NOHZ* on preempt-RT.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: http://lkml.kernel.org/r/20170627161538.30257-1-bigeasy@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 709a404bd133..71ce3f4eead3 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -195,7 +195,7 @@ EXPORT_SYMBOL(jiffies_64);
 #endif
 
 struct timer_base {
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	struct timer_list	*running_timer;
 	unsigned long		clk;
 	unsigned long		next_expiry;
@@ -913,10 +913,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 
 		if (!(tf & TIMER_MIGRATING)) {
 			base = get_timer_base(tf);
-			spin_lock_irqsave(&base->lock, *flags);
+			raw_spin_lock_irqsave(&base->lock, *flags);
 			if (timer->flags == tf)
 				return base;
-			spin_unlock_irqrestore(&base->lock, *flags);
+			raw_spin_unlock_irqrestore(&base->lock, *flags);
 		}
 		cpu_relax();
 	}
@@ -986,9 +986,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			/* See the comment in lock_timer_base() */
 			timer->flags |= TIMER_MIGRATING;
 
-			spin_unlock(&base->lock);
+			raw_spin_unlock(&base->lock);
 			base = new_base;
-			spin_lock(&base->lock);
+			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
 		}
@@ -1013,7 +1013,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&base->lock, flags);
+	raw_spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
 }
@@ -1106,16 +1106,16 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	if (base != new_base) {
 		timer->flags |= TIMER_MIGRATING;
 
-		spin_unlock(&base->lock);
+		raw_spin_unlock(&base->lock);
 		base = new_base;
-		spin_lock(&base->lock);
+		raw_spin_lock(&base->lock);
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
-	spin_unlock_irqrestore(&base->lock, flags);
+	raw_spin_unlock_irqrestore(&base->lock, flags);
 }
 EXPORT_SYMBOL_GPL(add_timer_on);
 
@@ -1141,7 +1141,7 @@ int del_timer(struct timer_list *timer)
 	if (timer_pending(timer)) {
 		base = lock_timer_base(timer, &flags);
 		ret = detach_if_pending(timer, base, true);
-		spin_unlock_irqrestore(&base->lock, flags);
+		raw_spin_unlock_irqrestore(&base->lock, flags);
 	}
 
 	return ret;
@@ -1168,7 +1168,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
 	if (base->running_timer != timer)
 		ret = detach_if_pending(timer, base, true);
 
-	spin_unlock_irqrestore(&base->lock, flags);
+	raw_spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
 }
@@ -1299,13 +1299,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		data = timer->data;
 
 		if (timer->flags & TIMER_IRQSAFE) {
-			spin_unlock(&base->lock);
+			raw_spin_unlock(&base->lock);
 			call_timer_fn(timer, fn, data);
-			spin_lock(&base->lock);
+			raw_spin_lock(&base->lock);
 		} else {
-			spin_unlock_irq(&base->lock);
+			raw_spin_unlock_irq(&base->lock);
 			call_timer_fn(timer, fn, data);
-			spin_lock_irq(&base->lock);
+			raw_spin_lock_irq(&base->lock);
 		}
 	}
 }
@@ -1474,7 +1474,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	if (cpu_is_offline(smp_processor_id()))
 		return expires;
 
-	spin_lock(&base->lock);
+	raw_spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
 	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
@@ -1502,7 +1502,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if ((expires - basem) > TICK_NSEC)
 			base->is_idle = true;
 	}
-	spin_unlock(&base->lock);
+	raw_spin_unlock(&base->lock);
 
 	return cmp_next_hrtimer_event(basem, expires);
 }
@@ -1590,7 +1590,7 @@ static inline void __run_timers(struct timer_base *base)
 	if (!time_after_eq(jiffies, base->clk))
 		return;
 
-	spin_lock_irq(&base->lock);
+	raw_spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->clk)) {
 
@@ -1601,7 +1601,7 @@ static inline void __run_timers(struct timer_base *base)
 			expire_timers(base, heads + levels);
 	}
 	base->running_timer = NULL;
-	spin_unlock_irq(&base->lock);
+	raw_spin_unlock_irq(&base->lock);
 }
 
 /*
@@ -1786,16 +1786,16 @@ int timers_dead_cpu(unsigned int cpu)
 		 * The caller is globally serialized and nobody else
 		 * takes two locks at once, deadlock is not possible.
 		 */
-		spin_lock_irq(&new_base->lock);
-		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+		raw_spin_lock_irq(&new_base->lock);
+		raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 		BUG_ON(old_base->running_timer);
 
 		for (i = 0; i < WHEEL_SIZE; i++)
 			migrate_timer_list(new_base, old_base->vectors + i);
 
-		spin_unlock(&old_base->lock);
-		spin_unlock_irq(&new_base->lock);
+		raw_spin_unlock(&old_base->lock);
+		raw_spin_unlock_irq(&new_base->lock);
 		put_cpu_ptr(&timer_bases);
 	}
 	return 0;
@@ -1811,7 +1811,7 @@ static void __init init_timer_cpu(int cpu)
 	for (i = 0; i < NR_BASES; i++) {
 		base = per_cpu_ptr(&timer_bases[i], cpu);
 		base->cpu = cpu;
-		spin_lock_init(&base->lock);
+		raw_spin_lock_init(&base->lock);
 		base->clk = jiffies;
 	}
 }
-- 
cgit v1.3-14-g43fede


From ff801b716effd652f420204eddb36f6e4a716819 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 08:25:52 +0200
Subject: sched/numa: Hide numa_wake_affine() from UP build

Stephen reported the following build warning in UP:

kernel/sched/fair.c:2657:9: warning: 'struct sched_domain' declared inside
parameter list
         ^
/home/sfr/next/next/kernel/sched/fair.c:2657:9: warning: its scope is only this
definition or declaration, which is probably not what you want

Hide the numa_wake_affine() inline stub on UP builds to get rid of it.

Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6f4f155adf5f..008c514dc241 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2652,12 +2652,14 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
+#ifdef CONFIG_SMP
 static inline bool numa_wake_affine(struct sched_domain *sd,
 				    struct task_struct *p, int this_cpu,
 				    int prev_cpu, int sync)
 {
 	return true;
 }
+#endif /* !SMP */
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
-- 
cgit v1.3-14-g43fede


From 96b5b19459b3c2aed2872bac42cbe19edfae710f Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 28 Jun 2017 18:32:31 -0700
Subject: module: make the modinfo name const

This can be accomplished by making blacklisted() also accept const.

Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
[jeyu: fix typo]
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index afc6ede7bcdf..d07287707557 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -302,7 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL(unregister_module_notifier);
 
 struct load_info {
-	char *name;
+	const char *name;
 	Elf_Ehdr *hdr;
 	unsigned long len;
 	Elf_Shdr *sechdrs;
@@ -3265,7 +3265,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
 
 /* module_blacklist is a comma-separated list of module names */
 static char *module_blacklist;
-static bool blacklisted(char *module_name)
+static bool blacklisted(const char *module_name)
 {
 	const char *p;
 	size_t len;
-- 
cgit v1.3-14-g43fede


From 14dc6f04f49dc12614d7e90928b495b8d73cd471 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 27 Jun 2017 23:08:34 -0700
Subject: bpf: Add syscall lookup support for fd array and htab

This patch allows userspace to do BPF_MAP_LOOKUP_ELEM on
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_ARRAY_OF_MAPS and
BPF_MAP_TYPE_HASH_OF_MAPS.

The lookup returns a prog-id or map-id to the userspace.
The userspace can then use the BPF_PROG_GET_FD_BY_ID
or BPF_MAP_GET_FD_BY_ID to get a fd.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h     |  3 +++
 kernel/bpf/arraymap.c   | 27 +++++++++++++++++++++++++++
 kernel/bpf/hashtab.c    | 21 +++++++++++++++++++++
 kernel/bpf/map_in_map.c |  5 +++++
 kernel/bpf/map_in_map.h |  1 +
 kernel/bpf/syscall.c    | 16 +++++++++++++---
 6 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index deca4e7f2845..5175729270d7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -36,6 +36,7 @@ struct bpf_map_ops {
 				int fd);
 	void (*map_fd_put_ptr)(void *ptr);
 	u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
+	u32 (*map_fd_sys_lookup_elem)(void *ptr);
 };
 
 struct bpf_map {
@@ -288,9 +289,11 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
 
 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 				 void *key, void *value, u64 map_flags);
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ecb43542246e..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -334,6 +334,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 	return NULL;
 }
 
+/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+	void **elem, *ptr;
+	int ret =  0;
+
+	if (!map->ops->map_fd_sys_lookup_elem)
+		return -ENOTSUPP;
+
+	rcu_read_lock();
+	elem = array_map_lookup_elem(map, key);
+	if (elem && (ptr = READ_ONCE(*elem)))
+		*value = map->ops->map_fd_sys_lookup_elem(ptr);
+	else
+		ret = -ENOENT;
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /* only called from syscall */
 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 				 void *key, void *value, u64 map_flags)
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
 	bpf_prog_put(ptr);
 }
 
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+	return ((struct bpf_prog *)ptr)->aux->id;
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
 void bpf_fd_array_map_clear(struct bpf_map *map)
 {
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
+	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -585,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
+	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1243,6 +1243,26 @@ static void fd_htab_map_free(struct bpf_map *map)
 	htab_map_free(map);
 }
 
+/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+	void **ptr;
+	int ret = 0;
+
+	if (!map->ops->map_fd_sys_lookup_elem)
+		return -ENOTSUPP;
+
+	rcu_read_lock();
+	ptr = htab_map_lookup_elem(map, key);
+	if (ptr)
+		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+	else
+		ret = -ENOENT;
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /* only called from syscall */
 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags)
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
+	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 };
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
 	 */
 	bpf_map_put(ptr);
 }
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+	return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
 void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8942c820d620..4409ccca8831 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -24,6 +24,13 @@
 #include <linux/kernel.h>
 #include <linux/idr.h>
 
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -411,6 +418,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else if (IS_FD_MAP(map))
+		value_size = sizeof(u32);
 	else
 		value_size = map->value_size;
 
@@ -426,9 +435,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_array_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
-		   map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		err = -ENOTSUPP;
+	} else if (IS_FD_ARRAY(map)) {
+		err = bpf_fd_array_map_lookup_elem(map, key, value);
+	} else if (IS_FD_HASH(map)) {
+		err = bpf_fd_htab_map_lookup_elem(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
-- 
cgit v1.3-14-g43fede


From 8007e40a24e12d35189203370268c7278f29ab74 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 28 Jun 2017 10:41:24 -0700
Subject: bpf: Fix out-of-bound access on interpreters[]

The index is off-by-one when fp->aux->stack_depth
has already been rounded up to 32.  In particular,
if stack_depth is 512, the index will be 16.

The fix is to round_up and then subtract 1, instead of using round_down.

[   22.318680] ==================================================================
[   22.319745] BUG: KASAN: global-out-of-bounds in bpf_prog_select_runtime+0x48a/0x670
[   22.320737] Read of size 8 at addr ffffffff82aadae0 by task sockex3/1946
[   22.321646]
[   22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: G        W       4.12.0-rc6-01680-g2ee87db3a287 #22
[   22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[   22.324260] Call Trace:
[   22.324612]  dump_stack+0x67/0x99
[   22.325081]  print_address_description+0x1e8/0x290
[   22.325734]  ? bpf_prog_select_runtime+0x48a/0x670
[   22.326360]  kasan_report+0x265/0x350
[   22.326860]  __asan_report_load8_noabort+0x19/0x20
[   22.327484]  bpf_prog_select_runtime+0x48a/0x670
[   22.328109]  bpf_prog_load+0x626/0xd40
[   22.328637]  ? __bpf_prog_charge+0xc0/0xc0
[   22.329222]  ? check_nnp_nosuid.isra.61+0x100/0x100
[   22.329890]  ? __might_fault+0xf6/0x1b0
[   22.330446]  ? lock_acquire+0x360/0x360
[   22.331013]  SyS_bpf+0x67c/0x24d0
[   22.331491]  ? trace_hardirqs_on+0xd/0x10
[   22.332049]  ? __getnstimeofday64+0xaf/0x1c0
[   22.332635]  ? bpf_prog_get+0x20/0x20
[   22.333135]  ? __audit_syscall_entry+0x300/0x600
[   22.333770]  ? syscall_trace_enter+0x540/0xdd0
[   22.334339]  ? exit_to_usermode_loop+0xe0/0xe0
[   22.334950]  ? do_syscall_64+0x48/0x410
[   22.335446]  ? bpf_prog_get+0x20/0x20
[   22.335954]  do_syscall_64+0x181/0x410
[   22.336454]  entry_SYSCALL64_slow_path+0x25/0x25
[   22.337121] RIP: 0033:0x7f263fe81f19
[   22.337618] RSP: 002b:00007ffd9a3440c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   22.338619] RAX: ffffffffffffffda RBX: 0000000000aac5fb RCX: 00007f263fe81f19
[   22.339600] RDX: 0000000000000030 RSI: 00007ffd9a3440d0 RDI: 0000000000000005
[   22.340470] RBP: 0000000000a9a1e0 R08: 0000000000a9a1e0 R09: 0000009d00000001
[   22.341430] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000010000
[   22.342411] R13: 0000000000a9a023 R14: 0000000000000001 R15: 0000000000000003
[   22.343369]
[   22.343593] The buggy address belongs to the variable:
[   22.344241]  interpreters+0x80/0x980
[   22.344708]
[   22.344908] Memory state around the buggy address:
[   22.345556]  ffffffff82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa fa
[   22.346449]  ffffffff82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 00
[   22.347361] >ffffffff82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa
[   22.348301]                                                        ^
[   22.349142]  ffffffff82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00
[   22.350058]  ffffffff82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa fa
[   22.350984] ==================================================================

Fixes: b870aa901f4b ("bpf: use different interpreter depending on required stack size")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 774069ca18a7..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-	fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];
+	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
-- 
cgit v1.3-14-g43fede


From 59494fe2c89e46e85d83de9bc45dd1d528955c49 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Thu, 29 Jun 2017 16:58:40 +0530
Subject: PM: hibernate: constify attribute_group structures.

attribute_groups are not supposed to change at runtime. All functions
working with attribute_groups provided by <linux/sysfs.h> work with const
attribute_group. So mark the non-const structs as const.

File size before:
   text	   data	    bss	    dec	    hex	filename
   6332	    488	    308	   7128	   1bd8	kernel/power/hibernate.o

File size After adding 'const':
   text	   data	    bss	    dec	    hex	filename
   6396	    424	    308	   7128	   1bd8	kernel/power/hibernate.o

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a8b978c35a6a..e1914c7b85b1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1108,7 +1108,7 @@ static struct attribute * g[] = {
 };
 
 
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
 	.attrs = g,
 };
 
-- 
cgit v1.3-14-g43fede


From a9bd8dfa539493db265e46a496c1a89279ab31d1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 May 2017 18:39:01 -0400
Subject: kimage_file_prepare_segments(): don't open-code memdup_user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/kexec_file.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..766e7e4d3ad9 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -162,16 +162,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 	}
 
 	if (cmdline_len) {
-		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
-		if (!image->cmdline_buf) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
-				     cmdline_len);
-		if (ret) {
-			ret = -EFAULT;
+		image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+		if (IS_ERR(image->cmdline_buf)) {
+			ret = PTR_ERR(image->cmdline_buf);
+			image->cmdline_buf = NULL;
 			goto out;
 		}
 
-- 
cgit v1.3-14-g43fede


From e4448ed87ccdbacb74871736f63220642242b32f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 May 2017 18:43:00 -0400
Subject: bpf: don't open-code memdup_user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/syscall.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fd2411fd6914..4b8b10bddfde 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -322,14 +322,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -402,14 +399,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +482,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
@@ -507,7 +498,6 @@ static int map_delete_elem(union bpf_attr *attr)
 
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
-free_key:
 	kfree(key);
 err_put:
 	fdput(f);
@@ -536,14 +526,11 @@ static int map_get_next_key(union bpf_attr *attr)
 		return PTR_ERR(map);
 
 	if (ukey) {
-		err = -ENOMEM;
-		key = kmalloc(map->key_size, GFP_USER);
-		if (!key)
+		key = memdup_user(ukey, map->key_size);
+		if (IS_ERR(key)) {
+			err = PTR_ERR(key);
 			goto err_put;
-
-		err = -EFAULT;
-		if (copy_from_user(key, ukey, map->key_size) != 0)
-			goto free_key;
+		}
 	} else {
 		key = NULL;
 	}
-- 
cgit v1.3-14-g43fede


From 48365b38849fdb1ee6dc65beac044ca59f669683 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Mon, 26 Jun 2017 17:07:14 +0200
Subject: sched/debug: Expose the number of RT/DL tasks that can migrate

Add the values of rt_rq.rt_nr_migratory and dl_rq.dl_nr_migratory
to the sched_debug output, for instance:

 rt_rq[0]:
   .rt_nr_running                 : 2
   .rt_nr_migratory               : 1     <--- Like this
   .rt_throttled                  : 0
   .rt_time                       : 828.645877
   .rt_runtime                    : 1000.000000

This is useful to debug problems related to the RT/DL schedulers.

This also fixes the format specifiers of some variables that are
unsigned but were printed as signed.

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-rt-users <linux-rt-users@vger.kernel.org>
Link: http://lkml.kernel.org/r/7896f71cada54ee7dd8507bb666063a2e051c3d4.1498482127.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 38f019324f1a..4fa66de52bd6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
+#define PU(x) \
+	SEQ_printf(m, "  .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x))
 #define PN(x) \
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
 
-	P(rt_nr_running);
+	PU(rt_nr_running);
+#ifdef CONFIG_SMP
+	PU(rt_nr_migratory);
+#endif
 	P(rt_throttled);
 	PN(rt_time);
 	PN(rt_runtime);
 
 #undef PN
+#undef PU
 #undef P
 }
 
@@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 	struct dl_bw *dl_bw;
 
 	SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
-	SEQ_printf(m, "  .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+
+#define PU(x) \
+	SEQ_printf(m, "  .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
+
+	PU(dl_nr_running);
 #ifdef CONFIG_SMP
+	PU(dl_nr_migratory);
 	dl_bw = &cpu_rq(cpu)->rd->dl_bw;
 #else
 	dl_bw = &dl_rq->dl_bw;
 #endif
 	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
 	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
+
+#undef PU
 }
 
 extern __read_mostly int sched_clock_running;
-- 
cgit v1.3-14-g43fede


From 993647a293814dd47ae41d38657fda6e4ab04e33 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Thu, 29 Jun 2017 17:40:47 +0530
Subject: cpu/hotplug: Constify attribute_group structures

attribute_groups are not supposed to change at runtime. All functions
working with attribute_groups provided by <linux/sysfs.h> work with const
attribute_group.

So mark the non-const structs as const:

File size before:
   text	   data	    bss	    dec	    hex	filename
  12582	  15361	     20	  27963	   6d3b	kernel/cpu.o

File size after adding 'const':
   text	   data	    bss	    dec	    hex	filename
  12710	  15265	     20	  27995	   6d5b	kernel/cpu.o

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: anna-maria@linutronix.de
Cc: bigeasy@linutronix.de
Cc: boris.ostrovsky@oracle.com
Cc: rcochran@linutronix.de
Link: http://lkml.kernel.org/r/f9079e94e12b36d245e7adbf67d312bc5d0250c6.1498737970.git.arvind.yadav.cs@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index d0f5f54aa087..b69c0588f8c9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1629,7 +1629,7 @@ static struct attribute *cpuhp_cpu_attrs[] = {
 	NULL
 };
 
-static struct attribute_group cpuhp_cpu_attr_group = {
+static const struct attribute_group cpuhp_cpu_attr_group = {
 	.attrs = cpuhp_cpu_attrs,
 	.name = "hotplug",
 	NULL
@@ -1661,7 +1661,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = {
 	NULL
 };
 
-static struct attribute_group cpuhp_cpu_root_attr_group = {
+static const struct attribute_group cpuhp_cpu_root_attr_group = {
 	.attrs = cpuhp_cpu_root_attrs,
 	.name = "hotplug",
 	NULL
-- 
cgit v1.3-14-g43fede


From 72298e5c92c50edd8cb7cfda4519483ce65fa166 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Thu, 29 Jun 2017 13:41:28 -0500
Subject: sched/cputime: Refactor the cputime_adjust() code

Address a Coverity false positive, which is caused by overly
convoluted code:

Value assigned to variable 'utime' at line 619:utime = rtime;
is overwritten at line 642:utime = rtime - stime; before it
can be used. This makes such variable assignment useless.

Remove this variable assignment and refactor the related code.

Addresses-Coverity-ID: 1371643
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Cc: Frans Klaver <fransklaver@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/20170629184128.GA5271@embeddedgus
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aea3135c5d90..67c70e287647 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -615,19 +615,13 @@ static void cputime_adjust(struct task_cputime *curr,
 	 * userspace. Once a task gets some ticks, the monotonicy code at
 	 * 'update' will ensure things converge to the observed ratio.
 	 */
-	if (stime == 0) {
-		utime = rtime;
-		goto update;
+	if (stime != 0) {
+		if (utime == 0)
+			stime = rtime;
+		else
+			stime = scale_stime(stime, rtime, stime + utime);
 	}
 
-	if (utime == 0) {
-		stime = rtime;
-		goto update;
-	}
-
-	stime = scale_stime(stime, rtime, stime + utime);
-
-update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
-- 
cgit v1.3-14-g43fede


From 5c4994102fb508d4a0f7a8afa46560c314c1ebd4 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:05 -0700
Subject: posix-timers: Use get_timespec64() and put_timespec64()

Using these APIs and their compat versions makes
the syscalls clock_gettime, clock_settime and clock_getres,
and their compat implementations, simpler.

This is a preparatory patch to isolate data conversions to
struct timespec64 at userspace boundaries. This helps contain
the changes needed to transition to new y2038 safe types.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/time/posix-stubs.c  | 83 +++++++++++++++++++++++++---------------------
 kernel/time/posix-timers.c | 63 ++++++++++++++---------------------
 2 files changed, 70 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 65878221cbfb..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -51,40 +51,52 @@ SYS_NI(alarm);
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
-	struct timespec64 new_tp64;
-	struct timespec new_tp;
+	struct timespec64 new_tp;
 
 	if (which_clock != CLOCK_REALTIME)
 		return -EINVAL;
-	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+	if (get_timespec64(&new_tp, tp))
 		return -EFAULT;
 
-	new_tp64 = timespec_to_timespec64(new_tp);
-	return do_sys_settimeofday64(&new_tp64, NULL);
+	return do_sys_settimeofday64(&new_tp, NULL);
 }
 
-SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-		struct timespec __user *,tp)
+int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
 {
-	struct timespec64 kernel_tp64;
-	struct timespec kernel_tp;
-
 	switch (which_clock) {
-	case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
-	case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
-	case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
-	default: return -EINVAL;
+	case CLOCK_REALTIME:
+		ktime_get_real_ts64(tp);
+		break;
+	case CLOCK_MONOTONIC:
+		ktime_get_ts64(tp);
+		break;
+	case CLOCK_BOOTTIME:
+		get_monotonic_boottime64(tp);
+		break;
+	default:
+		return -EINVAL;
 	}
 
-	kernel_tp = timespec64_to_timespec(kernel_tp64);
-	if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+	return 0;
+}
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+		struct timespec __user *, tp)
+{
+	int ret;
+	struct timespec64 kernel_tp;
+
+	ret = do_clock_gettime(which_clock, &kernel_tp);
+	if (ret)
+		return ret;
+
+	if (put_timespec64(&kernel_tp, tp))
 		return -EFAULT;
 	return 0;
 }
 
 SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
 {
-	struct timespec rtn_tp = {
+	struct timespec64 rtn_tp = {
 		.tv_sec = 0,
 		.tv_nsec = hrtimer_resolution,
 	};
@@ -93,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
 	case CLOCK_REALTIME:
 	case CLOCK_MONOTONIC:
 	case CLOCK_BOOTTIME:
-		if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
+		if (put_timespec64(&rtn_tp, tp))
 			return -EFAULT;
 		return 0;
 	default:
@@ -142,41 +154,35 @@ COMPAT_SYS_NI(setitimer);
 COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
-	struct timespec64 new_tp64;
-	struct timespec new_tp;
+	struct timespec64 new_tp;
 
 	if (which_clock != CLOCK_REALTIME)
 		return -EINVAL;
-	if (compat_get_timespec(&new_tp, tp))
+	if (compat_get_timespec64(&new_tp, tp))
 		return -EFAULT;
 
-	new_tp64 = timespec_to_timespec64(new_tp);
-	return do_sys_settimeofday64(&new_tp64, NULL);
+	return do_sys_settimeofday64(&new_tp, NULL);
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-		       struct compat_timespec __user *,tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+		       struct compat_timespec __user *, tp)
 {
-	struct timespec64 kernel_tp64;
-	struct timespec kernel_tp;
+	int ret;
+	struct timespec64 kernel_tp;
 
-	switch (which_clock) {
-	case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
-	case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
-	case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
-	default: return -EINVAL;
-	}
+	ret = do_clock_gettime(which_clock, &kernel_tp);
+	if (ret)
+		return ret;
 
-	kernel_tp = timespec64_to_timespec(kernel_tp64);
-	if (compat_put_timespec(&kernel_tp, tp))
+	if (compat_put_timespec64(&kernel_tp, tp))
 		return -EFAULT;
 	return 0;
 }
 
-COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
-	struct timespec rtn_tp = {
+	struct timespec64 rtn_tp = {
 		.tv_sec = 0,
 		.tv_nsec = hrtimer_resolution,
 	};
@@ -185,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 	case CLOCK_REALTIME:
 	case CLOCK_MONOTONIC:
 	case CLOCK_BOOTTIME:
-		if (compat_put_timespec(&rtn_tp, tp))
+		if (compat_put_timespec64(&rtn_tp, tp))
 			return -EFAULT;
 		return 0;
 	default:
 		return -EINVAL;
 	}
 }
+
 COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		       struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 82d67be7d9d1..39322ae5dd87 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1049,34 +1049,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 new_tp64;
-	struct timespec new_tp;
+	struct timespec64 new_tp;
 
 	if (!kc || !kc->clock_set)
 		return -EINVAL;
 
-	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+	if (get_timespec64(&new_tp, tp))
 		return -EFAULT;
-	new_tp64 = timespec_to_timespec64(new_tp);
 
-	return kc->clock_set(which_clock, &new_tp64);
+	return kc->clock_set(which_clock, &new_tp);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 		struct timespec __user *,tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 kernel_tp64;
-	struct timespec kernel_tp;
+	struct timespec64 kernel_tp;
 	int error;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_get(which_clock, &kernel_tp64);
-	kernel_tp = timespec64_to_timespec(kernel_tp64);
+	error = kc->clock_get(which_clock, &kernel_tp);
 
-	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+	if (!error && put_timespec64(&kernel_tp, tp))
 		error = -EFAULT;
 
 	return error;
@@ -1109,17 +1105,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 rtn_tp64;
-	struct timespec rtn_tp;
+	struct timespec64 rtn_tp;
 	int error;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_getres(which_clock, &rtn_tp64);
-	rtn_tp = timespec64_to_timespec(rtn_tp64);
+	error = kc->clock_getres(which_clock, &rtn_tp);
 
-	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+	if (!error && tp && put_timespec64(&rtn_tp, tp))
 		error = -EFAULT;
 
 	return error;
@@ -1131,38 +1125,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 new_tp64;
-	struct timespec new_tp;
+	struct timespec64 ts;
 
 	if (!kc || !kc->clock_set)
 		return -EINVAL;
 
-	if (compat_get_timespec(&new_tp, tp))
+	if (compat_get_timespec64(&ts, tp))
 		return -EFAULT;
 
-	new_tp64 = timespec_to_timespec64(new_tp);
-
-	return kc->clock_set(which_clock, &new_tp64);
+	return kc->clock_set(which_clock, &ts);
 }
 
 COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 kernel_tp64;
-	struct timespec kernel_tp;
-	int error;
+	struct timespec64 ts;
+	int err;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_get(which_clock, &kernel_tp64);
-	kernel_tp = timespec64_to_timespec(kernel_tp64);
+	err = kc->clock_get(which_clock, &ts);
 
-	if (!error && compat_put_timespec(&kernel_tp, tp))
-		error = -EFAULT;
+	if (!err && compat_put_timespec64(&ts, tp))
+		err = -EFAULT;
 
-	return error;
+	return err;
 }
 
 COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
@@ -1193,21 +1182,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
 		       struct compat_timespec __user *, tp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 rtn_tp64;
-	struct timespec rtn_tp;
-	int error;
+	struct timespec64 ts;
+	int err;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_getres(which_clock, &rtn_tp64);
-	rtn_tp = timespec64_to_timespec(rtn_tp64);
-
-	if (!error && tp && compat_put_timespec(&rtn_tp, tp))
-		error = -EFAULT;
+	err = kc->clock_getres(which_clock, &ts);
+	if (!err && tp && compat_put_timespec64(&ts, tp))
+		return -EFAULT;
 
-	return error;
+	return err;
 }
+
 #endif
 
 /*
-- 
cgit v1.3-14-g43fede


From c0edd7c9acd0eaee149ab6cb4441cc71a1af87f0 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:06 -0700
Subject: nanosleep: Use get_timespec64() and put_timespec64()

Using these APIs and their compat versions makes
the clock_nanosleep and nanosleep syscalls, and
their compat implementations, simpler.

This is a preparatory patch to isolate data conversions to
struct timespec64 at userspace boundaries. This helps contain
the changes needed to transition to new y2038 safe types.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/hrtimer.h        |  2 +-
 kernel/time/alarmtimer.c       |  4 ++--
 kernel/time/hrtimer.c          | 30 +++++++++++++-----------------
 kernel/time/posix-cpu-timers.c |  8 ++------
 kernel/time/posix-timers.c     | 20 ++++++++------------
 5 files changed, 26 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 255edd5e7a74..012c37fdb688 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -453,7 +453,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 
 /* Precise sleep: */
 
-extern int nanosleep_copyout(struct restart_block *, struct timespec *);
+extern int nanosleep_copyout(struct restart_block *, struct timespec64 *);
 extern long hrtimer_nanosleep(const struct timespec64 *rqtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c991cf212c6d..0b8ff7d257ea 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
 		alarmtimer_freezerset(absexp, type);
 	restart = &current->restart_block;
 	if (restart->nanosleep.type != TT_NONE) {
-		struct timespec rmt;
+		struct timespec64 rmt;
 		ktime_t rem;
 
 		rem = ktime_sub(absexp, alarm_bases[type].gettime());
 
 		if (rem <= 0)
 			return 0;
-		rmt = ktime_to_timespec(rem);
+		rmt = ktime_to_timespec64(rem);
 
 		return nanosleep_copyout(restart, &rmt);
 	}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 81da124f1115..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 }
 EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
-int nanosleep_copyout(struct restart_block *restart, struct timespec *ts)
+int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
 {
 	switch(restart->nanosleep.type) {
 #ifdef CONFIG_COMPAT
 	case TT_COMPAT:
-		if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp))
+		if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
 			return -EFAULT;
 		break;
 #endif
 	case TT_NATIVE:
-		if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec)))
+		if (put_timespec64(ts, restart->nanosleep.rmtp))
 			return -EFAULT;
 		break;
 	default:
@@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	restart = &current->restart_block;
 	if (restart->nanosleep.type != TT_NONE) {
 		ktime_t rem = hrtimer_expires_remaining(&t->timer);
-		struct timespec rmt;
+		struct timespec64 rmt;
 
 		if (rem <= 0)
 			return 0;
-		rmt = ktime_to_timespec(rem);
+		rmt = ktime_to_timespec64(rem);
 
 		return nanosleep_copyout(restart, &rmt);
 	}
@@ -1546,19 +1546,17 @@ out:
 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
-	struct timespec64 tu64;
-	struct timespec tu;
+	struct timespec64 tu;
 
-	if (copy_from_user(&tu, rqtp, sizeof(tu)))
+	if (get_timespec64(&tu, rqtp))
 		return -EFAULT;
 
-	tu64 = timespec_to_timespec64(tu);
-	if (!timespec64_valid(&tu64))
+	if (!timespec64_valid(&tu))
 		return -EINVAL;
 
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
-	return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
 {
-	struct timespec64 tu64;
-	struct timespec tu;
+	struct timespec64 tu;
 
-	if (compat_get_timespec(&tu, rqtp))
+	if (compat_get_timespec64(&tu, rqtp))
 		return -EFAULT;
 
-	tu64 = timespec_to_timespec64(tu);
-	if (!timespec64_valid(&tu64))
+	if (!timespec64_valid(&tu))
 		return -EINVAL;
 
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
-	return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 #endif
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 9df618ee64cf..7323da5950cc 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1314,12 +1314,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 */
 		restart = &current->restart_block;
 		restart->nanosleep.expires = expires;
-		if (restart->nanosleep.type != TT_NONE) {
-			struct timespec ts;
-
-			ts = timespec64_to_timespec(it.it_value);
-			error = nanosleep_copyout(restart, &ts);
-		}
+		if (restart->nanosleep.type != TT_NONE)
+			error = nanosleep_copyout(restart, &it.it_value);
 	}
 
 	return error;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 39322ae5dd87..4b0fc3b0a1c4 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1213,26 +1213,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		struct timespec __user *, rmtp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 t64;
-	struct timespec t;
+	struct timespec64 t;
 
 	if (!kc)
 		return -EINVAL;
 	if (!kc->nsleep)
 		return -ENANOSLEEP_NOTSUP;
 
-	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+	if (get_timespec64(&t, rqtp))
 		return -EFAULT;
 
-	t64 = timespec_to_timespec64(t);
-	if (!timespec64_valid(&t64))
+	if (!timespec64_valid(&t))
 		return -EINVAL;
 	if (flags & TIMER_ABSTIME)
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
 
-	return kc->nsleep(which_clock, flags, &t64);
+	return kc->nsleep(which_clock, flags, &t);
 }
 
 #ifdef CONFIG_COMPAT
@@ -1241,26 +1239,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		       struct compat_timespec __user *, rmtp)
 {
 	const struct k_clock *kc = clockid_to_kclock(which_clock);
-	struct timespec64 t64;
-	struct timespec t;
+	struct timespec64 t;
 
 	if (!kc)
 		return -EINVAL;
 	if (!kc->nsleep)
 		return -ENANOSLEEP_NOTSUP;
 
-	if (compat_get_timespec(&t, rqtp))
+	if (compat_get_timespec64(&t, rqtp))
 		return -EFAULT;
 
-	t64 = timespec_to_timespec64(t);
-	if (!timespec64_valid(&t64))
+	if (!timespec64_valid(&t))
 		return -EINVAL;
 	if (flags & TIMER_ABSTIME)
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
 
-	return kc->nsleep(which_clock, flags, &t64);
+	return kc->nsleep(which_clock, flags, &t);
 }
 #endif
 
-- 
cgit v1.3-14-g43fede


From 725816e8aabb1c183baa2bc9572ab9a0d26b9ea1 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sat, 24 Jun 2017 11:45:08 -0700
Subject: posix_clocks: Use get_itimerspec64() and put_itimerspec64()

Using these APIs and their compat versions makes
the timer_settime and timer_gettime syscalls, and their
compat implementations, simpler.

This patch also serves as a preparatory patch for changing
syscalls to use new time_t data types to support the
y2038 effort by isolating the processing of user pointers
through these apis.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/time/posix-timers.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 4b0fc3b0a1c4..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id,  struct itimerspec64 *setting)
 SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		struct itimerspec __user *, setting)
 {
-	struct itimerspec64 cur_setting64;
+	struct itimerspec64 cur_setting;
 
-	int ret = do_timer_gettime(timer_id, &cur_setting64);
+	int ret = do_timer_gettime(timer_id, &cur_setting);
 	if (!ret) {
-		struct itimerspec cur_setting;
-		cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
-		if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+		if (put_itimerspec64(&cur_setting, setting))
 			ret = -EFAULT;
 	}
 	return ret;
@@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		       struct compat_itimerspec __user *, setting)
 {
-	struct itimerspec64 cur_setting64;
+	struct itimerspec64 cur_setting;
 
-	int ret = do_timer_gettime(timer_id, &cur_setting64);
+	int ret = do_timer_gettime(timer_id, &cur_setting);
 	if (!ret) {
-		struct itimerspec cur_setting;
-		cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
-		if (put_compat_itimerspec(setting, &cur_setting))
+		if (put_compat_itimerspec64(&cur_setting, setting))
 			ret = -EFAULT;
 	}
 	return ret;
@@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 		const struct itimerspec __user *, new_setting,
 		struct itimerspec __user *, old_setting)
 {
-	struct itimerspec64 new_spec64, old_spec64;
-	struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
-	struct itimerspec new_spec;
+	struct itimerspec64 new_spec, old_spec;
+	struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
 	int error = 0;
 
 	if (!new_setting)
 		return -EINVAL;
 
-	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+	if (get_itimerspec64(&new_spec, new_setting))
 		return -EFAULT;
-	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
 
-	error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+	error = do_timer_settime(timer_id, flags, &new_spec, rtn);
 	if (!error && old_setting) {
-		struct itimerspec old_spec;
-		old_spec = itimerspec64_to_itimerspec(&old_spec64);
-		if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+		if (put_itimerspec64(&old_spec, old_setting))
 			error = -EFAULT;
 	}
 	return error;
@@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 		       struct compat_itimerspec __user *, new,
 		       struct compat_itimerspec __user *, old)
 {
-	struct itimerspec64 new_spec64, old_spec64;
-	struct itimerspec64 *rtn = old ? &old_spec64 : NULL;
-	struct itimerspec new_spec;
+	struct itimerspec64 new_spec, old_spec;
+	struct itimerspec64 *rtn = old ? &old_spec : NULL;
 	int error = 0;
 
 	if (!new)
 		return -EINVAL;
-	if (get_compat_itimerspec(&new_spec, new))
+	if (get_compat_itimerspec64(&new_spec, new))
 		return -EFAULT;
 
-	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
-	error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+	error = do_timer_settime(timer_id, flags, &new_spec, rtn);
 	if (!error && old) {
-		struct itimerspec old_spec;
-		old_spec = itimerspec64_to_itimerspec(&old_spec64);
-		if (put_compat_itimerspec(old, &old_spec))
+		if (put_compat_itimerspec64(&old_spec, old))
 			error = -EFAULT;
 	}
 	return error;
-- 
cgit v1.3-14-g43fede


From c207aee48037abca71c669cbec407b9891965c34 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Wed, 28 Jun 2017 10:11:06 -0500
Subject: objtool, x86: Add several functions and files to the objtool
 whitelist

In preparation for an objtool rewrite which will have broader checks,
whitelist functions and files which cause problems because they do
unusual things with the stack.

These whitelists serve as a TODO list for which functions and files
don't yet have undwarf unwinder coverage.  Eventually most of the
whitelists can be removed in favor of manual CFI hint annotations or
objtool improvements.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: live-patching@vger.kernel.org
Link: http://lkml.kernel.org/r/7f934a5d707a574bda33ea282e9478e627fb1829.1498659915.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/crypto/Makefile           | 2 ++
 arch/x86/crypto/sha1-mb/Makefile   | 2 ++
 arch/x86/crypto/sha256-mb/Makefile | 2 ++
 arch/x86/kernel/Makefile           | 1 +
 arch/x86/kernel/acpi/Makefile      | 2 ++
 arch/x86/kernel/kprobes/opt.c      | 9 ++++++++-
 arch/x86/kernel/reboot.c           | 2 ++
 arch/x86/kvm/svm.c                 | 2 ++
 arch/x86/kvm/vmx.c                 | 3 +++
 arch/x86/lib/msr-reg.S             | 8 ++++----
 arch/x86/net/Makefile              | 2 ++
 arch/x86/platform/efi/Makefile     | 1 +
 arch/x86/power/Makefile            | 2 ++
 arch/x86/xen/Makefile              | 3 +++
 kernel/kexec_core.c                | 4 +++-
 15 files changed, 39 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 34b3fa2889d1..9e32d40d71bd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+OBJECT_FILES_NON_STANDARD := y
+
 avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
 				$(comma)4)$(comma)%ymm2,yes,no)
diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile
index 2f8756375df5..2e14acc3da25 100644
--- a/arch/x86/crypto/sha1-mb/Makefile
+++ b/arch/x86/crypto/sha1-mb/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+OBJECT_FILES_NON_STANDARD := y
+
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
                                 $(comma)4)$(comma)%ymm2,yes,no)
 ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile
index 41089e7c400c..45b4fca6c4a8 100644
--- a/arch/x86/crypto/sha256-mb/Makefile
+++ b/arch/x86/crypto/sha256-mb/Makefile
@@ -2,6 +2,8 @@
 # Arch-specific CryptoAPI modules.
 #
 
+OBJECT_FILES_NON_STANDARD := y
+
 avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
                                 $(comma)4)$(comma)%ymm2,yes,no)
 ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..3c7c419c4e3e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o	:= y
 
 # If instrumentation of this dir is enabled, boot hangs during first second.
 # Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
 obj-$(CONFIG_ACPI)		+= boot.o
 obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup_$(BITS).o
 obj-$(CONFIG_ACPI_APEI)		+= apei.o
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
 #include <linux/ftrace.h>
+#include <linux/frame.h>
 
 #include <asm/text-patching.h>
 #include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
 }
 
 asm (
+			"optprobe_template_func:\n"
 			".global optprobe_template_entry\n"
 			"optprobe_template_entry:\n"
 #ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
 			"	popf\n"
 #endif
 			".global optprobe_template_end\n"
-			"optprobe_template_end:\n");
+			"optprobe_template_end:\n"
+			".type optprobe_template_func, @function\n"
+			".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
 
 #define TMPL_MOVE_IDX \
 	((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/tboot.h>
 #include <linux/delay.h>
+#include <linux/frame.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(machine_real_restart);
 #endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
 
 /*
  * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ba9891ac5c56..33460fcdeef9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -36,6 +36,7 @@
 #include <linux/slab.h>
 #include <linux/amd-iommu.h>
 #include <linux/hashtable.h>
+#include <linux/frame.h>
 
 #include <asm/apic.h>
 #include <asm/perf_event.h>
@@ -4906,6 +4907,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	mark_all_clean(svm->vmcb);
 }
+STACK_FRAME_NON_STANDARD(svm_vcpu_run);
 
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ca5d2b93385c..1b469b6c762f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -33,6 +33,7 @@
 #include <linux/slab.h>
 #include <linux/tboot.h>
 #include <linux/hrtimer.h>
+#include <linux/frame.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -8652,6 +8653,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			);
 	}
 }
+STACK_FRAME_NON_STANDARD(vmx_handle_external_intr);
 
 static bool vmx_has_high_real_mode_segbase(void)
 {
@@ -9028,6 +9030,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx_recover_nmi_blocking(vmx);
 	vmx_complete_interrupts(vmx);
 }
+STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
 
 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 {
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index c81556409bbb..10ffa7e8519f 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -13,14 +13,14 @@
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
 	pushq %rbx
-	pushq %rbp
+	pushq %r12
 	movq	%rdi, %r10	/* Save pointer */
 	xorl	%r11d, %r11d	/* Return value */
 	movl    (%rdi), %eax
 	movl    4(%rdi), %ecx
 	movl    8(%rdi), %edx
 	movl    12(%rdi), %ebx
-	movl    20(%rdi), %ebp
+	movl    20(%rdi), %r12d
 	movl    24(%rdi), %esi
 	movl    28(%rdi), %edi
 1:	\op
@@ -29,10 +29,10 @@ ENTRY(\op\()_safe_regs)
 	movl    %ecx, 4(%r10)
 	movl    %edx, 8(%r10)
 	movl    %ebx, 12(%r10)
-	movl    %ebp, 20(%r10)
+	movl    %r12d, 20(%r10)
 	movl    %esi, 24(%r10)
 	movl    %edi, 28(%r10)
-	popq %rbp
+	popq %r12
 	popq %rbx
 	ret
 3:
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c33ddb0..fefb4b619598 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,4 +1,6 @@
 #
 # Arch-specific network modules
 #
+OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+
 obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index f1d83b34c329..2f56e1ed61c3 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,5 @@
 OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
 
 obj-$(CONFIG_EFI) 		+= quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
 obj-$(CONFIG_EARLY_PRINTK_EFI)	+= early_printk.o
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index a6a198c33623..05041871ac90 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
+
 # __restore_processor_state() restores %gs after S3 resume and so should not
 # itself be stack-protected
 nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fffb0a16f9e3..bced7a369a11 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,3 +1,6 @@
+OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_xen-pvh.o := y
+
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_spinlock.o = -pg
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ae1a3ba24df5..154ffb489b93 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -38,6 +38,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/compiler.h>
 #include <linux/hugetlb.h>
+#include <linux/frame.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -874,7 +875,7 @@ int kexec_load_disabled;
  * only when panic_cpu holds the current CPU number; this is the only CPU
  * which processes crash_kexec routines.
  */
-void __crash_kexec(struct pt_regs *regs)
+void __noclone __crash_kexec(struct pt_regs *regs)
 {
 	/* Take the kexec_mutex here to prevent sys_kexec_load
 	 * running on one cpu from replacing the crash kernel
@@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs)
 		mutex_unlock(&kexec_mutex);
 	}
 }
+STACK_FRAME_NON_STANDARD(__crash_kexec);
 
 void crash_kexec(struct pt_regs *regs)
 {
-- 
cgit v1.3-14-g43fede


From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 28 Oct 2016 01:22:25 -0700
Subject: randstruct: Mark various structs for randomization

This marks many critical kernel structures for randomization. These are
structures that have been targeted in the past in security exploits, or
contain function pointers, pointers to function pointer tables, lists,
workqueues, ref-counters, credentials, permissions, or are otherwise
sensitive. This initial list was extracted from Brad Spengler/PaX Team's
code in the last public patch of grsecurity/PaX based on my understanding
of the code. Changes or omissions from the original code are mine and
don't reflect the original grsecurity/PaX code.

Left out of this list is task_struct, which requires special handling
and will be covered in a subsequent patch.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/x86/include/asm/processor.h |  2 +-
 fs/mount.h                       |  4 ++--
 fs/namei.c                       |  2 +-
 fs/proc/internal.h               |  6 +++---
 include/linux/binfmts.h          |  4 ++--
 include/linux/cdev.h             |  2 +-
 include/linux/cred.h             |  4 ++--
 include/linux/dcache.h           |  2 +-
 include/linux/fs.h               | 17 +++++++++--------
 include/linux/fs_struct.h        |  2 +-
 include/linux/ipc.h              |  2 +-
 include/linux/ipc_namespace.h    |  2 +-
 include/linux/key-type.h         |  4 ++--
 include/linux/kmod.h             |  2 +-
 include/linux/kobject.h          |  2 +-
 include/linux/lsm_hooks.h        |  4 ++--
 include/linux/mm_types.h         |  4 ++--
 include/linux/module.h           |  4 ++--
 include/linux/mount.h            |  2 +-
 include/linux/msg.h              |  2 +-
 include/linux/path.h             |  2 +-
 include/linux/pid_namespace.h    |  2 +-
 include/linux/proc_ns.h          |  2 +-
 include/linux/sched.h            |  2 +-
 include/linux/sched/signal.h     |  2 +-
 include/linux/sem.h              |  2 +-
 include/linux/shm.h              |  2 +-
 include/linux/sysctl.h           |  2 +-
 include/linux/tty.h              |  2 +-
 include/linux/tty_driver.h       |  4 ++--
 include/linux/user_namespace.h   |  2 +-
 include/linux/utsname.h          |  2 +-
 include/net/af_unix.h            |  2 +-
 include/net/neighbour.h          |  2 +-
 include/net/net_namespace.h      |  2 +-
 include/net/sock.h               |  2 +-
 kernel/futex.c                   |  4 ++--
 security/keys/internal.h         |  2 +-
 38 files changed, 57 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..e2335edb9fc5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -129,7 +129,7 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 	u32			microcode;
-};
+} __randomize_layout;
 
 struct cpuid_regs {
 	u32 eax, ebx, ecx, edx;
diff --git a/fs/mount.h b/fs/mount.h
index bf1fda6eed8f..e406b286fba1 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -16,7 +16,7 @@ struct mnt_namespace {
 	u64 event;
 	unsigned int		mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
-};
+} __randomize_layout;
 
 struct mnt_pcp {
 	int mnt_count;
@@ -68,7 +68,7 @@ struct mount {
 	struct hlist_head mnt_pins;
 	struct fs_pin mnt_umount;
 	struct dentry *mnt_ex_mountpoint;
-};
+} __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
 
diff --git a/fs/namei.c b/fs/namei.c
index 6571a5f5112e..1764620ac383 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -524,7 +524,7 @@ struct nameidata {
 	struct inode	*link_inode;
 	unsigned	root_seq;
 	int		dfd;
-};
+} __randomize_layout;
 
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c5ae09b6c726..07b16318223f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -51,7 +51,7 @@ struct proc_dir_entry {
 	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
 	u8 namelen;
 	char name[];
-};
+} __randomize_layout;
 
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
@@ -70,7 +70,7 @@ struct proc_inode {
 	struct list_head sysctl_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
-};
+} __randomize_layout;
 
 /*
  * General functions
@@ -279,7 +279,7 @@ struct proc_maps_private {
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
-};
+} __randomize_layout;
 
 struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 05488da3aee9..3ae9013eeaaa 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -46,7 +46,7 @@ struct linux_binprm {
 	unsigned interp_flags;
 	unsigned interp_data;
 	unsigned long loader, exec;
-};
+} __randomize_layout;
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT)
@@ -81,7 +81,7 @@ struct linux_binfmt {
 	int (*load_shlib)(struct file *);
 	int (*core_dump)(struct coredump_params *cprm);
 	unsigned long min_coredump;	/* minimal dump size */
-};
+} __randomize_layout;
 
 extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
 
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index 408bc09ce497..cb28eb21e3ca 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -17,7 +17,7 @@ struct cdev {
 	struct list_head list;
 	dev_t dev;
 	unsigned int count;
-};
+} __randomize_layout;
 
 void cdev_init(struct cdev *, const struct file_operations *);
 
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b03e7d049a64..82c8a9e1aabb 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -31,7 +31,7 @@ struct group_info {
 	atomic_t	usage;
 	int		ngroups;
 	kgid_t		gid[0];
-};
+} __randomize_layout;
 
 /**
  * get_group_info - Get a reference to a group info structure
@@ -145,7 +145,7 @@ struct cred {
 	struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
 	struct group_info *group_info;	/* supplementary groups for euid/fsgid */
 	struct rcu_head	rcu;		/* RCU deletion hook */
-};
+} __randomize_layout;
 
 extern void __put_cred(struct cred *);
 extern void exit_creds(struct task_struct *);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index d2e38dc6172c..7eb262e13d3c 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -113,7 +113,7 @@ struct dentry {
 		struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
 	 	struct rcu_head d_rcu;
 	} d_u;
-};
+} __randomize_layout;
 
 /*
  * dentry->d_lock spinlock nesting subclasses:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9b2654..8f28143486c4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -275,7 +275,7 @@ struct kiocb {
 	void (*ki_complete)(struct kiocb *iocb, long ret, long ret2);
 	void			*private;
 	int			ki_flags;
-};
+} __randomize_layout;
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
 {
@@ -392,7 +392,7 @@ struct address_space {
 	gfp_t			gfp_mask;	/* implicit gfp mask for allocations */
 	struct list_head	private_list;	/* ditto */
 	void			*private_data;	/* ditto */
-} __attribute__((aligned(sizeof(long))));
+} __attribute__((aligned(sizeof(long)))) __randomize_layout;
 	/*
 	 * On most architectures that alignment is already the case; but
 	 * must be enforced here for CRIS, to let the least significant bit
@@ -435,7 +435,7 @@ struct block_device {
 	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
-};
+} __randomize_layout;
 
 /*
  * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
@@ -653,7 +653,7 @@ struct inode {
 #endif
 
 	void			*i_private; /* fs or device private pointer */
-};
+} __randomize_layout;
 
 static inline unsigned int i_blocksize(const struct inode *node)
 {
@@ -868,7 +868,8 @@ struct file {
 	struct list_head	f_tfile_llink;
 #endif /* #ifdef CONFIG_EPOLL */
 	struct address_space	*f_mapping;
-} __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
+} __randomize_layout
+  __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */
 
 struct file_handle {
 	__u32 handle_bytes;
@@ -1005,7 +1006,7 @@ struct file_lock {
 			int state;		/* state of grant or error if -ve */
 		} afs;
 	} fl_u;
-};
+} __randomize_layout;
 
 struct file_lock_context {
 	spinlock_t		flc_lock;
@@ -1404,7 +1405,7 @@ struct super_block {
 
 	spinlock_t		s_inode_wblist_lock;
 	struct list_head	s_inodes_wb;	/* writeback inodes */
-};
+} __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
  * not need to deal directly with kuid_t and kgid_t and can
@@ -1690,7 +1691,7 @@ struct file_operations {
 			u64);
 	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
 			u64);
-};
+} __randomize_layout;
 
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0efc3e62843a..7a026240cbb1 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -12,7 +12,7 @@ struct fs_struct {
 	int umask;
 	int in_exec;
 	struct path root, pwd;
-};
+} __randomize_layout;
 
 extern struct kmem_cache *fs_cachep;
 
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 71fd92d81b26..ea0eb0b5f98c 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -20,6 +20,6 @@ struct kern_ipc_perm {
 	umode_t		mode;
 	unsigned long	seq;
 	void		*security;
-} ____cacheline_aligned_in_smp;
+} ____cacheline_aligned_in_smp __randomize_layout;
 
 #endif /* _LINUX_IPC_H */
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 848e5796400e..65327ee0936b 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -61,7 +61,7 @@ struct ipc_namespace {
 	struct ucounts *ucounts;
 
 	struct ns_common ns;
-};
+} __randomize_layout;
 
 extern struct ipc_namespace init_ipc_ns;
 extern spinlock_t mq_lock;
diff --git a/include/linux/key-type.h b/include/linux/key-type.h
index 8496cf64575c..9520fc3c3b9a 100644
--- a/include/linux/key-type.h
+++ b/include/linux/key-type.h
@@ -45,7 +45,7 @@ struct key_preparsed_payload {
 	size_t		datalen;	/* Raw datalen */
 	size_t		quotalen;	/* Quota length for proposed payload */
 	time_t		expiry;		/* Expiry time of key */
-};
+} __randomize_layout;
 
 typedef int (*request_key_actor_t)(struct key_construction *key,
 				   const char *op, void *aux);
@@ -158,7 +158,7 @@ struct key_type {
 	/* internal fields */
 	struct list_head	link;		/* link in types list */
 	struct lock_class_key	lock_class;	/* key->sem lock class */
-};
+} __randomize_layout;
 
 extern struct key_type key_type_keyring;
 
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index c4e441e00db5..655082c88fd9 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -64,7 +64,7 @@ struct subprocess_info {
 	int (*init)(struct subprocess_info *info, struct cred *new);
 	void (*cleanup)(struct subprocess_info *info);
 	void *data;
-};
+} __randomize_layout;
 
 extern int
 call_usermodehelper(const char *path, char **argv, char **envp, int wait);
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index ca85cb80e99a..084513350317 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -172,7 +172,7 @@ struct kset {
 	spinlock_t list_lock;
 	struct kobject kobj;
 	const struct kset_uevent_ops *uevent_ops;
-};
+} __randomize_layout;
 
 extern void kset_init(struct kset *kset);
 extern int __must_check kset_register(struct kset *kset);
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 080f34e66017..565163fc9ad4 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1876,7 +1876,7 @@ struct security_hook_heads {
 	struct list_head audit_rule_match;
 	struct list_head audit_rule_free;
 #endif /* CONFIG_AUDIT */
-};
+} __randomize_layout;
 
 /*
  * Security module hook list structure.
@@ -1887,7 +1887,7 @@ struct security_hook_list {
 	struct list_head		*head;
 	union security_list_options	hook;
 	char				*lsm;
-};
+} __randomize_layout;
 
 /*
  * Initializing a security_hook_list structure takes
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 45cdb27791a3..ff151814a02d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -342,7 +342,7 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-};
+} __randomize_layout;
 
 struct core_thread {
 	struct task_struct *task;
@@ -500,7 +500,7 @@ struct mm_struct {
 	atomic_long_t hugetlb_usage;
 #endif
 	struct work_struct async_put_work;
-};
+} __randomize_layout;
 
 extern struct mm_struct init_mm;
 
diff --git a/include/linux/module.h b/include/linux/module.h
index 21f56393602f..d93111d7def6 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -45,7 +45,7 @@ struct module_kobject {
 	struct kobject *drivers_dir;
 	struct module_param_attrs *mp;
 	struct completion *kobj_completion;
-};
+} __randomize_layout;
 
 struct module_attribute {
 	struct attribute attr;
@@ -475,7 +475,7 @@ struct module {
 	ctor_fn_t *ctors;
 	unsigned int num_ctors;
 #endif
-} ____cacheline_aligned;
+} ____cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
 #endif
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 8e0352af06b7..1ce85e6fd95f 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -67,7 +67,7 @@ struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
 	struct super_block *mnt_sb;	/* pointer to superblock */
 	int mnt_flags;
-};
+} __randomize_layout;
 
 struct file; /* forward dec */
 struct path;
diff --git a/include/linux/msg.h b/include/linux/msg.h
index f3f302f9c197..a001305f5a79 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -29,7 +29,7 @@ struct msg_queue {
 	struct list_head q_messages;
 	struct list_head q_receivers;
 	struct list_head q_senders;
-};
+} __randomize_layout;
 
 /* Helper routines for sys_msgsnd and sys_msgrcv */
 extern long do_msgsnd(int msqid, long mtype, void __user *mtext,
diff --git a/include/linux/path.h b/include/linux/path.h
index d1372186f431..cde895cc4af4 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -7,7 +7,7 @@ struct vfsmount;
 struct path {
 	struct vfsmount *mnt;
 	struct dentry *dentry;
-};
+} __randomize_layout;
 
 extern void path_get(const struct path *);
 extern void path_put(const struct path *);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c2a989dee876..b09136f88cf4 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -52,7 +52,7 @@ struct pid_namespace {
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	struct ns_common ns;
-};
+} __randomize_layout;
 
 extern struct pid_namespace init_pid_ns;
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 58ab28d81fc2..06844b54dfc1 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -21,7 +21,7 @@ struct proc_ns_operations {
 	int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
 	struct user_namespace *(*owner)(struct ns_common *ns);
 	struct ns_common *(*get_parent)(struct ns_common *ns);
-};
+} __randomize_layout;
 
 extern const struct proc_ns_operations netns_operations;
 extern const struct proc_ns_operations utsns_operations;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b69fc650201..f833254fce00 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -408,7 +408,7 @@ struct sched_rt_entity {
 	/* rq "owned" by this entity/group: */
 	struct rt_rq			*my_q;
 #endif
-};
+} __randomize_layout;
 
 struct sched_dl_entity {
 	struct rb_node			rb_node;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index c06d63b3a583..2a0dd40b15db 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -222,7 +222,7 @@ struct signal_struct {
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
 					 * (notably. ptrace) */
-};
+} __randomize_layout;
 
 /*
  * Bits in flags field of signal_struct.
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 9edec926e9d9..23bcbdfad4a6 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -21,7 +21,7 @@ struct sem_array {
 	int			sem_nsems;	/* no. of semaphores in array */
 	int			complex_count;	/* pending complex operations */
 	unsigned int		use_global_lock;/* >0: global lock required */
-};
+} __randomize_layout;
 
 #ifdef CONFIG_SYSVIPC
 
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 04e881829625..0fb7061ec54c 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -22,7 +22,7 @@ struct shmid_kernel /* private to the kernel */
 	/* The task created the shm object.  NULL if the task is dead. */
 	struct task_struct	*shm_creator;
 	struct list_head	shm_clist;	/* list by creator */
-};
+} __randomize_layout;
 
 /* shm_mode upper byte flags */
 #define	SHM_DEST	01000	/* segment will be destroyed on last detach */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 80d07816def0..9ddeef2c03e2 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -117,7 +117,7 @@ struct ctl_table
 	struct ctl_table_poll *poll;
 	void *extra1;
 	void *extra2;
-};
+} __randomize_layout;
 
 struct ctl_node {
 	struct rb_node node;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index d07cd2105a6c..73f8d0977bb0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -333,7 +333,7 @@ struct tty_struct {
 	/* If the tty has a pending do_SAK, queue it here - akpm */
 	struct work_struct SAK_work;
 	struct tty_port *port;
-};
+} __randomize_layout;
 
 /* Each of a tty's open files has private_data pointing to tty_file_private */
 struct tty_file_private {
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index b742b5e47cc2..00b2213f6a35 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -291,7 +291,7 @@ struct tty_operations {
 	void (*poll_put_char)(struct tty_driver *driver, int line, char ch);
 #endif
 	const struct file_operations *proc_fops;
-};
+} __randomize_layout;
 
 struct tty_driver {
 	int	magic;		/* magic number for this structure */
@@ -325,7 +325,7 @@ struct tty_driver {
 
 	const struct tty_operations *ops;
 	struct list_head tty_drivers;
-};
+} __randomize_layout;
 
 extern struct list_head tty_drivers;
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 32354b4b4b2b..b3575ce29148 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -66,7 +66,7 @@ struct user_namespace {
 #endif
 	struct ucounts		*ucounts;
 	int ucount_max[UCOUNT_COUNTS];
-};
+} __randomize_layout;
 
 struct ucounts {
 	struct hlist_node node;
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 60f0bb83b313..da826ed059cf 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -26,7 +26,7 @@ struct uts_namespace {
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
 	struct ns_common ns;
-};
+} __randomize_layout;
 extern struct uts_namespace init_uts_ns;
 
 #ifdef CONFIG_UTS_NS
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index fd60eccb59a6..64e2a1e24a2c 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -36,7 +36,7 @@ struct unix_skb_parms {
 	u32			secid;		/* Security ID		*/
 #endif
 	u32			consumed;
-};
+} __randomize_layout;
 
 #define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
 
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index e4dd3a214034..a62959d2b3f7 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -155,7 +155,7 @@ struct neighbour {
 	struct rcu_head		rcu;
 	struct net_device	*dev;
 	u8			primary_key[0];
-};
+} __randomize_layout;
 
 struct neigh_ops {
 	int			family;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fe80bb48ab1f..a224196d16ac 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -147,7 +147,7 @@ struct net {
 #endif
 	struct sock		*diag_nlsk;
 	atomic_t		fnhe_genid;
-};
+} __randomize_layout;
 
 #include <linux/seq_file_net.h>
 
diff --git a/include/net/sock.h b/include/net/sock.h
index f33e3d134e0b..d349297db9e9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1113,7 +1113,7 @@ struct proto {
 	atomic_t		socks;
 #endif
 	int			(*diag_destroy)(struct sock *sk, int err);
-};
+} __randomize_layout;
 
 int proto_register(struct proto *prot, int alloc_slab);
 void proto_unregister(struct proto *prot);
diff --git a/kernel/futex.c b/kernel/futex.c
index 357348a6cf6b..5616511abf39 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -212,7 +212,7 @@ struct futex_pi_state {
 	atomic_t refcount;
 
 	union futex_key key;
-};
+} __randomize_layout;
 
 /**
  * struct futex_q - The hashed futex queue entry, one per waiting task
@@ -246,7 +246,7 @@ struct futex_q {
 	struct rt_mutex_waiter *rt_waiter;
 	union futex_key *requeue_pi_key;
 	u32 bitset;
-};
+} __randomize_layout;
 
 static const struct futex_q futex_q_init = {
 	/* list gets initialized in queue_me()*/
diff --git a/security/keys/internal.h b/security/keys/internal.h
index c0f8682eba69..6494954e9980 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -197,7 +197,7 @@ struct request_key_auth {
 	void			*callout_info;
 	size_t			callout_len;
 	pid_t			pid;
-};
+} __randomize_layout;
 
 extern struct key_type key_type_request_key_auth;
 extern struct key *request_key_auth_new(struct key *target,
-- 
cgit v1.3-14-g43fede


From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 30 Jun 2017 20:02:40 -0700
Subject: bpf: BPF support for sock_ops

Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.

Although there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it does not require
application changes and it can be updated easily at any time.

Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) because the existing type expects to be called
only once during the connection's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code.  For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has an "op" field to specify the
type of operation requested.

The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.

This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.

This patch defines a new BPF program type: BPF_PROG_TYPE_SOCK_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.

Two new corresponding structs (one for the kernel, one for the user/BPF
program):

/* kernel version */
struct bpf_sock_ops_kern {
        struct sock *sk;
        __u32  op;
        union {
                __u32 reply;
                __u32 replylong[4];
        };
};

/* user version
 * Some fields are in network byte order reflecting the sock struct
 * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
 * convert them to host byte order.
 */
struct bpf_sock_ops {
        __u32 op;
        union {
                __u32 reply;
                __u32 replylong[4];
        };
        __u32 family;
        __u32 remote_ip4;     /* In network byte order */
        __u32 local_ip4;      /* In network byte order */
        __u32 remote_ip6[4];  /* In network byte order */
        __u32 local_ip6[4];   /* In network byte order */
        __u32 remote_port;    /* In network byte order */
        __u32 local_port;     /* In host byte order */
};

Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.

The reply fields of the bpf_sock_ops struct are there in case a bpf
program needs to return a value larger than an integer.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  18 +++++
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h     |   9 +++
 include/net/tcp.h          |  36 ++++++++++
 include/uapi/linux/bpf.h   |  30 ++++++++
 kernel/bpf/cgroup.c        |  37 ++++++++++
 kernel/bpf/syscall.c       |   5 ++
 net/core/filter.c          | 168 +++++++++++++++++++++++++++++++++++++++++++++
 samples/bpf/bpf_load.c     |  13 +++-
 9 files changed, 314 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c970a25d2a49..360c082e885c 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -7,6 +7,7 @@
 struct sock;
 struct cgroup;
 struct sk_buff;
+struct bpf_sock_ops_kern;
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+				     struct bpf_sock_ops_kern *sock_ops,
+				     enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && (sock_ops)->sk) {	       \
+		typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk);	       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_sock_ops(__sk,	       \
+								 sock_ops,     \
+							 BPF_CGROUP_SOCK_OPS); \
+	}								       \
+	__ret;								       \
+})
 #else
 
 struct cgroup_bpf {};
@@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 03bf223f18be..3d137c33d664 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1fa26dc562ce..738f8b14f025 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void)
 	return SKF_AD_MAX;
 }
 
+struct bpf_sock_ops_kern {
+	struct	sock *sk;
+	u32	op;
+	union {
+		u32 reply;
+		u32 replylong[4];
+	};
+};
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0751b79d99c..e58500825006 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,10 @@
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
 
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/bpf-cgroup.h>
+
 extern struct inet_hashinfo tcp_hashinfo;
 
 extern struct percpu_counter tcp_orphan_count;
@@ -2021,4 +2025,36 @@ int tcp_set_ulp(struct sock *sk, const char *name);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
 
+/* Call BPF_SOCK_OPS program that returns an int. If the return value
+ * is < 0, then the BPF op failed (for example if the loaded BPF
+ * program does not support the chosen operation or there is no BPF
+ * program loaded).
+ */
+#ifdef CONFIG_BPF
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+	struct bpf_sock_ops_kern sock_ops;
+	int ret;
+
+	if (sk_fullsock(sk))
+		sock_owned_by_me(sk);
+
+	memset(&sock_ops, 0, sizeof(sock_ops));
+	sock_ops.sk = sk;
+	sock_ops.op = op;
+
+	ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+	if (ret == 0)
+		ret = sock_ops.reply;
+	else
+		ret = -1;
+	return ret;
+}
+#else
+static inline int tcp_call_bpf(struct sock *sk, int op)
+{
+	return -EPERM;
+}
+#endif
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f94b48b168dc..01cd485ccd4f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -120,12 +120,14 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_IN,
 	BPF_PROG_TYPE_LWT_OUT,
 	BPF_PROG_TYPE_LWT_XMIT,
+	BPF_PROG_TYPE_SOCK_OPS,
 };
 
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
 	BPF_CGROUP_INET_SOCK_CREATE,
+	BPF_CGROUP_SOCK_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -720,4 +722,32 @@ struct bpf_map_info {
 	__u32 map_flags;
 } __attribute__((aligned(8)));
 
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of this fields are in network (bigendian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+	__u32 op;
+	union {
+		__u32 reply;
+		__u32 replylong[4];
+	};
+	__u32 family;
+	__u32 remote_ip4;	/* Stored in network byte order */
+	__u32 local_ip4;	/* Stored in network byte order */
+	__u32 remote_ip6[4];	/* Stored in network byte order */
+	__u32 local_ip6[4];	/* Stored in network byte order */
+	__u32 remote_port;	/* Stored in network byte order */
+	__u32 local_port;	/* stored in host byte order */
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+	BPF_SOCK_OPS_VOID,
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.) May not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be exectuted
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+				     struct bpf_sock_ops_kern *sock_ops,
+				     enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4409ccca8831..d4d47de75bba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_SOCK_CREATE:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
 		break;
+	case BPF_CGROUP_SOCK_OPS:
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
@@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 
 	return ret;
 }
+
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
diff --git a/net/core/filter.c b/net/core/filter.c
index b39c869d22e3..1f6a26c4f8b9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3110,6 +3110,36 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+static bool __is_valid_sock_ops_access(int off, int size)
+{
+	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
+static bool sock_ops_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sock_ops, op) ...
+		     offsetof(struct bpf_sock_ops, replylong[3]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	return __is_valid_sock_ops_access(off, size);
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -3379,6 +3409,138 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
+				       const struct bpf_insn *si,
+				       struct bpf_insn *insn_buf,
+				       struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+	int off;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sock_ops, op) ...
+	     offsetof(struct bpf_sock_ops, replylong[3]):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
+			     FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
+		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
+			     FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
+		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
+			     FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+		off = si->off;
+		off -= offsetof(struct bpf_sock_ops, op);
+		off += offsetof(struct bpf_sock_ops_kern, op);
+		if (type == BPF_WRITE)
+			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					      off);
+		else
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+					      off);
+		break;
+
+	case offsetof(struct bpf_sock_ops, family):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_family));
+		break;
+
+	case offsetof(struct bpf_sock_ops, remote_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_daddr));
+		break;
+
+	case offsetof(struct bpf_sock_ops, local_ip4):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_rcv_saddr));
+		break;
+
+	case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
+	     offsetof(struct bpf_sock_ops, remote_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_daddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_daddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
+	     offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_rcv_saddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock_ops, remote_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock_ops, local_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_num));
+		break;
+	}
+	return insn - insn_buf;
+}
+
 const struct bpf_verifier_ops sk_filter_prog_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -3428,6 +3590,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
 
+const struct bpf_verifier_ops sock_ops_prog_ops = {
+	.get_func_proto		= bpf_base_func_proto,
+	.is_valid_access	= sock_ops_is_valid_access,
+	.convert_ctx_access	= sock_ops_convert_ctx_access,
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index a91c57dd8571..a4be7cfa6519 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -64,6 +64,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_perf_event = strncmp(event, "perf_event", 10) == 0;
 	bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0;
 	bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
+	bool is_sockops = strncmp(event, "sockops", 7) == 0;
 	size_t insns_cnt = size / sizeof(struct bpf_insn);
 	enum bpf_prog_type prog_type;
 	char buf[256];
@@ -89,6 +90,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_CGROUP_SKB;
 	} else if (is_cgroup_sk) {
 		prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+	} else if (is_sockops) {
+		prog_type = BPF_PROG_TYPE_SOCK_OPS;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -106,8 +109,11 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
 		return 0;
 
-	if (is_socket) {
-		event += 6;
+	if (is_socket || is_sockops) {
+		if (is_socket)
+			event += 6;
+		else
+			event += 7;
 		if (*event != '/')
 			return 0;
 		event++;
@@ -560,7 +566,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		    memcmp(shname, "xdp", 3) == 0 ||
 		    memcmp(shname, "perf_event", 10) == 0 ||
 		    memcmp(shname, "socket", 6) == 0 ||
-		    memcmp(shname, "cgroup/", 7) == 0)
+		    memcmp(shname, "cgroup/", 7) == 0 ||
+		    memcmp(shname, "sockops", 7) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
 
-- 
cgit v1.3-14-g43fede


From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 2 Jul 2017 02:13:27 +0200
Subject: bpf: simplify narrower ctx access

This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) ... offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track of changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.

We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   9 +-
 include/linux/filter.h   |  47 ++++++++++
 kernel/bpf/verifier.c    |  78 +++++++---------
 kernel/trace/bpf_trace.c |  31 +++---
 net/core/filter.c        | 239 +++++++++++++++++++++--------------------------
 5 files changed, 209 insertions(+), 195 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5175729270d7..b69e7a5869ff 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -156,9 +156,14 @@ struct bpf_prog;
 struct bpf_insn_access_aux {
 	enum bpf_reg_type reg_type;
 	int ctx_field_size;
-	int converted_op_size;
 };
 
+static inline void
+bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
+{
+	aux->ctx_field_size = size;
+}
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -173,7 +178,7 @@ struct bpf_verifier_ops {
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
-				  struct bpf_prog *prog);
+				  struct bpf_prog *prog, u32 *target_size);
 	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
 			union bpf_attr __user *uattr);
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 738f8b14f025..f1fc9baa3509 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -337,6 +337,22 @@ struct bpf_prog_aux;
 	bpf_size;						\
 })
 
+#define bpf_size_to_bytes(bpf_size)				\
+({								\
+	int bytes = -EINVAL;					\
+								\
+	if (bpf_size == BPF_B)					\
+		bytes = sizeof(u8);				\
+	else if (bpf_size == BPF_H)				\
+		bytes = sizeof(u16);				\
+	else if (bpf_size == BPF_W)				\
+		bytes = sizeof(u32);				\
+	else if (bpf_size == BPF_DW)				\
+		bytes = sizeof(u64);				\
+								\
+	bytes;							\
+})
+
 #define BPF_SIZEOF(type)					\
 	({							\
 		const int __size = bytes_to_bpf_size(sizeof(type)); \
@@ -351,6 +367,13 @@ struct bpf_prog_aux;
 		__size;						\
 	})
 
+#define BPF_LDST_BYTES(insn)					\
+	({							\
+		const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \
+		WARN_ON(__size < 0);				\
+		__size;						\
+	})
+
 #define __BPF_MAP_0(m, v, ...) v
 #define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
 #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
@@ -401,6 +424,18 @@ struct bpf_prog_aux;
 #define BPF_CALL_4(name, ...)	BPF_CALL_x(4, name, __VA_ARGS__)
 #define BPF_CALL_5(name, ...)	BPF_CALL_x(5, name, __VA_ARGS__)
 
+#define bpf_ctx_range(TYPE, MEMBER)						\
+	offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1
+#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)				\
+	offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
+
+#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)				\
+	({									\
+		BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE));		\
+		*(PTR_SIZE) = (SIZE);						\
+		offsetof(TYPE, MEMBER);						\
+	})
+
 #ifdef CONFIG_COMPAT
 /* A struct sock_filter is architecture independent. */
 struct compat_sock_fprog {
@@ -564,6 +599,18 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
 	return prog->type == BPF_PROG_TYPE_UNSPEC;
 }
 
+static inline bool
+bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
+{
+	bool off_ok;
+#ifdef __LITTLE_ENDIAN
+	off_ok = (off & (size_default - 1)) == 0;
+#else
+	off_ok = (off & (size_default - 1)) + size == size_default;
+#endif
+	return off_ok && size <= size_default && (size & (size - 1)) == 0;
+}
+
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
 #ifdef CONFIG_ARCH_HAS_SET_MEMORY
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6ea2adcb233b..6f820a044079 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
 	return 0;
 }
 
-static int bpf_size_to_bytes(int bpf_size)
-{
-	if (bpf_size == BPF_W)
-		return 4;
-	else if (bpf_size == BPF_H)
-		return 2;
-	else if (bpf_size == BPF_B)
-		return 1;
-	else if (bpf_size == BPF_DW)
-		return 8;
-	else
-		return -EINVAL;
-}
-
 static bool is_spillable_regtype(enum bpf_reg_type type)
 {
 	switch (type) {
@@ -761,7 +747,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
-	struct bpf_insn_access_aux info = { .reg_type = *reg_type };
+	struct bpf_insn_access_aux info = {
+		.reg_type = *reg_type,
+	};
 
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
@@ -769,25 +757,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 
 	if (env->prog->aux->ops->is_valid_access &&
 	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
-		/* a non zero info.ctx_field_size indicates:
-		 * . For this field, the prog type specific ctx conversion algorithm
-		 *   only supports whole field access.
-		 * . This ctx access is a candiate for later verifier transformation
-		 *   to load the whole field and then apply a mask to get correct result.
-		 * a non zero info.converted_op_size indicates perceived actual converted
-		 * value width in convert_ctx_access.
+		/* A non zero info.ctx_field_size indicates that this field is a
+		 * candidate for later verifier transformation to load the whole
+		 * field and then apply a mask when accessed with a narrower
+		 * access than actual ctx access size. A zero info.ctx_field_size
+		 * will only allow for whole field access and rejects any other
+		 * type of narrower access.
 		 */
-		if ((info.ctx_field_size && !info.converted_op_size) ||
-		    (!info.ctx_field_size &&  info.converted_op_size)) {
-			verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n",
-				env->prog->type, off, size);
-			return -EACCES;
-		}
-
-		if (info.ctx_field_size) {
-			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
-			env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size;
-		}
+		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		*reg_type = info.reg_type;
 
 		/* remember the offset of last byte accessed in ctx */
@@ -3401,11 +3378,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
 	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0;
+	bool is_narrower_load;
+	u32 target_size;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3445,39 +3424,50 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 
-		off = insn->off;
-		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
-		converted_op_size = env->insn_aux_data[i + delta].converted_op_size;
-		is_narrower_load = type == BPF_READ && size < ctx_field_size;
+		size = BPF_LDST_BYTES(insn);
 
 		/* If the read access is a narrower load of the field,
 		 * convert to a 4/8-byte load, to minimum program type specific
 		 * convert_ctx_access changes. If conversion is successful,
 		 * we will apply proper mask to the result.
 		 */
+		is_narrower_load = size < ctx_field_size;
 		if (is_narrower_load) {
-			int size_code = BPF_H;
+			u32 off = insn->off;
+			u8 size_code;
+
+			if (type == BPF_WRITE) {
+				verbose("bpf verifier narrow ctx access misconfigured\n");
+				return -EINVAL;
+			}
 
+			size_code = BPF_H;
 			if (ctx_field_size == 4)
 				size_code = BPF_W;
 			else if (ctx_field_size == 8)
 				size_code = BPF_DW;
+
 			insn->off = off & ~(ctx_field_size - 1);
 			insn->code = BPF_LDX | BPF_MEM | size_code;
 		}
-		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
-		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+
+		target_size = 0;
+		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+					      &target_size);
+		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+		    (ctx_field_size && !target_size)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
-		if (is_narrower_load && size < converted_op_size) {
+
+		if (is_narrower_load && size < target_size) {
 			if (ctx_field_size <= 4)
 				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
-							(1 << size * 8) - 1);
+								(1 << size * 8) - 1);
 			else
 				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
-							(1 << size * 8) - 1);
+								(1 << size * 8) - 1);
 		}
 
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 97c46b440cd6..5c6d538dbf43 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -583,7 +583,8 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
-	int sample_period_off;
+	const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+					 sample_period);
 
 	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
 		return false;
@@ -592,43 +593,35 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	if (off % size != 0)
 		return false;
 
-	/* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */
-	sample_period_off = offsetof(struct bpf_perf_event_data, sample_period);
-	if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) {
-		int allowed;
-
-#ifdef __LITTLE_ENDIAN
-		allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0;
-#else
-		allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0;
-#endif
-		if (!allowed)
+	switch (off) {
+	case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+		bpf_ctx_record_field_size(info, size_sp);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
 			return false;
-		info->ctx_field_size = 8;
-		info->converted_op_size = 8;
-	} else {
+		break;
+	default:
 		if (size != sizeof(long))
 			return false;
 	}
+
 	return true;
 }
 
 static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
 				      const struct bpf_insn *si,
 				      struct bpf_insn *insn_buf,
-				      struct bpf_prog *prog)
+				      struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 
 	switch (si->off) {
 	case offsetof(struct bpf_perf_event_data, sample_period):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
 						       data), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, data));
 		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
-				      offsetof(struct perf_sample_data, period));
+				      bpf_target_off(struct perf_sample_data, period, 8,
+						     target_size));
 		break;
 	default:
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/net/core/filter.c b/net/core/filter.c
index 29620df45b7c..94169572d002 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3088,38 +3088,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info)
+static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+				    struct bpf_insn_access_aux *info)
 {
-	info->ctx_field_size = 4;
-	switch (off) {
-	case offsetof(struct __sk_buff, pkt_type) ...
-	     offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, vlan_present) ...
-	     offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1:
-		info->converted_op_size = 1;
-		break;
-	case offsetof(struct __sk_buff, queue_mapping) ...
-	     offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, protocol) ...
-	     offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, vlan_tci) ...
-	     offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, vlan_proto) ...
-	     offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, tc_index) ...
-	     offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, tc_classid) ...
-	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
-		info->converted_op_size = 2;
-		break;
-	default:
-		info->converted_op_size = 4;
-	}
-}
+	const int size_default = sizeof(__u32);
 
-static bool __is_valid_access(int off, int size, enum bpf_access_type type,
-			      struct bpf_insn_access_aux *info)
-{
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
 
@@ -3128,40 +3101,24 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type,
 		return false;
 
 	switch (off) {
-	case offsetof(struct __sk_buff, cb[0]) ...
-	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
-		if (off + size >
-		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
+	case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+		if (off + size > offsetofend(struct __sk_buff, cb[4]))
 			return false;
 		break;
-	case offsetof(struct __sk_buff, data) ...
-	     offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
-		if (size != sizeof(__u32))
+	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		if (size != size_default)
 			return false;
-		info->reg_type = PTR_TO_PACKET;
-		break;
-	case offsetof(struct __sk_buff, data_end) ...
-	     offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
-		if (size != sizeof(__u32))
-			return false;
-		info->reg_type = PTR_TO_PACKET_END;
 		break;
 	default:
+		/* Only narrow read access allowed for now. */
 		if (type == BPF_WRITE) {
-			if (size != sizeof(__u32))
+			if (size != size_default)
 				return false;
 		} else {
-			int allowed;
-
-			/* permit narrower load for not cb/data/data_end fields */
-#ifdef __LITTLE_ENDIAN
-			allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0;
-#else
-			allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0;
-#endif
-			if (!allowed)
+			bpf_ctx_record_field_size(info, size_default);
+			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
 				return false;
-			__set_access_aux_info(off, info);
 		}
 	}
 
@@ -3173,26 +3130,22 @@ static bool sk_filter_is_valid_access(int off, int size,
 				      struct bpf_insn_access_aux *info)
 {
 	switch (off) {
-	case offsetof(struct __sk_buff, tc_classid) ...
-	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, data) ...
-	     offsetof(struct __sk_buff, data) + sizeof(__u32) - 1:
-	case offsetof(struct __sk_buff, data_end) ...
-	     offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1:
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_end):
 		return false;
 	}
 
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		default:
 			return false;
 		}
 	}
 
-	return __is_valid_access(off, size, type, info);
+	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
 static bool lwt_is_valid_access(int off, int size,
@@ -3200,24 +3153,31 @@ static bool lwt_is_valid_access(int off, int size,
 				struct bpf_insn_access_aux *info)
 {
 	switch (off) {
-	case offsetof(struct __sk_buff, tc_classid) ...
-	     offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1:
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
 		return false;
 	}
 
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct __sk_buff, mark):
-		case offsetof(struct __sk_buff, priority):
-		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, priority):
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		default:
 			return false;
 		}
 	}
 
-	return __is_valid_access(off, size, type, info);
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
@@ -3289,19 +3249,27 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
-		case offsetof(struct __sk_buff, mark):
-		case offsetof(struct __sk_buff, tc_index):
-		case offsetof(struct __sk_buff, priority):
-		case offsetof(struct __sk_buff, cb[0]) ...
-		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
-		case offsetof(struct __sk_buff, tc_classid):
+		case bpf_ctx_range(struct __sk_buff, mark):
+		case bpf_ctx_range(struct __sk_buff, tc_index):
+		case bpf_ctx_range(struct __sk_buff, priority):
+		case bpf_ctx_range(struct __sk_buff, tc_classid):
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		default:
 			return false;
 		}
 	}
 
-	return __is_valid_access(off, size, type, info);
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
 static bool __is_valid_xdp_access(int off, int size)
@@ -3374,98 +3342,108 @@ static bool sock_ops_is_valid_access(int off, int size,
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
-				  struct bpf_prog *prog)
+				  struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
 
 	switch (si->off) {
 	case offsetof(struct __sk_buff, len):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, len));
+				      bpf_target_off(struct sk_buff, len, 4,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, protocol):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
-
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, protocol));
+				      bpf_target_off(struct sk_buff, protocol, 2,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, vlan_proto):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
-
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, vlan_proto));
+				      bpf_target_off(struct sk_buff, vlan_proto, 2,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, priority):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
-
 		if (type == BPF_WRITE)
 			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, priority));
+					      bpf_target_off(struct sk_buff, priority, 4,
+							     target_size));
 		else
 			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, priority));
+					      bpf_target_off(struct sk_buff, priority, 4,
+							     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, ingress_ifindex):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, skb_iif));
+				      bpf_target_off(struct sk_buff, skb_iif, 4,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, ifindex):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
 		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      offsetof(struct net_device, ifindex));
+				      bpf_target_off(struct net_device, ifindex, 4,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, hash):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, hash));
+				      bpf_target_off(struct sk_buff, hash, 4,
+						     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, mark):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
-
 		if (type == BPF_WRITE)
 			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, mark));
+					      bpf_target_off(struct sk_buff, mark, 4,
+							     target_size));
 		else
 			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, mark));
+					      bpf_target_off(struct sk_buff, mark, 4,
+							     target_size));
 		break;
 
 	case offsetof(struct __sk_buff, pkt_type):
-		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
-					  si->src_reg, insn);
+		*target_size = 1;
+		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
+				      PKT_TYPE_OFFSET());
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
+#ifdef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
+#endif
+		break;
 
 	case offsetof(struct __sk_buff, queue_mapping):
-		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
-					  si->src_reg, insn);
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct sk_buff, queue_mapping, 2,
+						     target_size));
+		break;
 
 	case offsetof(struct __sk_buff, vlan_present):
-		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
-					  si->dst_reg, si->src_reg, insn);
-
 	case offsetof(struct __sk_buff, vlan_tci):
-		return convert_skb_access(SKF_AD_VLAN_TAG,
-					  si->dst_reg, si->src_reg, insn);
+		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+				      bpf_target_off(struct sk_buff, vlan_tci, 2,
+						     target_size));
+		if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
+			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
+						~VLAN_TAG_PRESENT);
+		} else {
+			*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
+			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
+		}
+		break;
 
 	case offsetof(struct __sk_buff, cb[0]) ...
-	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
+	     offsetofend(struct __sk_buff, cb[4]) - 1:
 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
 		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
 			      offsetof(struct qdisc_skb_cb, data)) %
@@ -3491,6 +3469,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 		off -= offsetof(struct __sk_buff, tc_classid);
 		off += offsetof(struct sk_buff, cb);
 		off += offsetof(struct qdisc_skb_cb, tc_classid);
+		*target_size = 2;
 		if (type == BPF_WRITE)
 			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
 					      si->src_reg, off);
@@ -3516,14 +3495,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 
 	case offsetof(struct __sk_buff, tc_index):
 #ifdef CONFIG_NET_SCHED
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
-
 		if (type == BPF_WRITE)
 			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, tc_index));
+					      bpf_target_off(struct sk_buff, tc_index, 2,
+							     target_size));
 		else
 			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
-					      offsetof(struct sk_buff, tc_index));
+					      bpf_target_off(struct sk_buff, tc_index, 2,
+							     target_size));
 #else
 		if (type == BPF_WRITE)
 			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
@@ -3534,10 +3513,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 
 	case offsetof(struct __sk_buff, napi_id):
 #if defined(CONFIG_NET_RX_BUSY_POLL)
-		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
-				      offsetof(struct sk_buff, napi_id));
+				      bpf_target_off(struct sk_buff, napi_id, 4,
+						     target_size));
 		*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
 		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
 #else
@@ -3552,7 +3530,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 					  const struct bpf_insn *si,
 					  struct bpf_insn *insn_buf,
-					  struct bpf_prog *prog)
+					  struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 
@@ -3596,22 +3574,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
 					 const struct bpf_insn *si,
 					 struct bpf_insn *insn_buf,
-					 struct bpf_prog *prog)
+					 struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 
 	switch (si->off) {
 	case offsetof(struct __sk_buff, ifindex):
-		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
-
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct sk_buff, dev));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      offsetof(struct net_device, ifindex));
+				      bpf_target_off(struct net_device, ifindex, 4,
+						     target_size));
 		break;
 	default:
-		return bpf_convert_ctx_access(type, si, insn_buf, prog);
+		return bpf_convert_ctx_access(type, si, insn_buf, prog,
+					      target_size);
 	}
 
 	return insn - insn_buf;
@@ -3620,7 +3598,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
 static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
-				  struct bpf_prog *prog)
+				  struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 
@@ -3643,7 +3621,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				       const struct bpf_insn *si,
 				       struct bpf_insn *insn_buf,
-				       struct bpf_prog *prog)
+				       struct bpf_prog *prog,
+				       u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
 	int off;
-- 
cgit v1.3-14-g43fede


From 9780c0ab1a4e64ef6998c4d83f9df5be806a02dc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 2 Jul 2017 02:13:28 +0200
Subject: bpf: export whether tail call has jited owner

We already export through fdinfo whether a prog is JITed or not.
Given that a program load can fail when only one of the prog and the
tail call map has the JITed property (i.e. they are not both JITed or
both not JITed), we can facilitate error reporting in loaders like
iproute2 by also exporting owner_jited of the tail call map. We already
export owner_prog_type through this facility, so a parser can pick up
both for comparison.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d4d47de75bba..18980472f5b0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -216,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_map *map = filp->private_data;
 	const struct bpf_array *array;
 	u32 owner_prog_type = 0;
+	u32 owner_jited = 0;
 
 	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
 		array = container_of(map, struct bpf_array, map);
 		owner_prog_type = array->owner_prog_type;
+		owner_jited = array->owner_jited;
 	}
 
 	seq_printf(m,
@@ -236,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->map_flags,
 		   map->pages * 1ULL << PAGE_SHIFT);
 
-	if (owner_prog_type)
+	if (owner_prog_type) {
 		seq_printf(m, "owner_prog_type:\t%u\n",
 			   owner_prog_type);
+		seq_printf(m, "owner_jited:\t%u\n",
+			   owner_jited);
+	}
 }
 #endif
 
-- 
cgit v1.3-14-g43fede


From 7bda4b40c5624c3f1c69227f8ebfd46a4b83f2ef Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 2 Jul 2017 02:13:29 +0200
Subject: bpf: extend bpf_trace_printk to support %i

Currently, bpf_trace_printk does not support the common formatting
symbol '%i'; however, vsprintf, which is what eventually gets called
by the bpf helper, does. If users are used to '%i' and currently
make use of it, then bpf_trace_printk will just return with an
error without dumping anything to the trace pipe, so just add
support for '%i' to the helper.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5c6d538dbf43..37385193a608 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
 }
 
 /*
- * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ * Only limited trace_printk() conversion specifiers allowed:
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
  */
 BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 	   u64, arg2, u64, arg3)
@@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
 			i++;
 		}
 
-		if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+		if (fmt[i] != 'i' && fmt[i] != 'd' &&
+		    fmt[i] != 'u' && fmt[i] != 'x')
 			return -EINVAL;
 		fmt_cnt++;
 	}
-- 
cgit v1.3-14-g43fede


From 43188702b3d98d2792969a3377a30957f05695e6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 2 Jul 2017 02:13:30 +0200
Subject: bpf, verifier: add additional patterns to evaluate_reg_imm_alu

Currently the verifier does not track imm across alu operations when
the source register is of unknown type. This adds additional pattern
matching to catch this and track imm. We've seen LLVM generating this
pattern while working on cilium.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6f820a044079..6a86723c5b64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1657,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+					struct bpf_insn *insn)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+	/* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+	if (src_reg->imm > 0 && dst_reg->imm) {
+		switch (opcode) {
+		case BPF_ADD:
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			break;
+		case BPF_AND:
+			/* dreg &= sreg
+			 * AND can not extend zero bits only shrink
+			 * Ex.  0x00..00ffffff
+			 *    & 0x0f..ffffffff
+			 *     ----------------
+			 *      0x00..00ffffff
+			 */
+			dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_OR:
+			/* dreg |= sreg
+			 * OR can only extend zero bits
+			 * Ex.  0x00..00ffffff
+			 *    | 0x0f..ffffffff
+			 *     ----------------
+			 *      0x0f..00ffffff
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_SUB:
+		case BPF_MUL:
+		case BPF_RSH:
+		case BPF_LSH:
+			/* These may be flushed out later */
+		default:
+			mark_reg_unknown_value(regs, insn->dst_reg);
+		}
+	} else {
+		mark_reg_unknown_value(regs, insn->dst_reg);
+	}
+
+	dst_reg->type = UNKNOWN_VALUE;
+	return 0;
+}
+
 static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 				struct bpf_insn *insn)
 {
@@ -1666,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	u8 opcode = BPF_OP(insn->code);
 	u64 dst_imm = dst_reg->imm;
 
+	if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+		return evaluate_reg_imm_alu_unknown(env, insn);
+
 	/* dst_reg->type == CONST_IMM here. Simulate execution of insns
 	 * containing ALU ops. Don't care about overflow or negative
 	 * values, just add/sub/... them; registers are in u64.
-- 
cgit v1.3-14-g43fede


From 3b9c08ae3dd44201b3a188aef34d6ddf73434015 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 4 Jul 2017 11:53:40 +0200
Subject: Revert "sched/cputime: Refactor the cputime_adjust() code"

This reverts commit 72298e5c92c50edd8cb7cfda4519483ce65fa166.

As Peter explains:

> Argh, no... That code was perfectly fine. The new code otoh is
> convoluted.
>
> The old code had the following form:
>
>         if (exception1)
>           deal with exception1
>
>         if (exception2)
>           deal with exception2
>
>         do normal stuff
>
> Which is as simple and straight forward as it gets.
>
> The new code otoh reads like:
>
>         if (!exception1) {
>                 if (exception2)
>                   deal with exception 2
>                 else
>                   do normal stuff
>         }

So restore the old form.

Also fix the comment describing the logic, as it was confusing.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Gustavo A. R. Silva <garsilva@embeddedor.com>
Cc: Frans Klaver <fransklaver@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..84a419bdf5aa 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
 	utime = curr->utime;
 
 	/*
-	 * If either stime or both stime and utime are 0, assume all runtime is
-	 * userspace. Once a task gets some ticks, the monotonicy code at
-	 * 'update' will ensure things converge to the observed ratio.
+	 * If either stime or utime are 0, assume all runtime is userspace.
+	 * Once a task gets some ticks, the monotonicy code at 'update:'
+	 * will ensure things converge to the observed ratio.
 	 */
-	if (stime != 0) {
-		if (utime == 0)
-			stime = rtime;
-		else
-			stime = scale_stime(stime, rtime, stime + utime);
+	if (stime == 0) {
+		utime = rtime;
+		goto update;
 	}
 
+	if (utime == 0) {
+		stime = rtime;
+		goto update;
+	}
+
+	stime = scale_stime(stime, rtime, stime + utime);
+
+update:
 	/*
 	 * Make sure stime doesn't go backwards; this preserves monotonicity
 	 * for utime because rtime is monotonic.
-- 
cgit v1.3-14-g43fede


From e5682b4eecb2b73282853d0ef314d3164b986997 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 4 Jul 2017 11:25:15 +0200
Subject: genirq/debugfs: Fix build for !CONFIG_IRQ_DOMAIN

Fix this build error:

kernel/irq/internals.h:440:20: error: inlining failed in call to always_inline
  'irq_domain_debugfs_init': function body not available
kernel/irq/debugfs.c:202:2: note: called from here
  irq_domain_debugfs_init(root_dir);
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1707041124000.1712@schleppi
---
 kernel/irq/internals.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9da14d125df4..dbfba9933ed2 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -437,7 +437,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
 # ifdef CONFIG_IRQ_DOMAIN
 void irq_domain_debugfs_init(struct dentry *root);
 # else
-static inline void irq_domain_debugfs_init(struct dentry *root);
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
 # endif
 #else /* CONFIG_GENERIC_IRQ_DEBUGFS */
 static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
-- 
cgit v1.3-14-g43fede


From 2372a519f63829b8effcdde5f4564a7e036294f0 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 4 Jul 2017 12:06:01 +0200
Subject: genirq: Force inlining of __irq_startup_managed to prevent build
 failure

If CONFIG_SMP=n, and gcc (e.g. 4.1.2) decides not to inline
__irq_startup_managed(), the build fails with:

    kernel/built-in.o: In function `irq_startup':
    (.text+0x38ed8): undefined reference to `irq_set_affinity_locked'

Fix this by forcing inlining of __irq_startup_managed().

Fixes: 761ea388e8c4e3ac ("genirq: Handle managed irqs gracefully in irq_startup()")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1499162761-12398-1-git-send-email-geert@linux-m68k.org
---
 kernel/irq/chip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2e30d925a40d..aa5497dfb29e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -234,7 +234,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 	return IRQ_STARTUP_MANAGED;
 }
 #else
-static int
+static __always_inline int
 __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 {
 	return IRQ_STARTUP_NORMAL;
-- 
cgit v1.3-14-g43fede


From 3a90795e1e885167209056a1a90be965add30e25 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 23:33:36 +0200
Subject: genirq: Move bus locking into __setup_irq()

There is no point in having the irq_bus_lock() protection around all
callers to __setup_irq().

Move it into __setup_irq(). This is also a preparatory patch for addressing
the issues with the irq resource callbacks.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Julia Cartwright <julia@ni.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Doug Anderson <dianders@chromium.org>
Cc: linux-rockchip@lists.infradead.org
Cc: John Keeping <john@metanate.com>
Cc: linux-gpio@vger.kernel.org
Link: http://lkml.kernel.org/r/20170629214343.960949031@linutronix.de
---
 kernel/irq/manage.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5c11c1730ba5..0934e02fa04e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
 		new->flags &= ~IRQF_ONESHOT;
 
+	chip_bus_lock(desc);
+
 	/*
 	 * The following block of code has to be executed atomically
 	 */
@@ -1347,6 +1349,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(desc);
 
 	irq_setup_timings(desc, new);
 
@@ -1378,6 +1381,8 @@ mismatch:
 out_unlock:
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
+	chip_bus_sync_unlock(desc);
+
 out_thread:
 	if (new->thread) {
 		struct task_struct *t = new->thread;
@@ -1417,9 +1422,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 	if (retval < 0)
 		return retval;
 
-	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
-	chip_bus_sync_unlock(desc);
 
 	if (retval)
 		irq_chip_pm_put(&desc->irq_data);
@@ -1674,9 +1677,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 		return retval;
 	}
 
-	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
-	chip_bus_sync_unlock(desc);
 
 	if (retval) {
 		irq_chip_pm_put(&desc->irq_data);
@@ -1924,9 +1925,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
 	if (retval < 0)
 		return retval;
 
-	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
-	chip_bus_sync_unlock(desc);
 
 	if (retval)
 		irq_chip_pm_put(&desc->irq_data);
@@ -1980,9 +1979,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 		return retval;
 	}
 
-	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
-	chip_bus_sync_unlock(desc);
 
 	if (retval) {
 		irq_chip_pm_put(&desc->irq_data);
-- 
cgit v1.3-14-g43fede


From 9114014cf4e6df0b22d764380ae1fc54f1a7a8b2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 23:33:37 +0200
Subject: genirq: Add mutex to irq desc to serialize request/free_irq()

The irq_request/release_resources() callbacks are currently invoked under
desc->lock with interrupts disabled. This is a source of problems on RT and
conceptually not required.

Add a separate mutex to struct irq_desc which allows serializing
request/free_irq(), which can be used to move the resource functions out of
the desc->lock held region.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Julia Cartwright <julia@ni.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Doug Anderson <dianders@chromium.org>
Cc: linux-rockchip@lists.infradead.org
Cc: John Keeping <john@metanate.com>
Cc: linux-gpio@vger.kernel.org
Link: http://lkml.kernel.org/r/20170629214344.039220922@linutronix.de
---
 include/linux/irqdesc.h | 3 +++
 kernel/irq/irqdesc.c    | 1 +
 kernel/irq/manage.c     | 8 ++++++++
 3 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d425a3a09722..3e90a094798d 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -3,6 +3,7 @@
 
 #include <linux/rcupdate.h>
 #include <linux/kobject.h>
+#include <linux/mutex.h>
 
 /*
  * Core internal functions to deal with irq descriptors
@@ -45,6 +46,7 @@ struct pt_regs;
  *			IRQF_FORCE_RESUME set
  * @rcu:		rcu head for delayed free
  * @kobj:		kobject used to represent this struct in sysfs
+ * @request_mutex:	mutex to protect request/free before locking desc->lock
  * @dir:		/proc/irq/ procfs entry
  * @debugfs_file:	dentry for the debugfs file
  * @name:		flow handler name for /proc/interrupts output
@@ -96,6 +98,7 @@ struct irq_desc {
 	struct rcu_head		rcu;
 	struct kobject		kobj;
 #endif
+	struct mutex		request_mutex;
 	int			parent_irq;
 	struct module		*owner;
 	const char		*name;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 948b50e78549..906a67e58391 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 
 	raw_spin_lock_init(&desc->lock);
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	mutex_init(&desc->request_mutex);
 	init_rcu_head(&desc->rcu);
 
 	desc_set_defaults(irq, desc, node, affinity, owner);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0934e02fa04e..0139908b8d53 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
 		new->flags &= ~IRQF_ONESHOT;
 
+	mutex_lock(&desc->request_mutex);
+
 	chip_bus_lock(desc);
 
 	/*
@@ -1350,6 +1352,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	chip_bus_sync_unlock(desc);
+	mutex_unlock(&desc->request_mutex);
 
 	irq_setup_timings(desc, new);
 
@@ -1383,6 +1386,8 @@ out_unlock:
 
 	chip_bus_sync_unlock(desc);
 
+	mutex_unlock(&desc->request_mutex);
+
 out_thread:
 	if (new->thread) {
 		struct task_struct *t = new->thread;
@@ -1446,6 +1451,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	if (!desc)
 		return NULL;
 
+	mutex_lock(&desc->request_mutex);
 	chip_bus_lock(desc);
 	raw_spin_lock_irqsave(&desc->lock, flags);
 
@@ -1521,6 +1527,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		}
 	}
 
+	mutex_unlock(&desc->request_mutex);
+
 	irq_chip_pm_put(&desc->irq_data);
 	module_put(desc->owner);
 	kfree(action->secondary);
-- 
cgit v1.3-14-g43fede


From 46e48e257360f0845fe17089713cbad4db611e70 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 23:33:38 +0200
Subject: genirq: Move irq resource handling out of spinlocked region

Aside from being conceptually wrong, there is also an actual (hard to
trigger and mostly theoretical) problem.

CPU0				CPU1
free_irq(X)			interrupt X
				spin_lock(desc->lock)
				wake irq thread()
				spin_unlock(desc->lock)
spin_lock(desc->lock)
remove action()
shutdown_irq()
release_resources()		thread_handler()
spin_unlock(desc->lock)		  access released resources.

synchronize_irq()

Move the release resources invocation after synchronize_irq() so it's
guaranteed that the threaded handler has finished.

Move the resource request call out of the desc->lock held region as well,
so the invocation context is the same for both request and release.

This solves the problems with those functions on RT as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Julia Cartwright <julia@ni.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Doug Anderson <dianders@chromium.org>
Cc: linux-rockchip@lists.infradead.org
Cc: John Keeping <john@metanate.com>
Cc: linux-gpio@vger.kernel.org
Link: http://lkml.kernel.org/r/20170629214344.117028181@linutronix.de
---
 kernel/irq/manage.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0139908b8d53..3e693430bfe1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1168,6 +1168,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		new->flags &= ~IRQF_ONESHOT;
 
 	mutex_lock(&desc->request_mutex);
+	if (!desc->action) {
+		ret = irq_request_resources(desc);
+		if (ret) {
+			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+			       new->name, irq, desc->irq_data.chip->name);
+			goto out_mutex;
+		}
+	}
 
 	chip_bus_lock(desc);
 
@@ -1271,13 +1279,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	}
 
 	if (!shared) {
-		ret = irq_request_resources(desc);
-		if (ret) {
-			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
-			       new->name, irq, desc->irq_data.chip->name);
-			goto out_unlock;
-		}
-
 		init_waitqueue_head(&desc->wait_for_threads);
 
 		/* Setup the type (level, edge polarity) if configured: */
@@ -1386,6 +1387,10 @@ out_unlock:
 
 	chip_bus_sync_unlock(desc);
 
+	if (!desc->action)
+		irq_release_resources(desc);
+
+out_mutex:
 	mutex_unlock(&desc->request_mutex);
 
 out_thread:
@@ -1484,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	if (!desc->action) {
 		irq_settings_clr_disable_unlazy(desc);
 		irq_shutdown(desc);
-		irq_release_resources(desc);
 		irq_remove_timings(desc);
 	}
 
@@ -1527,6 +1531,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		}
 	}
 
+	if (!desc->action)
+		irq_release_resources(desc);
+
 	mutex_unlock(&desc->request_mutex);
 
 	irq_chip_pm_put(&desc->irq_data);
-- 
cgit v1.3-14-g43fede


From 2343877fbda701599653e63f8dcc318aa1bf15ee Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jun 2017 23:33:39 +0200
Subject: genirq/timings: Move free timings out of spinlocked region

No point to do memory management from an interrupt disabled spin locked
region.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Heiko Stuebner <heiko@sntech.de>
Cc: Julia Cartwright <julia@ni.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Brian Norris <briannorris@chromium.org>
Cc: Doug Anderson <dianders@chromium.org>
Cc: linux-rockchip@lists.infradead.org
Cc: John Keeping <john@metanate.com>
Cc: linux-gpio@vger.kernel.org
Link: http://lkml.kernel.org/r/20170629214344.196130646@linutronix.de
---
 kernel/irq/manage.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3e693430bfe1..91e1f2390752 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1489,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	if (!desc->action) {
 		irq_settings_clr_disable_unlazy(desc);
 		irq_shutdown(desc);
-		irq_remove_timings(desc);
 	}
 
 #ifdef CONFIG_SMP
@@ -1531,8 +1530,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		}
 	}
 
-	if (!desc->action)
+	if (!desc->action) {
 		irq_release_resources(desc);
+		irq_remove_timings(desc);
+	}
 
 	mutex_unlock(&desc->request_mutex);
 
-- 
cgit v1.3-14-g43fede


From 1d0c6e593023ac5dafc2ea2b3f23d96f1c1f2fa2 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Fri, 30 Jun 2017 10:22:14 +0530
Subject: PM / sleep: constify attribute_group structures

attribute_groups are not supposed to change at runtime. All functions
working with attribute_groups provided by <linux/sysfs.h> work with const
attribute_group. So mark the non-const structs as const.

File size before:
   text	   data	    bss	    dec	    hex	filename
   3802	    624	     32	   4458	   116a	kernel/power/main.o

File size After adding 'const':
   text	   data	    bss	    dec	    hex	filename
   3866	    560	     32	   4458	   116a	kernel/power/main.o

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index d401c21136d1..42bd800a6755 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -705,7 +705,7 @@ static struct attribute * g[] = {
 	NULL,
 };
 
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
 	.attrs = g,
 };
 
-- 
cgit v1.3-14-g43fede


From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:07 +0200
Subject: vtime, sched/cputime: Remove vtime_account_user()

It's an unnecessary function between vtime_user_exit() and
account_user_time().

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/vtime.h  |  9 +--------
 kernel/sched/cputime.c | 12 ++++++------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 0681fe25abeb..18b405e3cd93 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { }
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void arch_vtime_task_switch(struct task_struct *tsk);
-extern void vtime_account_user(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
-
-static inline void vtime_user_exit(struct task_struct *tsk)
-{
-	vtime_account_user(tsk);
-}
-
+extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
-static inline void vtime_account_user(struct task_struct *tsk) { }
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 84a419bdf5aa..5adc896d0f64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -724,21 +724,21 @@ void vtime_account_system(struct task_struct *tsk)
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
 	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
+		__vtime_account_system(tsk);
+	tsk->vtime_snap_whence = VTIME_USER;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
+	tsk->vtime_snap_whence = VTIME_SYS;
 	if (vtime_delta(tsk))
-		__vtime_account_system(tsk);
-	tsk->vtime_snap_whence = VTIME_USER;
+		account_user_time(tsk, get_vtime_delta(tsk));
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-- 
cgit v1.3-14-g43fede


From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:08 +0200
Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting
 vtime

Even though it doesn't have functional consequences, setting
the task's new context state after we actually accounted the pending
vtime from the old context state makes more sense from a review
perspective.

vtime_user_exit() is the only function that doesn't follow that rule
and that can bug the reviewer for a little while until he realizes there
is no reason for this special case.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5adc896d0f64..ab68927e8e94 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct *tsk)
 void vtime_user_exit(struct task_struct *tsk)
 {
 	write_seqcount_begin(&tsk->vtime_seqcount);
-	tsk->vtime_snap_whence = VTIME_SYS;
 	if (vtime_delta(tsk))
 		account_user_time(tsk, get_vtime_delta(tsk));
+	tsk->vtime_snap_whence = VTIME_SYS;
 	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
-- 
cgit v1.3-14-g43fede


From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:09 +0200
Subject: sched/cputime: Rename vtime fields

The current "snapshot" based naming on vtime fields suggests we record
some past event but that's a low level picture of their actual purpose
which comes out blurry. The real point of these fields is to run a basic
state machine that tracks down cputime entry while switching between
contexts.

So let's reflect that with more meaningful names.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h |  4 ++--
 include/linux/sched.h     |  4 ++--
 kernel/fork.c             |  4 ++--
 kernel/sched/cputime.c    | 30 +++++++++++++++---------------
 4 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc188..3d537331cd4e 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -171,8 +171,8 @@ extern struct cred init_cred;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
 	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_snap = 0,				\
-	.vtime_snap_whence = VTIME_SYS,
+	.vtime_starttime = 0,				\
+	.vtime_state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c4ca7433d9d..ff001646549e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -689,7 +689,7 @@ struct task_struct {
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_snap;
+	unsigned long long		vtime_starttime;
 	enum {
 		/* Task is sleeping or running in a CPU with VTIME inactive: */
 		VTIME_INACTIVE = 0,
@@ -697,7 +697,7 @@ struct task_struct {
 		VTIME_USER,
 		/* Task runs in kernelspace in a CPU with VTIME active: */
 		VTIME_SYS,
-	} vtime_snap_whence;
+	} vtime_state;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..83c4f9bf3e14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1638,8 +1638,8 @@ static __latent_entropy struct task_struct *copy_process(
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 	seqcount_init(&p->vtime_seqcount);
-	p->vtime_snap = 0;
-	p->vtime_snap_whence = VTIME_INACTIVE;
+	p->vtime_starttime = 0;
+	p->vtime_state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ab68927e8e94..8c64753067c5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struct *tsk)
 {
 	unsigned long now = READ_ONCE(jiffies);
 
-	if (time_before(now, (unsigned long)tsk->vtime_snap))
+	if (time_before(now, (unsigned long)tsk->vtime_starttime))
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_snap);
+	return jiffies_to_nsecs(now - tsk->vtime_starttime);
 }
 
 static u64 get_vtime_delta(struct task_struct *tsk)
@@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
+	delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
-	tsk->vtime_snap = now;
+	WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
+	tsk->vtime_starttime = now;
 
 	return delta - other;
 }
@@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struct *tsk)
 {
 	/*
 	 * The flags must be updated under the lock with
-	 * the vtime_snap flush and update.
+	 * the vtime_starttime flush and update.
 	 * That enforces a right ordering and update sequence
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
@@ -776,12 +776,12 @@ void vtime_account_idle(struct task_struct *tsk)
 void arch_vtime_task_switch(struct task_struct *prev)
 {
 	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_snap_whence = VTIME_INACTIVE;
+	prev->vtime_state = VTIME_INACTIVE;
 	write_seqcount_end(&prev->vtime_seqcount);
 
 	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_snap_whence = VTIME_SYS;
-	current->vtime_snap = jiffies;
+	current->vtime_state = VTIME_SYS;
+	current->vtime_starttime = jiffies;
 	write_seqcount_end(&current->vtime_seqcount);
 }
 
@@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 
 	local_irq_save(flags);
 	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_snap_whence = VTIME_SYS;
-	t->vtime_snap = jiffies;
+	t->vtime_state = VTIME_SYS;
+	t->vtime_starttime = jiffies;
 	write_seqcount_end(&t->vtime_seqcount);
 	local_irq_restore(flags);
 }
@@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t)
 		seq = read_seqcount_begin(&t->vtime_seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
+		if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
 			gtime += vtime_delta(t);
 
 	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
@@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+		if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
 		delta = vtime_delta(t);
@@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
+		if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
 			*utime += delta;
-		else if (t->vtime_snap_whence == VTIME_SYS)
+		else if (t->vtime_state == VTIME_SYS)
 			*stime += delta;
 	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }
-- 
cgit v1.3-14-g43fede


From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 29 Jun 2017 19:15:10 +0200
Subject: sched/cputime: Move the vtime task fields to their own struct

We are about to add vtime accumulation fields to the task struct. Let's
avoid more bloatification and gather vtime information to their own
struct.

Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/init_task.h |   6 +--
 include/linux/sched.h     |  26 ++++++-----
 kernel/fork.c             |   6 +--
 kernel/sched/cputime.c    | 112 ++++++++++++++++++++++++++--------------------
 4 files changed, 86 insertions(+), 64 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 3d537331cd4e..a2f6707e9fc0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -170,9 +170,9 @@ extern struct cred init_cred;
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
-	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
-	.vtime_starttime = 0,				\
-	.vtime_state = VTIME_SYS,
+	.vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount),		\
+	.vtime.starttime = 0,						\
+	.vtime.state = VTIME_SYS,
 #else
 # define INIT_VTIME(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff001646549e..eeff8a024f0c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,6 +223,21 @@ struct task_cputime {
 #define prof_exp			stime
 #define sched_exp			sum_exec_runtime
 
+enum vtime_state {
+	/* Task is sleeping or running in a CPU with VTIME inactive: */
+	VTIME_INACTIVE = 0,
+	/* Task runs in userspace in a CPU with VTIME active: */
+	VTIME_USER,
+	/* Task runs in kernelspace in a CPU with VTIME active: */
+	VTIME_SYS,
+};
+
+struct vtime {
+	seqcount_t		seqcount;
+	unsigned long long	starttime;
+	enum vtime_state	state;
+};
+
 struct sched_info {
 #ifdef CONFIG_SCHED_INFO
 	/* Cumulative counters: */
@@ -688,16 +703,7 @@ struct task_struct {
 	u64				gtime;
 	struct prev_cputime		prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_t			vtime_seqcount;
-	unsigned long long		vtime_starttime;
-	enum {
-		/* Task is sleeping or running in a CPU with VTIME inactive: */
-		VTIME_INACTIVE = 0,
-		/* Task runs in userspace in a CPU with VTIME active: */
-		VTIME_USER,
-		/* Task runs in kernelspace in a CPU with VTIME active: */
-		VTIME_SYS,
-	} vtime_state;
+	struct vtime			vtime;
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
diff --git a/kernel/fork.c b/kernel/fork.c
index 83c4f9bf3e14..d927ec11aa7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process(
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	seqcount_init(&p->vtime_seqcount);
-	p->vtime_starttime = 0;
-	p->vtime_state = VTIME_INACTIVE;
+	seqcount_init(&p->vtime.seqcount);
+	p->vtime.starttime = 0;
+	p->vtime.state = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8c64753067c5..9ee725edcbe0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
 {
 	unsigned long now = READ_ONCE(jiffies);
 
-	if (time_before(now, (unsigned long)tsk->vtime_starttime))
+	if (time_before(now, (unsigned long)vtime->starttime))
 		return 0;
 
-	return jiffies_to_nsecs(now - tsk->vtime_starttime);
+	return jiffies_to_nsecs(now - vtime->starttime);
 }
 
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
 {
 	unsigned long now = READ_ONCE(jiffies);
 	u64 delta, other;
@@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_struct *tsk)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
+	delta = jiffies_to_nsecs(now - vtime->starttime);
 	other = account_other_time(delta);
-	WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
-	tsk->vtime_starttime = now;
+	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+	vtime->starttime = now;
 
 	return delta - other;
 }
 
 static void __vtime_account_system(struct task_struct *tsk)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+	account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
 }
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	if (!vtime_delta(vtime))
 		return;
 
-	write_seqcount_begin(&tsk->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
 	__vtime_account_system(tsk);
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
 		__vtime_account_system(tsk);
-	tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	vtime->state = VTIME_USER;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_user_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
-		account_user_time(tsk, get_vtime_delta(tsk));
-	tsk->vtime_snap_whence = VTIME_SYS;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
+		account_user_time(tsk, get_vtime_delta(vtime));
+	vtime->state = VTIME_SYS;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	struct vtime *vtime = &tsk->vtime;
 	/*
 	 * The flags must be updated under the lock with
 	 * the vtime_starttime flush and update.
@@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
 	 */
-	write_seqcount_begin(&tsk->vtime_seqcount);
-	if (vtime_delta(tsk))
+	write_seqcount_begin(&vtime->seqcount);
+	if (vtime_delta(vtime))
 		__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-	write_seqcount_begin(&tsk->vtime_seqcount);
+	struct vtime *vtime = &tsk->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
 	__vtime_account_system(tsk);
 	current->flags &= ~PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seqcount);
+	write_seqcount_end(&vtime->seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
-	account_idle_time(get_vtime_delta(tsk));
+	account_idle_time(get_vtime_delta(&tsk->vtime));
 }
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-	write_seqcount_begin(&prev->vtime_seqcount);
-	prev->vtime_state = VTIME_INACTIVE;
-	write_seqcount_end(&prev->vtime_seqcount);
+	struct vtime *vtime = &prev->vtime;
 
-	write_seqcount_begin(&current->vtime_seqcount);
-	current->vtime_state = VTIME_SYS;
-	current->vtime_starttime = jiffies;
-	write_seqcount_end(&current->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_INACTIVE;
+	write_seqcount_end(&vtime->seqcount);
+
+	vtime = &current->vtime;
+
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = jiffies;
+	write_seqcount_end(&vtime->seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	write_seqcount_begin(&t->vtime_seqcount);
-	t->vtime_state = VTIME_SYS;
-	t->vtime_starttime = jiffies;
-	write_seqcount_end(&t->vtime_seqcount);
+	write_seqcount_begin(&vtime->seqcount);
+	vtime->state = VTIME_SYS;
+	vtime->starttime = jiffies;
+	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
 
 u64 task_gtime(struct task_struct *t)
 {
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
 	u64 gtime;
 
@@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t)
 		return t->gtime;
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		gtime = t->gtime;
-		if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(t);
+		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+			gtime += vtime_delta(vtime);
 
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
 	return gtime;
 }
@@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t)
  */
 void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 {
-	u64 delta;
+	struct vtime *vtime = &t->vtime;
 	unsigned int seq;
+	u64 delta;
 
 	if (!vtime_accounting_enabled()) {
 		*utime = t->utime;
@@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 	}
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seqcount);
+		seq = read_seqcount_begin(&vtime->seqcount);
 
 		*utime = t->utime;
 		*stime = t->stime;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
+		if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
 			continue;
 
-		delta = vtime_delta(t);
+		delta = vtime_delta(vtime);
 
 		/*
 		 * Task runs either in user or kernel space, add pending nohz time to
 		 * the right place.
 		 */
-		if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
+		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
 			*utime += delta;
-		else if (t->vtime_state == VTIME_SYS)
+		else if (vtime->state == VTIME_SYS)
 			*stime += delta;
-	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-- 
cgit v1.3-14-g43fede


From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 29 Jun 2017 19:15:11 +0200
Subject: sched/cputime: Accumulate vtime on top of nsec clocksource

Currently the cputime source used by vtime is jiffies. When we cross
a context boundary and jiffies have changed since the last snapshot, the
pending cputime is accounted to the switching out context.

This system works ok if the ticks are not aligned across CPUs. If they
instead are aligned (ie: all fire at the same time) and the CPUs run in
userspace, the jiffies change is only observed on tick exit and therefore
the user cputime is accounted as system cputime. This is because the
CPU that maintains timekeeping fires its tick at the same time as the
others. It updates jiffies in the middle of the tick and the other CPUs
see that update on IRQ exit:

    CPU 0 (timekeeper)                  CPU 1
    -------------------              -------------
                      jiffies = N
    ...                              run in userspace for a jiffy
    tick entry                       tick entry (sees jiffies = N)
    set jiffies = N + 1
    tick exit                        tick exit (sees jiffies = N + 1)
                                                account 1 jiffy as stime

Fix this by using a nanosecond clock source instead of jiffies. The
cputime is then accumulated and flushed every time the pending delta
reaches a jiffy in order to mitigate the accounting overhead.

[ fweisbec: changelog, rebase on struct vtime, field renames, add delta
  on cputime readers, keep idle vtime as-is (low overhead accounting),
  harmonize clock sources. ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h  |  3 +++
 kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++++++++-----------------
 2 files changed, 45 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eeff8a024f0c..4818126c5153 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -236,6 +236,9 @@ struct vtime {
 	seqcount_t		seqcount;
 	unsigned long long	starttime;
 	enum vtime_state	state;
+	u64			utime;
+	u64			stime;
+	u64			gtime;
 };
 
 struct sched_info {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9ee725edcbe0..6e3ea4ac1bda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 static u64 vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
+	unsigned long long clock;
 
-	if (time_before(now, (unsigned long)vtime->starttime))
+	clock = sched_clock_cpu(smp_processor_id());
+	if (clock < vtime->starttime)
 		return 0;
 
-	return jiffies_to_nsecs(now - vtime->starttime);
+	return clock - vtime->starttime;
 }
 
 static u64 get_vtime_delta(struct vtime *vtime)
 {
-	unsigned long now = READ_ONCE(jiffies);
-	u64 delta, other;
+	u64 delta = vtime_delta(vtime);
+	u64 other;
 
 	/*
 	 * Unlike tick based timing, vtime based timing never has lost
@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime)
 	 * elapsed time. Limit account_other_time to prevent rounding
 	 * errors from causing elapsed vtime to go negative.
 	 */
-	delta = jiffies_to_nsecs(now - vtime->starttime);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
-	vtime->starttime = now;
+	vtime->starttime += delta;
 
 	return delta - other;
 }
 
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+				   struct vtime *vtime)
 {
-	account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
+	vtime->stime += get_vtime_delta(vtime);
+	if (vtime->stime >= TICK_NSEC) {
+		account_system_time(tsk, irq_count(), vtime->stime);
+		vtime->stime = 0;
+	}
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+				struct vtime *vtime)
+{
+	vtime->gtime += get_vtime_delta(vtime);
+	if (vtime->gtime >= TICK_NSEC) {
+		account_guest_time(tsk, vtime->gtime);
+		vtime->gtime = 0;
+	}
 }
 
 void vtime_account_system(struct task_struct *tsk)
@@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk)
 		return;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	/* We might have scheduled out from guest path */
+	if (current->flags & PF_VCPU)
+		vtime_account_guest(tsk, vtime);
+	else
+		__vtime_account_system(tsk, vtime);
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	vtime->state = VTIME_USER;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		account_user_time(tsk, get_vtime_delta(vtime));
+	vtime->utime += get_vtime_delta(vtime);
+	if (vtime->utime >= TICK_NSEC) {
+		account_user_time(tsk, vtime->utime);
+		vtime->utime = 0;
+	}
 	vtime->state = VTIME_SYS;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * that can thus safely catch up with a tickless delta.
 	 */
 	write_seqcount_begin(&vtime->seqcount);
-	if (vtime_delta(vtime))
-		__vtime_account_system(tsk);
+	__vtime_account_system(tsk, vtime);
 	current->flags |= PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk)
 	struct vtime *vtime = &tsk->vtime;
 
 	write_seqcount_begin(&vtime->seqcount);
-	__vtime_account_system(tsk);
+	vtime_account_guest(tsk, vtime);
 	current->flags &= ~PF_VCPU;
 	write_seqcount_end(&vtime->seqcount);
 }
@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(smp_processor_id());
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = jiffies;
+	vtime->starttime = sched_clock_cpu(cpu);
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
 
 		gtime = t->gtime;
 		if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
-			gtime += vtime_delta(vtime);
+			gtime += vtime->gtime + vtime_delta(vtime);
 
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 
@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 		 * the right place.
 		 */
 		if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
-			*utime += delta;
+			*utime += vtime->utime + delta;
 		else if (vtime->state == VTIME_SYS)
-			*stime += delta;
+			*stime += vtime->stime + delta;
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
-- 
cgit v1.3-14-g43fede


From a0c4acd2c220376b4e9690e75782d0c0afdaab9f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 16 Jun 2017 16:44:34 +0300
Subject: locking/rwsem-spinlock: Fix EINTR branch in __down_write_common()

If a writer could have been woken up, the above branch

	if (sem->count == 0)
		break;

would have moved us to taking the sem. So, it's
not the time to wake a writer now, and only readers
are allowed now. Thus, 0 must be passed to __rwsem_do_wake().

Next, __rwsem_do_wake() wakes readers unconditionally.
But we mustn't do that if the sem is owned by a writer
at the moment. Otherwise, a writer and a reader own the sem
at the same time, which leads to memory corruption in
callers.

rwsem-xadd.c does not need that, as:

  1) the similar check is made lockless there,
  2) in __rwsem_mark_wake::try_reader_grant we test,

that sem is not owned by writer.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@vger.kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Niklas Cassel <niklas.cassel@axis.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 17fcbd590d0c "locking/rwsem: Fix down_write_killable() for CONFIG_RWSEM_GENERIC_SPINLOCK=y"
Link: http://lkml.kernel.org/r/149762063282.19811.9129615532201147826.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-spinlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..20819df98125 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
 
 out_nolock:
 	list_del(&waiter.list);
-	if (!list_empty(&sem->wait_list))
-		__rwsem_do_wake(sem, 1);
+	if (!list_empty(&sem->wait_list) && sem->count >= 0)
+		__rwsem_do_wake(sem, 0);
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 
 	return -EINTR;
-- 
cgit v1.3-14-g43fede


From 69d71879d2cf67a381055f698a1d7def00dc4ed7 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 5 Jul 2017 09:45:43 -0400
Subject: ftrace: Test for NULL iter->tr in regex for stack_trace_filter
 changes

When writing into stack_trace_filter, iter->tr is not set and is NULL.
Check if it is NULL before dereferencing it in ftrace_regex_release().

Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f8c18f15b190..2953d558bbee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5043,7 +5043,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
 		if (filter_hash) {
 			orig_hash = &iter->ops->func_hash->filter_hash;
-			if (!list_empty(&iter->tr->mod_trace))
+			if (iter->tr && !list_empty(&iter->tr->mod_trace))
 				iter->hash->flags |= FTRACE_HASH_FL_MOD;
 		} else
 			orig_hash = &iter->ops->func_hash->notrace_hash;
-- 
cgit v1.3-14-g43fede


From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo <jhugo@codeaurora.org>
Date: Wed, 7 Jun 2017 13:18:57 -0600
Subject: sched/fair: Fix load_balance() affinity redo path

If load_balance() fails to migrate any tasks because all tasks were
affined, load_balance() removes the source CPU from consideration and
attempts to redo and balance among the new subset of CPUs.

There is a bug in this code path where the algorithm considers all active
CPUs in the system (minus the source that was just masked out).  This is
not valid for two reasons: some active CPUs may not be in the current
scheduling domain and one of the active CPUs is dst_cpu. These CPUs should
not be considered, as we cannot pull load from them.

Instead of failing out of load_balance(), we may end up redoing the search
with no valid CPUs and incorrectly concluding the domain is balanced.
Additionally, if the group_imbalance flag was just set, it may also be
incorrectly unset, thus the flag will not be seen by other CPUs in future
load_balance() runs as that algorithm intends.

Fix the check by removing CPUs not in the current domain and the dst_cpu
from consideration, thus limiting the evaluation to valid remaining CPUs
from which load might be migrated.

Co-authored-by: Austin Christ <austinwc@codeaurora.org>
Co-authored-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Tyler Baicar <tbaicar@codeaurora.org>
Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Austin Christ <austinwc@codeaurora.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Timur Tabi <timur@codeaurora.org>
Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..c95880e216f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		 * our sched_group. We may want to revisit it if we couldn't
 		 * meet load balance goals by pulling other tasks on src_cpu.
 		 *
-		 * Also avoid computing new_dst_cpu if we have already computed
-		 * one in current iteration.
+		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+		 * already computed one in current iteration.
 		 */
-		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
 			return 0;
 
 		/* Prevent to re-select dst_cpu via env's cpus */
@@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
-	/*
-	 * For NEWLY_IDLE load_balancing, we don't need to consider
-	 * other cpus in our group
-	 */
-	if (idle == CPU_NEWLY_IDLE)
-		env.dst_grpmask = NULL;
-
-	cpumask_copy(cpus, cpu_active_mask);
+	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
 
 	schedstat_inc(sd->lb_count[idle]);
 
@@ -8151,7 +8144,15 @@ more_balance:
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
-			if (!cpumask_empty(cpus)) {
+			/*
+			 * Attempting to continue load balancing at the current
+			 * sched_domain level only makes sense if there are
+			 * active CPUs remaining as possible busiest CPUs to
+			 * pull load from which are not contained within the
+			 * destination group that is receiving any migrated
+			 * load.
+			 */
+			if (!cpumask_subset(cpus, env.dst_grpmask)) {
 				env.loop = 0;
 				env.loop_break = sched_nr_migrate_break;
 				goto redo;
@@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data)
 			.src_cpu	= busiest_rq->cpu,
 			.src_rq		= busiest_rq,
 			.idle		= CPU_IDLE,
+			/*
+			 * can_migrate_task() doesn't need to compute new_dst_cpu
+			 * for active balancing. Since we have CPU_IDLE, but no
+			 * @dst_grpmask we need to make that test go away with lying
+			 * about DST_PINNED.
+			 */
+			.flags		= LBF_DST_PINNED,
 		};
 
 		schedstat_inc(sd->alb_count);
-- 
cgit v1.3-14-g43fede


From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Jul 2017 16:24:49 +0100
Subject: bpf: Implement show_options

Implement the show_options superblock op for bpf as part of a bid to get
rid of s_options and generic_show_options() to make it easier to implement
a context-based mount where the mount options can be passed individually
over a file descriptor.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Alexei Starovoitov <ast@kernel.org>
cc: Daniel Borkmann <daniel@iogearbox.net>
cc: netdev@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/inode.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
 		bpf_any_put(inode->i_private, type);
 }
 
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int bpf_show_options(struct seq_file *m, struct dentry *root)
+{
+	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
+	if (mode != S_IRWXUGO)
+		seq_printf(m, ",mode=%o", mode);
+	return 0;
+}
+
 static const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
-	.show_options	= generic_show_options,
+	.show_options	= bpf_show_options,
 	.evict_inode	= bpf_evict_inode,
 };
 
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	int ret;
 
-	save_mount_options(sb, data);
-
 	ret = bpf_parse_options(data, &opts);
 	if (ret)
 		return ret;
-- 
cgit v1.3-14-g43fede


From 9cd4f1a4e7a858849e889a081a99adff83e08e4c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 4 Jul 2017 22:20:23 +0200
Subject: smp/hotplug: Move unparking of percpu threads to the control CPU

Vikram reported the following backtrace:

   BUG: scheduling while atomic: swapper/7/0/0x00000002
   CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.32-perf+ #680
   schedule
   schedule_hrtimeout_range_clock
   schedule_hrtimeout
   wait_task_inactive
   __kthread_bind_mask
   __kthread_bind
   __kthread_unpark
   kthread_unpark
   cpuhp_online_idle
   cpu_startup_entry
   secondary_start_kernel

He analyzed correctly that a parked cpu hotplug thread of an offlined CPU
was still on the runqueue when the CPU came back online and tried to unpark
it. This causes the thread which invoked kthread_unpark() to call
wait_task_inactive() and subsequently schedule() with preemption disabled.
His proposed workaround was to "make sure" that a parked thread has
scheduled out when the CPU goes offline, so the situation cannot happen.

But that's still wrong because the root cause is not the fact that the
percpu thread is still on the runqueue and neither that preemption is
disabled, which could be simply solved by enabling preemption before
calling kthread_unpark().

The real issue is that the calling thread is the idle task of the upcoming
CPU, which is not supposed to call anything which might sleep.  The moron,
who wrote that code, missed completely that kthread_unpark() might end up
in schedule().

The solution is simpler than expected. The thread which controls the
hotplug operation is waiting for the CPU to call complete() on the hotplug
state completion. So the idle task of the upcoming CPU can set its state to
CPUHP_AP_ONLINE_IDLE and invoke complete(). This in turn wakes the control
task on a different CPU, which then can safely do the unpark and kick the
now unparked hotplug thread of the upcoming CPU to complete the bringup to
the final target state.

Control CPU                     AP

bringup_cpu();
  __cpu_up()  ------------>
				bringup_ap();
  bringup_wait_for_ap()
    wait_for_completion();
                                cpuhp_online_idle();
                <------------    complete();
    unpark(AP->stopper);
    unpark(AP->hotplugthread);
                                while(1)
                                  do_idle();
    kick(AP->hotplugthread);
    wait_for_completion();	hotplug_thread()
				  run_online_callbacks();
				  complete();

Fixes: 8df3e07e7f21 ("cpu/hotplug: Let upcoming cpu bring itself fully up")
Reported-by: Vikram Mulukutla <markivx@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Sewior <bigeasy@linutronix.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707042218020.2131@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index b03a32595cfe..ab860453841d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -271,11 +271,25 @@ void cpu_hotplug_enable(void)
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+
 static int bringup_wait_for_ap(unsigned int cpu)
 {
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 
+	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
 	wait_for_completion(&st->done);
+	BUG_ON(!cpu_online(cpu));
+
+	/* Unpark the stopper thread and the hotplug thread of the target cpu */
+	stop_machine_unpark(cpu);
+	kthread_unpark(st->thread);
+
+	/* Should we go further up ? */
+	if (st->target > CPUHP_AP_ONLINE_IDLE) {
+		__cpuhp_kick_ap_work(st);
+		wait_for_completion(&st->done);
+	}
 	return st->result;
 }
 
@@ -296,9 +310,7 @@ static int bringup_cpu(unsigned int cpu)
 	irq_unlock_sparse();
 	if (ret)
 		return ret;
-	ret = bringup_wait_for_ap(cpu);
-	BUG_ON(!cpu_online(cpu));
-	return ret;
+	return bringup_wait_for_ap(cpu);
 }
 
 /*
@@ -767,31 +779,20 @@ void notify_cpu_starting(unsigned int cpu)
 }
 
 /*
- * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread and unpark the smpboot threads. If the target state is
- * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
- * cpu further.
+ * Called from the idle task. Wake up the controlling task which brings the
+ * stopper and the hotplug thread of the upcoming CPU up and then delegates
+ * the rest of the online bringup to the hotplug thread.
  */
 void cpuhp_online_idle(enum cpuhp_state state)
 {
 	struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
-	unsigned int cpu = smp_processor_id();
 
 	/* Happens for the boot cpu */
 	if (state != CPUHP_AP_ONLINE_IDLE)
 		return;
 
 	st->state = CPUHP_AP_ONLINE_IDLE;
-
-	/* Unpark the stopper thread and the hotplug thread of this cpu */
-	stop_machine_unpark(cpu);
-	kthread_unpark(st->thread);
-
-	/* Should we go further up ? */
-	if (st->target > CPUHP_AP_ONLINE_IDLE)
-		__cpuhp_kick_ap_work(st);
-	else
-		complete(&st->done);
+	complete(&st->done);
 }
 
 /* Requires cpu_add_remove_lock to be held */
-- 
cgit v1.3-14-g43fede


From 99c621d704cf1c4eb74c3c42e674edf3df64f92d Mon Sep 17 00:00:00 2001
From: Michael Sartain <mikesart@fastmail.com>
Date: Wed, 5 Jul 2017 22:07:15 -0600
Subject: tracing: Add saved_tgids file to show cached pid to tgid mappings

Export the cached pid / tgid mappings in debugfs tracing saved_tgids file.
This allows user apps to translate the pids from a trace to their respective
thread group.

Example saved_tgids file with pid / tgid values separated by ' ':

  # cat saved_tgids
  1048 1048
  1047 1047
  7 7
  1049 1047
  1054 1047
  1053 1047

Link: http://lkml.kernel.org/r/20170630004023.064965233@goodmis.org
Link: http://lkml.kernel.org/r/20170706040713.unwkumbta5menygi@mikesart-cos

Reviewed-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Michael Sartain <mikesart@fastmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 00e2e4169b1e..f079a8ca1117 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4688,6 +4688,76 @@ static const struct file_operations tracing_readme_fops = {
 	.llseek		= generic_file_llseek,
 };
 
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	int *ptr = v;
+
+	if (*pos || m->count)
+		ptr++;
+
+	(*pos)++;
+
+	for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
+		if (trace_find_tgid(*ptr))
+			return ptr;
+	}
+
+	return NULL;
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+	void *v;
+	loff_t l = 0;
+
+	if (!tgid_map)
+		return NULL;
+
+	v = &tgid_map[0];
+	while (l <= *pos) {
+		v = saved_tgids_next(m, v, &l);
+		if (!v)
+			return NULL;
+	}
+
+	return v;
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+	int pid = (int *)v - tgid_map;
+
+	seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+	return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+	.start		= saved_tgids_start,
+	.stop		= saved_tgids_stop,
+	.next		= saved_tgids_next,
+	.show		= saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+	if (tracing_disabled)
+		return -ENODEV;
+
+	return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+static const struct file_operations tracing_saved_tgids_fops = {
+	.open		= tracing_saved_tgids_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	unsigned int *ptr = v;
@@ -7920,6 +7990,9 @@ static __init int tracer_init_tracefs(void)
 	trace_create_file("saved_cmdlines_size", 0644, d_tracer,
 			  NULL, &tracing_saved_cmdlines_size_fops);
 
+	trace_create_file("saved_tgids", 0444, d_tracer,
+			NULL, &tracing_saved_tgids_fops);
+
 	trace_eval_init();
 
 	trace_create_eval_file(d_tracer);
-- 
cgit v1.3-14-g43fede


From c80081b9209713e0fe86d3def395a9fc66503c58 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Thu, 6 Jul 2017 14:29:04 +0200
Subject: genirq: Allow to pass the IRQF_TIMER flag with percpu irq request

The irq timings infrastructure tracks when interrupts occur in order to
statistically predict the next interrupt event.

There is no point to track timer interrupts and try to predict them because
the next expiration time is already known. This can be avoided via the
IRQF_TIMER flag which is passed by timer drivers in request_irq(). It marks
the interrupt as timer based, which allows these interrupts to be ignored in the
timings code.

Per CPU interrupts which are requested via request_percpu_irq() have no
flag argument, so marking per cpu timer interrupts is not possible and they
get tracked pointlessly.

Add __request_percpu_irq() as a variant of request_percpu_irq() with a
flags argument and make request_percpu_irq() an inline wrapper passing
flags = 0.

The flag parameter is restricted to IRQF_TIMER as all other IRQF_ flags
make no sense for per cpu interrupts.

The next step is to convert all existing users of request_percpu_irq() and
then remove the wrapper and the underscores.

[ tglx: Massaged changelog ]

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: peterz@infradead.org
Cc: nicolas.pitre@linaro.org
Cc: vincent.guittot@linaro.org
Cc: rafael@kernel.org
Link: http://lkml.kernel.org/r/1499344144-3964-1-git-send-email-daniel.lezcano@linaro.org
---
 include/linux/interrupt.h | 11 ++++++++++-
 kernel/irq/manage.c       | 15 ++++++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 37f8e354f564..5ac6e238555e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -152,8 +152,17 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler,
 			unsigned long flags, const char *name, void *dev_id);
 
 extern int __must_check
+__request_percpu_irq(unsigned int irq, irq_handler_t handler,
+		     unsigned long flags, const char *devname,
+		     void __percpu *percpu_dev_id);
+
+static inline int __must_check
 request_percpu_irq(unsigned int irq, irq_handler_t handler,
-		   const char *devname, void __percpu *percpu_dev_id);
+		   const char *devname, void __percpu *percpu_dev_id)
+{
+	return __request_percpu_irq(irq, handler, 0,
+				    devname, percpu_dev_id);
+}
 
 extern const void *free_irq(unsigned int, void *);
 extern void free_percpu_irq(unsigned int, void __percpu *);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 91e1f2390752..5624b2dd6b58 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1950,9 +1950,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
 }
 
 /**
- *	request_percpu_irq - allocate a percpu interrupt line
+ *	__request_percpu_irq - allocate a percpu interrupt line
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs.
+ *	@flags: Interrupt type flags (IRQF_TIMER only)
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A percpu cookie passed back to the handler function
  *
@@ -1965,8 +1966,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
  *	the handler gets called with the interrupted CPU's instance of
  *	that variable.
  */
-int request_percpu_irq(unsigned int irq, irq_handler_t handler,
-		       const char *devname, void __percpu *dev_id)
+int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
+			 unsigned long flags, const char *devname,
+			 void __percpu *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -1980,12 +1982,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	    !irq_settings_is_per_cpu_devid(desc))
 		return -EINVAL;
 
+	if (flags && flags != IRQF_TIMER)
+		return -EINVAL;
+
 	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
 		return -ENOMEM;
 
 	action->handler = handler;
-	action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
+	action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
 	action->name = devname;
 	action->percpu_dev_id = dev_id;
 
@@ -2004,7 +2009,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 
 	return retval;
 }
-EXPORT_SYMBOL_GPL(request_percpu_irq);
+EXPORT_SYMBOL_GPL(__request_percpu_irq);
 
 /**
  *	irq_get_irqchip_state - returns the irqchip state of a interrupt.
-- 
cgit v1.3-14-g43fede


From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001
From: Marcin Nowakowski <marcin.nowakowski@imgtec.com>
Date: Thu, 6 Jul 2017 15:35:31 -0700
Subject: kernel/extable.c: mark core_kernel_text notrace

core_kernel_text is used by MIPS in its function graph trace processing,
so having this method traced leads to an infinite set of recursive calls
such as:

  Call Trace:
     ftrace_return_to_handler+0x50/0x128
     core_kernel_text+0x10/0x1b8
     prepare_ftrace_return+0x6c/0x114
     ftrace_graph_caller+0x20/0x44
     return_to_handler+0x10/0x30
     return_to_handler+0x0/0x30
     return_to_handler+0x0/0x30
     ftrace_ops_no_ops+0x114/0x1bc
     core_kernel_text+0x10/0x1b8
     core_kernel_text+0x10/0x1b8
     core_kernel_text+0x10/0x1b8
     ftrace_ops_no_ops+0x114/0x1bc
     core_kernel_text+0x10/0x1b8
     prepare_ftrace_return+0x6c/0x114
     ftrace_graph_caller+0x20/0x44
     (...)

Mark the function notrace to avoid it being traced.

Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com
Signed-off-by: Marcin Nowakowski <marcin.nowakowski@imgtec.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Meyer <thomas@m3y3r.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/extable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 0fbdd8582f08..223df4a328a4 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)
 	return 0;
 }
 
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
 	    addr < (unsigned long)_etext)
-- 
cgit v1.3-14-g43fede


From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 6 Jul 2017 15:35:55 -0700
Subject: kernel/power/snapshot.c: use linux/set_memory.h

This header always exists, so doesn't require an ifdef around its
inclusion.  When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm
header, otherwise it provides empty versions of the set_memory_xx()
routines.

Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Laura Abbott <labbott@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/snapshot.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7708e319941..222317721c5a 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,15 +30,13 @@
 #include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/ktime.h>
+#include <linux/set_memory.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-#include <asm/set_memory.h>
-#endif
 
 #include "power.h"
 
-- 
cgit v1.3-14-g43fede


From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 6 Jul 2017 15:35:58 -0700
Subject: kernel/module.c: use linux/set_memory.h

This header always exists, so doesn't require an ifdef around its
inclusion.  When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm
header, otherwise it provides empty versions of the set_memory_xx()
routines.

The usages of set_memory_xx() are still guarded by
CONFIG_STRICT_MODULE_RWX.

Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Laura Abbott <labbott@redhat.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index d7eb41d772c4..8f883d86cedc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
-#ifdef CONFIG_STRICT_MODULE_RWX
-#include <asm/set_memory.h>
-#endif
+#include <linux/set_memory.h>
 #include <asm/mmu_context.h>
 #include <linux/license.h>
 #include <asm/sections.h>
-- 
cgit v1.3-14-g43fede


From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 6 Jul 2017 15:38:11 -0700
Subject: mm, memory_hotplug: do not associate hotadded memory to zones until
 online

The current memory hotplug implementation relies on having all the
struct pages associated with a zone/node during the physical hotplug
phase (arch_add_memory->__add_pages->__add_section->__add_zone).  In the
vast majority of cases this means that they are added to ZONE_NORMAL.
This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory
hotadd without sparsemem") and it wasn't a big deal back then because
movable onlining didn't exist yet.

Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable
onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable
memory and portion memory") and then things got more complicated.
Rather than reconsidering the zone association which was no longer
needed (because the memory hotplug already depended on SPARSEMEM) a
convoluted semantic of zone shifting has been developed.  Only the
currently last memblock or the one adjacent to the zone_movable can be
onlined movable.  This essentially means that the online type changes as
the new memblocks are added.

Let's simulate memory hot online manually
  $ echo 0x100000000 > /sys/devices/system/memory/probe
  $ grep . /sys/devices/system/memory/memory32/valid_zones
  Normal Movable

  $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe
  $ grep . /sys/devices/system/memory/memory3?/valid_zones
  /sys/devices/system/memory/memory32/valid_zones:Normal
  /sys/devices/system/memory/memory33/valid_zones:Normal Movable

  $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe
  $ grep . /sys/devices/system/memory/memory3?/valid_zones
  /sys/devices/system/memory/memory32/valid_zones:Normal
  /sys/devices/system/memory/memory33/valid_zones:Normal
  /sys/devices/system/memory/memory34/valid_zones:Normal Movable

  $ echo online_movable > /sys/devices/system/memory/memory34/state
  $ grep . /sys/devices/system/memory/memory3?/valid_zones
  /sys/devices/system/memory/memory32/valid_zones:Normal
  /sys/devices/system/memory/memory33/valid_zones:Normal Movable
  /sys/devices/system/memory/memory34/valid_zones:Movable Normal

This is an awkward semantic because a udev event is sent as soon as the
block is onlined and a udev handler might want to online it based on
some policy (e.g.  association with a node) but it will inherently race
with new blocks showing up.

This patch changes the physical online phase to not associate pages with
any zone at all.  All the pages are just marked reserved and wait for
the onlining phase to be associated with the zone as per the online
request.  There are only two requirements

	- existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap

	- ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses

The latter is not an inherent requirement and can be changed in the
future.  It preserves the current behavior and makes the code slightly
simpler.

This means that the same physical online steps as above will lead to the
following state: Normal Movable

  /sys/devices/system/memory/memory32/valid_zones:Normal Movable
  /sys/devices/system/memory/memory33/valid_zones:Normal Movable

  /sys/devices/system/memory/memory32/valid_zones:Normal Movable
  /sys/devices/system/memory/memory33/valid_zones:Normal Movable
  /sys/devices/system/memory/memory34/valid_zones:Normal Movable

  /sys/devices/system/memory/memory32/valid_zones:Normal Movable
  /sys/devices/system/memory/memory33/valid_zones:Normal Movable
  /sys/devices/system/memory/memory34/valid_zones:Movable

Implementation:
The current move_pfn_range is reimplemented to check the above
requirements (allow_online_pfn_range) and then updates the respective
zone (move_pfn_range_to_zone), the pgdat and links all the pages in the
pfn range with the zone/node.  __add_pages is updated to not require the
zone and only initializes sections in the range.  This allows
simplifying the arch_add_memory code (s390 could get rid of quite a bit
of code).

devm_memremap_pages is the only user of arch_add_memory which relies on
the zone association because it hooks into the memory hotplug only
half way.  It uses it to associate the new memory with ZONE_DEVICE but
doesn't allow it to be {on,off}lined via sysfs.  This means that this
particular code path has to call move_pfn_range_to_zone explicitly.

The original zone shifting code is kept in place and will be removed in
the follow up patch for an easier review.

Please note that this patch also changes the original behavior whereby
offlining a memory block adjacent to another zone (Normal vs.  Movable)
used to allow changing its movable type.  This will be handled later.

[richard.weiyang@gmail.com: simplify zone_intersects()]
  Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com
[richard.weiyang@gmail.com: remove duplicate call for set_page_links]
  Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com
[akpm@linux-foundation.org: remove unused local `i']
Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> # For s390 bits
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Daniel Kiper <daniel.kiper@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Tobias Regnery <tobias.regnery@gmail.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c            |   9 +-
 arch/powerpc/mm/mem.c          |  10 +-
 arch/s390/mm/init.c            |  30 +-----
 arch/sh/mm/init.c              |   8 +-
 arch/x86/mm/init_32.c          |   5 +-
 arch/x86/mm/init_64.c          |   9 +-
 drivers/base/memory.c          |  52 ++++++-----
 include/linux/memory_hotplug.h |  13 +--
 include/linux/mmzone.h         |  16 ++++
 kernel/memremap.c              |   4 +
 mm/memory_hotplug.c            | 201 +++++++++++++++++++++++++----------------
 mm/sparse.c                    |   3 +-
 12 files changed, 185 insertions(+), 175 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 39e2aeb4669d..80db57d063d0 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -648,18 +648,11 @@ mem_init (void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	pg_data_t *pgdat;
-	struct zone *zone;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	pgdat = NODE_DATA(nid);
-
-	zone = pgdat->node_zones +
-		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
-	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
-
+	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
 	if (ret)
 		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
 		       __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index e6b2e6618b6c..72c46eb53215 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
 
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	struct pglist_data *pgdata;
-	struct zone *zone;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int rc;
 
 	resize_hpt_for_hotplug(memblock_phys_mem_size());
 
-	pgdata = NODE_DATA(nid);
-
 	start = (unsigned long)__va(start);
 	rc = create_section_mapping(start, start + size);
 	if (rc) {
@@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 		return -EFAULT;
 	}
 
-	/* this should work for most non-highmem platforms */
-	zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, 0, for_device);
-
-	return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+	return __add_pages(nid, start_pfn, nr_pages, !for_device);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index a3d549966b6a..bfa918e3592b 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	unsigned long zone_start_pfn, zone_end_pfn, nr_pages;
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long size_pages = PFN_DOWN(size);
-	pg_data_t *pgdat = NODE_DATA(nid);
-	struct zone *zone;
-	int rc, i;
+	int rc;
 
 	rc = vmem_add_mapping(start, size);
 	if (rc)
 		return rc;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zone = pgdat->node_zones + i;
-		if (zone_idx(zone) != ZONE_MOVABLE) {
-			/* Add range within existing zone limits, if possible */
-			zone_start_pfn = zone->zone_start_pfn;
-			zone_end_pfn = zone->zone_start_pfn +
-				       zone->spanned_pages;
-		} else {
-			/* Add remaining range to ZONE_MOVABLE */
-			zone_start_pfn = start_pfn;
-			zone_end_pfn = start_pfn + size_pages;
-		}
-		if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn)
-			continue;
-		nr_pages = (start_pfn + size_pages > zone_end_pfn) ?
-			   zone_end_pfn - start_pfn : size_pages;
-		rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
-		if (rc)
-			break;
-		start_pfn += nr_pages;
-		size_pages -= nr_pages;
-		if (!size_pages)
-			break;
-	}
+	rc = __add_pages(nid, start_pfn, size_pages, !for_device);
 	if (rc)
 		vmem_remove_mapping(start, size);
 	return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index a9d57f75ae8c..3813a610a2bb 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	pg_data_t *pgdat;
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	pgdat = NODE_DATA(nid);
-
 	/* We only have ZONE_NORMAL, so this is easy.. */
-	ret = __add_pages(nid, pgdat->node_zones +
-			zone_for_memory(nid, start, size, ZONE_NORMAL,
-			for_device),
-			start_pfn, nr_pages, !for_device);
+	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
 	if (unlikely(ret))
 		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 94594b889144..a424066d0552 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -825,13 +825,10 @@ void __init mem_init(void)
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	struct pglist_data *pgdata = NODE_DATA(nid);
-	struct zone *zone = pgdata->node_zones +
-		zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	return __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+	return __add_pages(nid, start_pfn, nr_pages, !for_device);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9d64291459b6..06afa84ac0a0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -772,22 +772,15 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
 	}
 }
 
-/*
- * Memory is added always to NORMAL zone. This means you will never get
- * additional DMA/DMA32 memory.
- */
 int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
-	struct pglist_data *pgdat = NODE_DATA(nid);
-	struct zone *zone = pgdat->node_zones +
-		zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
 	init_memory_mapping(start, start + size);
 
-	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);
+	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
 	WARN_ON_ONCE(ret);
 
 	/* update max_pfn, max_low_pfn and high_memory */
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 1e884d82af6f..b86fda30ce62 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct memory_block *mem = to_memory_block(dev);
-	unsigned long start_pfn, end_pfn;
-	unsigned long valid_start, valid_end, valid_pages;
+	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-	struct zone *zone;
-	int zone_shift = 0;
+	unsigned long valid_start_pfn, valid_end_pfn;
+	bool append = false;
+	int nid;
 
-	start_pfn = section_nr_to_pfn(mem->start_section_nr);
-	end_pfn = start_pfn + nr_pages;
-
-	/* The block contains more than one zone can not be offlined. */
-	if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
+	/*
+	 * The block contains more than one zone can not be offlined.
+	 * This can happen e.g. for ZONE_DMA and ZONE_DMA32
+	 */
+	if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn))
 		return sprintf(buf, "none\n");
 
-	zone = page_zone(pfn_to_page(valid_start));
-	valid_pages = valid_end - valid_start;
-
-	/* MMOP_ONLINE_KEEP */
-	sprintf(buf, "%s", zone->name);
+	start_pfn = valid_start_pfn;
+	nr_pages = valid_end_pfn - start_pfn;
 
-	/* MMOP_ONLINE_KERNEL */
-	zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift);
-	if (zone_shift) {
-		strcat(buf, " ");
-		strcat(buf, (zone + zone_shift)->name);
+	/*
+	 * Check the existing zone. Make sure that we do that only on the
+	 * online nodes otherwise the page_zone is not reliable
+	 */
+	if (mem->state == MEM_ONLINE) {
+		strcat(buf, page_zone(pfn_to_page(start_pfn))->name);
+		goto out;
 	}
 
-	/* MMOP_ONLINE_MOVABLE */
-	zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift);
-	if (zone_shift) {
-		strcat(buf, " ");
-		strcat(buf, (zone + zone_shift)->name);
+	nid = pfn_to_nid(start_pfn);
+	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) {
+		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name);
+		append = true;
 	}
 
+	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) {
+		if (append)
+			strcat(buf, " ");
+		strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name);
+	}
+out:
 	strcat(buf, "\n");
 
 	return strlen(buf);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a61aede1b391..8a07a49fd8dc 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-/* reasonably generic interface to expand the physical pages in a zone  */
-extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn,
+/* reasonably generic interface to expand the physical pages */
+extern int __add_pages(int nid, unsigned long start_pfn,
 	unsigned long nr_pages, bool want_memblock);
 
 #ifdef CONFIG_NUMA
@@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online);
 extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
 		bool for_device);
 extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
+extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+		unsigned long nr_pages);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern void remove_memory(int nid, u64 start, u64 size);
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn);
+extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 		unsigned long map_offset);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
-extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-			  enum zone_type target, int *zone_shift);
-
+extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
+		int online_type);
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf7e08c5a8..abc1641011f2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone)
 	return zone->spanned_pages == 0;
 }
 
+/*
+ * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
+ * intersection with the given zone
+ */
+static inline bool zone_intersects(struct zone *zone,
+		unsigned long start_pfn, unsigned long nr_pages)
+{
+	if (zone_is_empty(zone))
+		return false;
+	if (start_pfn >= zone_end_pfn(zone) ||
+	    start_pfn + nr_pages <= zone->zone_start_pfn)
+		return false;
+
+	return true;
+}
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..281eb478856a 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 
 	mem_hotplug_begin();
 	error = arch_add_memory(nid, align_start, align_size, true);
+	if (!error)
+		move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+					align_start >> PAGE_SHIFT,
+					align_size >> PAGE_SHIFT);
 	mem_hotplug_done();
 	if (error)
 		goto err_add_memory;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b2ebe9ad7f6c..9438ffe24cb2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -433,25 +433,6 @@ out_fail:
 	return -1;
 }
 
-static struct zone * __meminit move_pfn_range(int zone_shift,
-		unsigned long start_pfn, unsigned long end_pfn)
-{
-	struct zone *zone = page_zone(pfn_to_page(start_pfn));
-	int ret = 0;
-
-	if (zone_shift < 0)
-		ret = move_pfn_range_left(zone + zone_shift, zone,
-					  start_pfn, end_pfn);
-	else if (zone_shift)
-		ret = move_pfn_range_right(zone, zone + zone_shift,
-					   start_pfn, end_pfn);
-
-	if (ret)
-		return NULL;
-
-	return zone + zone_shift;
-}
-
 static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
 				      unsigned long end_pfn)
 {
@@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-static int __meminit __add_section(int nid, struct zone *zone,
-		unsigned long phys_start_pfn, bool want_memblock)
+static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
+		bool want_memblock)
 {
 	int ret;
+	int i;
 
 	if (pfn_valid(phys_start_pfn))
 		return -EEXIST;
 
-	ret = sparse_add_one_section(zone, phys_start_pfn);
-
+	ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
 	if (ret < 0)
 		return ret;
 
-	ret = __add_zone(zone, phys_start_pfn);
+	/*
+	 * Make all the pages reserved so that nobody will stumble over half
+	 * initialized state.
+	 * FIXME: We also have to associate it with a node because pfn_to_node
+	 * relies on having page with the proper node.
+	 */
+	for (i = 0; i < PAGES_PER_SECTION; i++) {
+		unsigned long pfn = phys_start_pfn + i;
+		struct page *page;
+		if (!pfn_valid(pfn))
+			continue;
 
-	if (ret < 0)
-		return ret;
+		page = pfn_to_page(pfn);
+		set_page_node(page, nid);
+		SetPageReserved(page);
+	}
 
 	if (!want_memblock)
 		return 0;
@@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone,
  * call this function after deciding the zone to which to
  * add the new pages.
  */
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 			unsigned long nr_pages, bool want_memblock)
 {
 	unsigned long i;
@@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
 	int start_sec, end_sec;
 	struct vmem_altmap *altmap;
 
-	clear_zone_contiguous(zone);
-
 	/* during initialize mem_map, align hot-added range to section */
 	start_sec = pfn_to_section_nr(phys_start_pfn);
 	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
 	}
 
 	for (i = start_sec; i <= end_sec; i++) {
-		err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock);
+		err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
 	}
 	vmemmap_populate_print_last();
 out:
-	set_zone_contiguous(zone);
 	return err;
 }
 EXPORT_SYMBOL_GPL(__add_pages);
@@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 	node_set_state(node, N_MEMORY);
 }
 
-bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-		   enum zone_type target, int *zone_shift)
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
 {
-	struct zone *zone = page_zone(pfn_to_page(pfn));
-	enum zone_type idx = zone_idx(zone);
-	int i;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+	struct zone *normal_zone =  &pgdat->node_zones[ZONE_NORMAL];
 
-	*zone_shift = 0;
+	/*
+	 * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+	 * physically before ZONE_MOVABLE. All we need is they do not
+	 * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+	 * though so let's stick with it for simplicity for now.
+	 * TODO make sure we do not overlap with ZONE_DEVICE
+	 */
+	if (online_type == MMOP_ONLINE_KERNEL) {
+		if (zone_is_empty(movable_zone))
+			return true;
+		return movable_zone->zone_start_pfn >= pfn + nr_pages;
+	} else if (online_type == MMOP_ONLINE_MOVABLE) {
+		return zone_end_pfn(normal_zone) <= pfn;
+	}
 
-	if (idx < target) {
-		/* pages must be at end of current zone */
-		if (pfn + nr_pages != zone_end_pfn(zone))
-			return false;
+	/* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+	return online_type == MMOP_ONLINE_KEEP;
+}
 
-		/* no zones in use between current zone and target */
-		for (i = idx + 1; i < target; i++)
-			if (zone_is_initialized(zone - idx + i))
-				return false;
-	}
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
+		unsigned long nr_pages)
+{
+	unsigned long old_end_pfn = zone_end_pfn(zone);
 
-	if (target < idx) {
-		/* pages must be at beginning of current zone */
-		if (pfn != zone->zone_start_pfn)
-			return false;
+	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+		zone->zone_start_pfn = start_pfn;
+
+	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
+
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+                                     unsigned long nr_pages)
+{
+	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
 
-		/* no zones in use between current zone and target */
-		for (i = target + 1; i < idx; i++)
-			if (zone_is_initialized(zone - idx + i))
-				return false;
+	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+		pgdat->node_start_pfn = start_pfn;
+
+	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+}
+
+void move_pfn_range_to_zone(struct zone *zone,
+		unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct pglist_data *pgdat = zone->zone_pgdat;
+	int nid = pgdat->node_id;
+	unsigned long flags;
+
+	if (zone_is_empty(zone))
+		init_currently_empty_zone(zone, start_pfn, nr_pages);
+
+	clear_zone_contiguous(zone);
+
+	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
+	pgdat_resize_lock(pgdat, &flags);
+	zone_span_writelock(zone);
+	resize_zone_range(zone, start_pfn, nr_pages);
+	zone_span_writeunlock(zone);
+	resize_pgdat_range(pgdat, start_pfn, nr_pages);
+	pgdat_resize_unlock(pgdat, &flags);
+
+	/*
+	 * TODO now we have a visible range of pages which are not associated
+	 * with their zone properly. Not nice but set_pfnblock_flags_mask
+	 * expects the zone spans the pfn range. All the pages in the range
+	 * are reserved so nobody should be touching them so we should be safe
+	 */
+	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+
+	set_zone_contiguous(zone);
+}
+
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+		unsigned long start_pfn, unsigned long nr_pages)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct zone *zone = &pgdat->node_zones[ZONE_NORMAL];
+
+	if (online_type == MMOP_ONLINE_KEEP) {
+		struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+		/*
+		 * MMOP_ONLINE_KEEP inherits the current zone which is
+		 * ZONE_NORMAL by default but we might be within ZONE_MOVABLE
+		 * already.
+		 */
+		if (zone_intersects(movable_zone, start_pfn, nr_pages))
+			zone = movable_zone;
+	} else if (online_type == MMOP_ONLINE_MOVABLE) {
+		zone = &pgdat->node_zones[ZONE_MOVABLE];
 	}
 
-	*zone_shift = target - idx;
-	return true;
+	move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+	return zone;
 }
 
 /* Must be protected by mem_hotplug_begin() */
@@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	int nid;
 	int ret;
 	struct memory_notify arg;
-	int zone_shift = 0;
 
-	/*
-	 * This doesn't need a lock to do pfn_to_page().
-	 * The section can't be removed here because of the
-	 * memory_block->state_mutex.
-	 */
-	zone = page_zone(pfn_to_page(pfn));
-
-	if ((zone_idx(zone) > ZONE_NORMAL ||
-	    online_type == MMOP_ONLINE_MOVABLE) &&
-	    !can_online_high_movable(pfn_to_nid(pfn)))
+	nid = pfn_to_nid(pfn);
+	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
 		return -EINVAL;
 
-	if (online_type == MMOP_ONLINE_KERNEL) {
-		if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
-			return -EINVAL;
-	} else if (online_type == MMOP_ONLINE_MOVABLE) {
-		if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
-			return -EINVAL;
-	}
-
-	zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
-	if (!zone)
+	if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid))
 		return -EINVAL;
 
+	/* associate pfn range with the zone */
+	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+
 	arg.start_pfn = pfn;
 	arg.nr_pages = nr_pages;
 	node_states_check_changes_online(nr_pages, zone, &arg);
 
-	nid = zone_to_nid(zone);
-
 	ret = memory_notify(MEM_GOING_ONLINE, &arg);
 	ret = notifier_to_errno(ret);
 	if (ret)
diff --git a/mm/sparse.c b/mm/sparse.c
index 9d7fd666015e..7b4be3fd5cac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap)
  * set.  If this is <=0, then that means that the passed-in
  * map was not consumed and must be freed.
  */
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn)
 {
 	unsigned long section_nr = pfn_to_section_nr(start_pfn);
-	struct pglist_data *pgdat = zone->zone_pgdat;
 	struct mem_section *ms;
 	struct page *memmap;
 	unsigned long *usemap;
-- 
cgit v1.3-14-g43fede


From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 6 Jul 2017 15:38:21 -0700
Subject: mm, memory_hotplug: replace for_device by want_memblock in
 arch_add_memory

arch_add_memory gets a for_device argument which then controls whether
we want to create memblocks for the created memory sections.  Simplify
the logic by telling whether we want memblocks directly rather than
going through a pointless negation.  This also makes the API easier to
understand because it is clear what we want, rather than the
uninformative for_device, which can mean anything.

This shouldn't introduce any functional change.

Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Daniel Kiper <daniel.kiper@oracle.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Tobias Regnery <tobias.regnery@gmail.com>
Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/mm/init.c            | 4 ++--
 arch/powerpc/mm/mem.c          | 4 ++--
 arch/s390/mm/init.c            | 4 ++--
 arch/sh/mm/init.c              | 4 ++--
 arch/x86/mm/init_32.c          | 4 ++--
 arch/x86/mm/init_64.c          | 4 ++--
 include/linux/memory_hotplug.h | 2 +-
 kernel/memremap.c              | 2 +-
 mm/memory_hotplug.c            | 2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 80db57d063d0..a4e8d6bd9cfa 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -646,13 +646,13 @@ mem_init (void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
+	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
 	if (ret)
 		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
 		       __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 72c46eb53215..de5a90e1ceaa 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
 	return -ENODEV;
 }
 
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 		return -EFAULT;
 	}
 
-	return __add_pages(nid, start_pfn, nr_pages, !for_device);
+	return __add_pages(nid, start_pfn, nr_pages, want_memblock);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index bfa918e3592b..8111694ce55a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long size_pages = PFN_DOWN(size);
@@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	if (rc)
 		return rc;
 
-	rc = __add_pages(nid, start_pfn, size_pages, !for_device);
+	rc = __add_pages(nid, start_pfn, size_pages, want_memblock);
 	if (rc)
 		vmem_remove_mapping(start, size);
 	return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 3813a610a2bb..bf726af5f1a5 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 #endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
 	/* We only have ZONE_NORMAL, so this is easy.. */
-	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
+	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
 	if (unlikely(ret))
 		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index a424066d0552..8a64a6f2848d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -823,12 +823,12 @@ void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	return __add_pages(nid, start_pfn, nr_pages, !for_device);
+	return __add_pages(nid, start_pfn, nr_pages, want_memblock);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 06afa84ac0a0..136422d7d539 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -772,7 +772,7 @@ static void  update_end_of_memory_vars(u64 start, u64 size)
 	}
 }
 
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 
 	init_memory_mapping(start, start + size);
 
-	ret = __add_pages(nid, start_pfn, nr_pages, !for_device);
+	ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
 	WARN_ON_ONCE(ret);
 
 	/* update max_pfn, max_low_pfn and high_memory */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4d65a2fcac15..780c806e17d3 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int add_memory_resource(int nid, struct resource *resource, bool online);
 extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
 		bool for_device);
-extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);
+extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 281eb478856a..124bed776532 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		goto err_pfn_remap;
 
 	mem_hotplug_begin();
-	error = arch_add_memory(nid, align_start, align_size, true);
+	error = arch_add_memory(nid, align_start, align_size, false);
 	if (!error)
 		move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
 					align_start >> PAGE_SHIFT,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4263fa6f2ab4..9b04cf5ea813 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
 	}
 
 	/* call arch's memory hotadd */
-	ret = arch_add_memory(nid, start, size, false);
+	ret = arch_add_memory(nid, start, size, true);
 
 	if (ret < 0)
 		goto error;
-- 
cgit v1.3-14-g43fede


From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Thu, 6 Jul 2017 15:38:32 -0700
Subject: kernel/exit.c: don't include unused userfaultfd_k.h

Commit dd0db88d8094 ("userfaultfd: non-cooperative: rollback
userfaultfd_exit") removed userfaultfd callback from exit() which makes
the include of <linux/userfaultfd_k.h> unnecessary.

Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index b0cc86a2d00b..2bbc23273e2f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
-- 
cgit v1.3-14-g43fede


From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Thu, 6 Jul 2017 15:39:11 -0700
Subject: mm: update callers to use HASH_ZERO flag

Update dcache, inode, pid, mountpoint, and mount hash tables to use
HASH_ZERO, and remove initialization after allocations.  In places
where HASH_EARLY was used, such as in __pv_init_lock_hash(), a zeroed
hash table was already assumed, because memblock zeroes the memory.

CPU: SPARC M6, Memory: 7T
Before fix:
  Dentry cache hash table entries: 1073741824
  Inode-cache hash table entries: 536870912
  Mount-cache hash table entries: 16777216
  Mountpoint-cache hash table entries: 16777216
  ftrace: allocating 20414 entries in 40 pages
  Total time: 11.798s

After fix:
  Dentry cache hash table entries: 1073741824
  Inode-cache hash table entries: 536870912
  Mount-cache hash table entries: 16777216
  Mountpoint-cache hash table entries: 16777216
  ftrace: allocating 20414 entries in 40 pages
  Total time: 3.198s

CPU: Intel Xeon E5-2630, Memory: 2.2T:
Before fix:
  Dentry cache hash table entries: 536870912
  Inode-cache hash table entries: 268435456
  Mount-cache hash table entries: 8388608
  Mountpoint-cache hash table entries: 8388608
  CPU: Physical Processor ID: 0
  Total time: 3.245s

After fix:
  Dentry cache hash table entries: 536870912
  Inode-cache hash table entries: 268435456
  Mount-cache hash table entries: 8388608
  Mountpoint-cache hash table entries: 8388608
  CPU: Physical Processor ID: 0
  Total time: 3.244s

Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com
Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Babu Moger <babu.moger@oracle.com>
Cc: David Miller <davem@davemloft.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c                         | 18 ++++--------------
 fs/inode.c                          | 14 ++------------
 fs/namespace.c                      | 10 ++--------
 kernel/locking/qspinlock_paravirt.h |  3 ++-
 kernel/pid.c                        |  7 ++-----
 5 files changed, 12 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/fs/dcache.c b/fs/dcache.c
index a9f995f6859e..a140fe1dbb1a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3546,8 +3546,6 @@ __setup("dhash_entries=", set_dhash_entries);
 
 static void __init dcache_init_early(void)
 {
-	unsigned int loop;
-
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
 	 */
@@ -3559,24 +3557,19 @@ static void __init dcache_init_early(void)
 					sizeof(struct hlist_bl_head),
 					dhash_entries,
 					13,
-					HASH_EARLY,
+					HASH_EARLY | HASH_ZERO,
 					&d_hash_shift,
 					&d_hash_mask,
 					0,
 					0);
-
-	for (loop = 0; loop < (1U << d_hash_shift); loop++)
-		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
 static void __init dcache_init(void)
 {
-	unsigned int loop;
-
-	/* 
+	/*
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
-	 * of the dcache. 
+	 * of the dcache.
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
 		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
@@ -3590,14 +3583,11 @@ static void __init dcache_init(void)
 					sizeof(struct hlist_bl_head),
 					dhash_entries,
 					13,
-					0,
+					HASH_ZERO,
 					&d_hash_shift,
 					&d_hash_mask,
 					0,
 					0);
-
-	for (loop = 0; loop < (1U << d_hash_shift); loop++)
-		INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
 }
 
 /* SLAB cache for __getname() consumers */
diff --git a/fs/inode.c b/fs/inode.c
index ab3b9a795c0b..5cbc8e6e9390 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1915,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries);
  */
 void __init inode_init_early(void)
 {
-	unsigned int loop;
-
 	/* If hashes are distributed across NUMA nodes, defer
 	 * hash allocation until vmalloc space is available.
 	 */
@@ -1928,20 +1926,15 @@ void __init inode_init_early(void)
 					sizeof(struct hlist_head),
 					ihash_entries,
 					14,
-					HASH_EARLY,
+					HASH_EARLY | HASH_ZERO,
 					&i_hash_shift,
 					&i_hash_mask,
 					0,
 					0);
-
-	for (loop = 0; loop < (1U << i_hash_shift); loop++)
-		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
 void __init inode_init(void)
 {
-	unsigned int loop;
-
 	/* inode slab cache */
 	inode_cachep = kmem_cache_create("inode_cache",
 					 sizeof(struct inode),
@@ -1959,14 +1952,11 @@ void __init inode_init(void)
 					sizeof(struct hlist_head),
 					ihash_entries,
 					14,
-					0,
+					HASH_ZERO,
 					&i_hash_shift,
 					&i_hash_mask,
 					0,
 					0);
-
-	for (loop = 0; loop < (1U << i_hash_shift); loop++)
-		INIT_HLIST_HEAD(&inode_hashtable[loop]);
 }
 
 void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
diff --git a/fs/namespace.c b/fs/namespace.c
index f70914a859a4..81f934b5d571 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void)
 
 void __init mnt_init(void)
 {
-	unsigned u;
 	int err;
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
@@ -3248,22 +3247,17 @@ void __init mnt_init(void)
 	mount_hashtable = alloc_large_system_hash("Mount-cache",
 				sizeof(struct hlist_head),
 				mhash_entries, 19,
-				0,
+				HASH_ZERO,
 				&m_hash_shift, &m_hash_mask, 0, 0);
 	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
 				sizeof(struct hlist_head),
 				mphash_entries, 19,
-				0,
+				HASH_ZERO,
 				&mp_hash_shift, &mp_hash_mask, 0, 0);
 
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
-	for (u = 0; u <= m_hash_mask; u++)
-		INIT_HLIST_HEAD(&mount_hashtable[u]);
-	for (u = 0; u <= mp_hash_mask; u++)
-		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
-
 	kernfs_init();
 
 	err = sysfs_init();
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
 	 */
 	pv_lock_hash = alloc_large_system_hash("PV qspinlock",
 					       sizeof(struct pv_hash_entry),
-					       pv_hash_size, 0, HASH_EARLY,
+					       pv_hash_size, 0,
+					       HASH_EARLY | HASH_ZERO,
 					       &pv_lock_hash_bits, NULL,
 					       pv_hash_size, pv_hash_size);
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..731c4e528f4e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
  */
 void __init pidhash_init(void)
 {
-	unsigned int i, pidhash_size;
+	unsigned int pidhash_size;
 
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL,
+					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
 					   &pidhash_shift, NULL,
 					   0, 4096);
 	pidhash_size = 1U << pidhash_shift;
-
-	for (i = 0; i < pidhash_size; i++)
-		INIT_HLIST_HEAD(&pid_hash[i]);
 }
 
 void __init pidmap_init(void)
-- 
cgit v1.3-14-g43fede


From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 6 Jul 2017 15:40:06 -0700
Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets

Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") has introduced a two-step protocol when
rebinding task's mempolicy due to cpuset update, in order to avoid a
parallel allocation seeing an empty effective nodemask and failing.

Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory
barrier related damage v3") introduced a seqlock protection and removed
the synchronization point between the two update steps.  At that point
(or perhaps later), the two-step rebinding became unnecessary.

Currently it only makes sure that the update first adds new nodes in
step 1 and then removes nodes in step 2.  Without memory barriers the
effects are questionable, and even then this cannot prevent a parallel
zonelist iteration checking the nodemask at each step to observe all
nodes as unusable for allocation.  We now fully rely on the seqlock to
prevent premature OOMs and allocation failures.

We can thus remove the two-step update parts and simplify the code.

Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h      |   6 +--
 include/uapi/linux/mempolicy.h |   8 ----
 kernel/cgroup/cpuset.c         |   4 +-
 mm/mempolicy.c                 | 102 ++++++++---------------------------------
 4 files changed, 21 insertions(+), 99 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index ecb6cbeede5a..3a58b4be1b0c 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
-				enum mpol_rebind_step step);
+extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 
 extern int huge_node(struct vm_area_struct *vma,
@@ -260,8 +259,7 @@ static inline void numa_default_policy(void)
 }
 
 static inline void mpol_rebind_task(struct task_struct *tsk,
-				const nodemask_t *new,
-				enum mpol_rebind_step step)
+				const nodemask_t *new)
 {
 }
 
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 9cd8b21dddbe..2a4d89508fec 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -24,13 +24,6 @@ enum {
 	MPOL_MAX,	/* always last member of enum */
 };
 
-enum mpol_rebind_step {
-	MPOL_REBIND_ONCE,	/* do rebind work at once(not by two step) */
-	MPOL_REBIND_STEP1,	/* first step(set all the newly nodes) */
-	MPOL_REBIND_STEP2,	/* second step(clean all the disallowed nodes)*/
-	MPOL_REBIND_NSTEP,
-};
-
 /* Flags for set_mempolicy */
 #define MPOL_F_STATIC_NODES	(1 << 15)
 #define MPOL_F_RELATIVE_NODES	(1 << 14)
@@ -65,7 +58,6 @@ enum mpol_rebind_step {
  */
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
-#define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
 #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
 #define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
 
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..5fd1bdbaa381 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 	}
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
-	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+	mpol_rebind_task(tsk, newmems);
 	tsk->mems_allowed = *newmems;
 
 	if (need_loop) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c60807625fd5..047181452040 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p)
 
 static const struct mempolicy_operations {
 	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
-	/*
-	 * If read-side task has no lock to protect task->mempolicy, write-side
-	 * task will rebind the task->mempolicy by two step. The first step is
-	 * setting all the newly nodes, and the second step is cleaning all the
-	 * disallowed nodes. In this way, we can avoid finding no node to alloc
-	 * page.
-	 * If we have a lock to protect task->mempolicy in read-side, we do
-	 * rebind directly.
-	 *
-	 * step:
-	 * 	MPOL_REBIND_ONCE - do rebind work at once
-	 * 	MPOL_REBIND_STEP1 - set all the newly nodes
-	 * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
-	 */
-	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
-			enum mpol_rebind_step step);
+	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 } mpol_ops[MPOL_MAX];
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p)
 	kmem_cache_free(policy_cache, p);
 }
 
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
-				enum mpol_rebind_step step)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 {
 }
 
-/*
- * step:
- * 	MPOL_REBIND_ONCE  - do rebind work at once
- * 	MPOL_REBIND_STEP1 - set all the newly nodes
- * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
- */
-static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
-				 enum mpol_rebind_step step)
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 {
 	nodemask_t tmp;
 
@@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 	else if (pol->flags & MPOL_F_RELATIVE_NODES)
 		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 	else {
-		/*
-		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
-		 * result
-		 */
-		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
-			nodes_remap(tmp, pol->v.nodes,
-					pol->w.cpuset_mems_allowed, *nodes);
-			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
-		} else if (step == MPOL_REBIND_STEP2) {
-			tmp = pol->w.cpuset_mems_allowed;
-			pol->w.cpuset_mems_allowed = *nodes;
-		} else
-			BUG();
+		nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
+								*nodes);
+		pol->w.cpuset_mems_allowed = tmp;
 	}
 
 	if (nodes_empty(tmp))
 		tmp = *nodes;
 
-	if (step == MPOL_REBIND_STEP1)
-		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
-	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
-		pol->v.nodes = tmp;
-	else
-		BUG();
+	pol->v.nodes = tmp;
 }
 
 static void mpol_rebind_preferred(struct mempolicy *pol,
-				  const nodemask_t *nodes,
-				  enum mpol_rebind_step step)
+						const nodemask_t *nodes)
 {
 	nodemask_t tmp;
 
@@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
 /*
  * mpol_rebind_policy - Migrate a policy to a different set of nodes
  *
- * If read-side task has no lock to protect task->mempolicy, write-side
- * task will rebind the task->mempolicy by two step. The first step is
- * setting all the newly nodes, and the second step is cleaning all the
- * disallowed nodes. In this way, we can avoid finding no node to alloc
- * page.
- * If we have a lock to protect task->mempolicy in read-side, we do
- * rebind directly.
- *
- * step:
- * 	MPOL_REBIND_ONCE  - do rebind work at once
- * 	MPOL_REBIND_STEP1 - set all the newly nodes
- * 	MPOL_REBIND_STEP2 - clean all the disallowed nodes
+ * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * policies are protected by task->mems_allowed_seq to prevent a premature
+ * OOM/allocation failure due to parallel nodemask modification.
  */
-static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
-				enum mpol_rebind_step step)
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 {
 	if (!pol)
 		return;
-	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
+	if (!mpol_store_user_nodemask(pol) &&
 	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 		return;
 
-	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
-		return;
-
-	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
-		BUG();
-
-	if (step == MPOL_REBIND_STEP1)
-		pol->flags |= MPOL_F_REBINDING;
-	else if (step == MPOL_REBIND_STEP2)
-		pol->flags &= ~MPOL_F_REBINDING;
-	else if (step >= MPOL_REBIND_NSTEP)
-		BUG();
-
-	mpol_ops[pol->mode].rebind(pol, newmask, step);
+	mpol_ops[pol->mode].rebind(pol, newmask);
 }
 
 /*
@@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
  * Called with task's alloc_lock held.
  */
 
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
-			enum mpol_rebind_step step)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 {
-	mpol_rebind_policy(tsk->mempolicy, new, step);
+	mpol_rebind_policy(tsk->mempolicy, new);
 }
 
 /*
@@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 
 	down_write(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next)
-		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
+		mpol_rebind_policy(vma->vm_policy, new);
 	up_write(&mm->mmap_sem);
 }
 
@@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
-		if (new->flags & MPOL_F_REBINDING)
-			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
-		else
-			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
+		mpol_rebind_policy(new, &mems);
 	}
 	atomic_set(&new->refcnt, 1);
 	return new;
-- 
cgit v1.3-14-g43fede


From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 6 Jul 2017 15:40:09 -0700
Subject: mm, cpuset: always use seqlock when changing task's nodemask

When updating task's mems_allowed and rebinding its mempolicy due to
cpuset's mems being changed, we currently only take the seqlock for
writing when either the task has a mempolicy, or the new mems has no
intersection with the old mems.

This should be enough to prevent a parallel allocation seeing no
available nodes, but the optimization is IMHO unnecessary (cpuset
updates should not be frequent), and we still potentially risk issues if
the intersection of new and old nodes has a limited amount of
free/reclaimable memory.

Let's just use the seqlock for all tasks.

Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup/cpuset.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5fd1bdbaa381..ca8376e5008c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void)
  * @tsk: the task to change
  * @newmems: new nodes that the task will be set
  *
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind an eventual tasks' mempolicy. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
  */
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
-	bool need_loop;
-
 	task_lock(tsk);
-	/*
-	 * Determine if a loop is necessary if another thread is doing
-	 * read_mems_allowed_begin().  If at least one node remains unchanged and
-	 * tsk does not have a mempolicy, then an empty nodemask will not be
-	 * possible when mems_allowed is larger than a word.
-	 */
-	need_loop = task_has_mempolicy(tsk) ||
-			!nodes_intersects(*newmems, tsk->mems_allowed);
 
-	if (need_loop) {
-		local_irq_disable();
-		write_seqcount_begin(&tsk->mems_allowed_seq);
-	}
+	local_irq_disable();
+	write_seqcount_begin(&tsk->mems_allowed_seq);
 
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
 	mpol_rebind_task(tsk, newmems);
 	tsk->mems_allowed = *newmems;
 
-	if (need_loop) {
-		write_seqcount_end(&tsk->mems_allowed_seq);
-		local_irq_enable();
-	}
+	write_seqcount_end(&tsk->mems_allowed_seq);
+	local_irq_enable();
 
 	task_unlock(tsk);
 }
-- 
cgit v1.3-14-g43fede


From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 6 Jul 2017 15:40:49 -0700
Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages

The kmem-specific functions do the same thing.  Switch and drop.

Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 17 -----------------
 kernel/fork.c              |  8 ++++----
 mm/slab.h                  | 16 ++++++++--------
 3 files changed, 12 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fa506ae61d66..5a72d8377942 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -929,19 +929,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
-/**
- * memcg_kmem_update_page_stat - update kmem page state statistics
- * @page: the page
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- */
-static inline void memcg_kmem_update_page_stat(struct page *page,
-				enum memcg_stat_item idx, int val)
-{
-	if (memcg_kmem_enabled() && page->mem_cgroup)
-		this_cpu_add(page->mem_cgroup->stat->count[idx], val);
-}
-
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
@@ -964,10 +951,6 @@ static inline void memcg_put_cache_ids(void)
 {
 }
 
-static inline void memcg_kmem_update_page_stat(struct page *page,
-				enum memcg_stat_item idx, int val)
-{
-}
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..aa01b810c0bd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 		}
 
 		/* All stack pages belong to the same memcg. */
-		memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
-					    account * (THREAD_SIZE / 1024));
+		mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+				     account * (THREAD_SIZE / 1024));
 	} else {
 		/*
 		 * All stack pages are in the same zone and belong to the
@@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
 		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 				    THREAD_SIZE / 1024 * account);
 
-		memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
-					    account * (THREAD_SIZE / 1024));
+		mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
+				     account * (THREAD_SIZE / 1024));
 	}
 }
 
diff --git a/mm/slab.h b/mm/slab.h
index 69f0579cb5aa..7b84e3839dfe 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -285,10 +285,10 @@ static __always_inline int memcg_charge_slab(struct page *page,
 	if (ret)
 		return ret;
 
-	memcg_kmem_update_page_stat(page,
-			(s->flags & SLAB_RECLAIM_ACCOUNT) ?
-			NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-			1 << order);
+	mod_memcg_page_state(page,
+			     (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+			     NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+			     1 << order);
 	return 0;
 }
 
@@ -298,10 +298,10 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
 	if (!memcg_kmem_enabled())
 		return;
 
-	memcg_kmem_update_page_stat(page,
-			(s->flags & SLAB_RECLAIM_ACCOUNT) ?
-			NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-			-(1 << order));
+	mod_memcg_page_state(page,
+			     (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+			     NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+			     -(1 << order));
 	memcg_kmem_uncharge(page, order);
 }
 
-- 
cgit v1.3-14-g43fede


From f610c9d68b1a47f539b7764f4b5ce07d32fb9ae1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 7 Jul 2017 08:57:57 +0200
Subject: genirq/debugfs: Remove redundant NULL pointer check

debugfs_remove() can be called with a NULL pointer.

Fixes: 087cdfb662ae5 ("genirq/debugfs: Add proper debugfs interface")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 14fe862aa2e3..ed47688b8e79 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1667,8 +1667,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
 
 static void debugfs_remove_domain_dir(struct irq_domain *d)
 {
-	if (d->debugfs_file)
-		debugfs_remove(d->debugfs_file);
+	debugfs_remove(d->debugfs_file);
 }
 
 void __init irq_domain_debugfs_init(struct dentry *root)
-- 
cgit v1.3-14-g43fede


From c5c601c4295f89368f4a304cb3ae4aebdf80db22 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Fri, 7 Jul 2017 09:39:59 +0100
Subject: irqdomain: Allow ACPI device nodes to be used as irqdomain
 identifiers

A number of irqchip implementations are (ab)using the irqdomain allocator
by passing a fwnode that is neither a FWNODE_OF nor a FWNODE_IRQCHIP.

This is pretty bad, but it also feels pretty crap to force these drivers to
allocate their own irqchip_fwid when they already have a proper fwnode.

Instead, let's teach the irqdomain allocator about ACPI device nodes, and
add some lovely name generation code... Tested on an arm64 D05 system.

Reported-and-tested-by: John Garry <john.garry@huawei.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Agustin Vega-Frias <agustinv@codeaurora.org>
Cc: Ma Jun <majun258@huawei.com>
Cc: Hanjun Guo <hanjun.guo@linaro.org>
Link: http://lkml.kernel.org/r/20170707083959.10349-1-marc.zyngier@arm.com
---
 kernel/irq/irqdomain.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index ed47688b8e79..f1f251479aa6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
 #define pr_fmt(fmt)  "irq: " fmt
 
+#include <linux/acpi.h>
 #include <linux/debugfs.h>
 #include <linux/hardirq.h>
 #include <linux/interrupt.h>
@@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 			domain->name = fwid->name;
 			break;
 		}
+#ifdef CONFIG_ACPI
+	} else if (is_acpi_device_node(fwnode)) {
+		struct acpi_buffer buf = {
+			.length = ACPI_ALLOCATE_BUFFER,
+		};
+		acpi_handle handle;
+
+		handle = acpi_device_handle(to_acpi_device_node(fwnode));
+		if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
+			domain->name = buf.pointer;
+			domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+		}
+
+		domain->fwnode = fwnode;
+#endif
 	} else if (of_node) {
 		char *name;
 
-- 
cgit v1.3-14-g43fede


From eaf260ac04d9b4cf9f458d5c97555bfff2da526e Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 6 Jul 2017 16:00:21 -0700
Subject: tracing: Treat recording comm for idle task as a success

Currently we stop recording comm for non-idle tasks when switching from/to idle
task since we treat that as a record failure. Fix that by treating recording of
comm for idle task as a success.

Link: http://lkml.kernel.org/r/20170706230023.17942-1-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Reported-by: Michael Sartain <mikesart@gmail.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f079a8ca1117..6722d86f2af5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
 {
 	unsigned pid, idx;
 
-	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+	/* treat recording of idle task as a success */
+	if (!tsk->pid)
+		return 1;
+
+	if (unlikely(tsk->pid > PID_MAX_DEFAULT))
 		return 0;
 
 	/*
-- 
cgit v1.3-14-g43fede


From bd45d34d25720a820021c8ea45de5cd607eace64 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 6 Jul 2017 16:00:22 -0700
Subject: tracing: Treat recording tgid for idle task as a success

Currently we stop recording tgid for non-idle tasks when switching from/to idle
task since we treat that as a record failure. Fix that by treating recording of
tgid for idle task as a success.

Link: http://lkml.kernel.org/r/20170706230023.17942-2-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Reported-by: Michael Sartain <mikesart@gmail.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6722d86f2af5..aee11e3a394f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2006,7 +2006,11 @@ int trace_find_tgid(int pid)
 
 static int trace_save_tgid(struct task_struct *tsk)
 {
-	if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT))
+	/* treat recording of idle task as a success */
+	if (!tsk->pid)
+		return 1;
+
+	if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
 		return 0;
 
 	tgid_map[tsk->pid] = tsk->tgid;
-- 
cgit v1.3-14-g43fede


From 29b1a8ad7df4528b862a79e3d5fb0936f4d199c7 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 6 Jul 2017 16:00:23 -0700
Subject: tracing: Attempt to record other information even if some fail

In recent patches where we record comm and tgid at the same time, we skip
continuing to record if any fail. Fix that by trying to record as many things
as we can even if some couldn't be recorded. If any information isn't recorded,
then, as before, we don't clear trace_taskinfo_save.

Link: http://lkml.kernel.org/r/20170706230023.17942-3-joelaf@google.com

Cc: kernel-team@android.com
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aee11e3a394f..92af8fd1429b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2037,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags)
  */
 void tracing_record_taskinfo(struct task_struct *task, int flags)
 {
+	bool done;
+
 	if (tracing_record_taskinfo_skip(flags))
 		return;
-	if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task))
-		return;
-	if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task))
+
+	/*
+	 * Record as much task information as possible. If some fail, continue
+	 * to try to record the others.
+	 */
+	done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+	/* If recording any information failed, retry again soon. */
+	if (!done)
 		return;
 
 	__this_cpu_write(trace_taskinfo_save, false);
@@ -2058,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags)
 void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
 					  struct task_struct *next, int flags)
 {
+	bool done;
+
 	if (tracing_record_taskinfo_skip(flags))
 		return;
 
-	if ((flags & TRACE_RECORD_CMDLINE) &&
-	    (!trace_save_cmdline(prev) || !trace_save_cmdline(next)))
-		return;
+	/*
+	 * Record as much task information as possible. If some fail, continue
+	 * to try to record the others.
+	 */
+	done  = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+	done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+	done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
 
-	if ((flags & TRACE_RECORD_TGID) &&
-	    (!trace_save_tgid(prev) || !trace_save_tgid(next)))
+	/* If recording any information failed, retry again soon. */
+	if (!done)
 		return;
 
 	__this_cpu_write(trace_taskinfo_save, false);
-- 
cgit v1.3-14-g43fede


From 5671360f29c68d9079914438f6a0109ef62f82a8 Mon Sep 17 00:00:00 2001
From: Stafford Horne <shorne@gmail.com>
Date: Sat, 8 Jul 2017 04:56:58 +0900
Subject: locking/qspinlock: Explicitly include asm/prefetch.h

In architectures that use qspinlock, like x86, prefetch is loaded
indirectly via the asm/qspinlock.h include.  On other architectures, like
OpenRISC, which may want to use asm-generic/qspinlock.h the build will
fail without the asm/prefetch.h include.

Fix this by including linux/prefetch.h directly.

Signed-off-by: Stafford Horne <shorne@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170707195658.23840-1-shorne@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..fd24153e8a48 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/mutex.h>
+#include <linux/prefetch.h>
 #include <asm/byteorder.h>
 #include <asm/qspinlock.h>
 
-- 
cgit v1.3-14-g43fede


From 659b957f20c78fd470083c80af5e79eedfb39e5b Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 7 Jul 2017 22:37:24 +0530
Subject: kprobes: Rename [arch_]function_offset_within_entry() to
 [arch_]kprobe_on_func_entry()

Rename function_offset_within_entry() to scope it to kprobe namespace by
using kprobe_ prefix, and to also simplify it.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Suggested-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/3aa6c7e2e4fb6e00f3c24fa306496a66edb558ea.1499443367.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/kprobes.c | 2 +-
 include/linux/kprobes.h       | 4 ++--
 kernel/kprobes.c              | 8 ++++----
 kernel/trace/trace_kprobe.c   | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01addfb0ed0a..586508e949f0 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -221,7 +221,7 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs
 	kcb->kprobe_saved_msr = regs->msr;
 }
 
-bool arch_function_offset_within_entry(unsigned long offset)
+bool arch_kprobe_on_func_entry(unsigned long offset)
 {
 #ifdef PPC64_ELF_ABI_v2
 #ifdef CONFIG_KPROBES_ON_FTRACE
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 541df0b5b815..bd2684700b74 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -267,8 +267,8 @@ extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
-extern bool arch_function_offset_within_entry(unsigned long offset);
-extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
+extern bool arch_kprobe_on_func_entry(unsigned long offset);
+extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6756d750b31b..a519219169fd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1888,12 +1888,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(pre_handler_kretprobe);
 
-bool __weak arch_function_offset_within_entry(unsigned long offset)
+bool __weak arch_kprobe_on_func_entry(unsigned long offset)
 {
 	return !offset;
 }
 
-bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
 {
 	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
 
@@ -1901,7 +1901,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign
 		return false;
 
 	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
-						!arch_function_offset_within_entry(offset))
+						!arch_kprobe_on_func_entry(offset))
 		return false;
 
 	return true;
@@ -1914,7 +1914,7 @@ int register_kretprobe(struct kretprobe *rp)
 	int i;
 	void *addr;
 
-	if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+	if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
 		return -EINVAL;
 
 	if (kretprobe_blacklist_size) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b53c8d369163..2c5221819be5 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -720,7 +720,7 @@ static int create_trace_kprobe(int argc, char **argv)
 			return ret;
 		}
 		if (offset && is_return &&
-		    !function_offset_within_entry(NULL, symbol, offset)) {
+		    !kprobe_on_func_entry(NULL, symbol, offset)) {
 			pr_info("Given offset is not valid for return probe.\n");
 			return -EINVAL;
 		}
-- 
cgit v1.3-14-g43fede


From 0f73ff80b751b39ff539a550e65c5bd131ff0316 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 7 Jul 2017 22:37:25 +0530
Subject: kprobes: Simplify register_jprobes()

Re-factor jprobe registration functions as the current version is
getting too unwieldy. Move the actual jprobe registration to
register_jprobe() and re-organize code accordingly.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/089cae4bfe73767f765291ee0e6fb0c3d240e5f1.1499443367.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a519219169fd..db3cd3e60bdd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 
 int register_jprobes(struct jprobe **jps, int num)
 {
-	struct jprobe *jp;
 	int ret = 0, i;
 
 	if (num <= 0)
 		return -EINVAL;
+
 	for (i = 0; i < num; i++) {
-		unsigned long addr, offset;
-		jp = jps[i];
-		addr = arch_deref_entry_point(jp->entry);
-
-		/* Verify probepoint is a function entry point */
-		if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
-		    offset == 0) {
-			jp->kp.pre_handler = setjmp_pre_handler;
-			jp->kp.break_handler = longjmp_break_handler;
-			ret = register_kprobe(&jp->kp);
-		} else
-			ret = -EINVAL;
+		ret = register_jprobe(jps[i]);
 
 		if (ret < 0) {
 			if (i > 0)
@@ -1796,13 +1785,26 @@ int register_jprobes(struct jprobe **jps, int num)
 			break;
 		}
 	}
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_jprobes);
 
 int register_jprobe(struct jprobe *jp)
 {
-	return register_jprobes(&jp, 1);
+	unsigned long addr, offset;
+	struct kprobe *kp = &jp->kp;
+
+	/* Verify probepoint is a function entry point */
+	addr = arch_deref_entry_point(jp->entry);
+
+	if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) {
+		kp->pre_handler = setjmp_pre_handler;
+		kp->break_handler = longjmp_break_handler;
+		return register_kprobe(kp);
+	}
+
+	return -EINVAL;
 }
 EXPORT_SYMBOL_GPL(register_jprobe);
 
-- 
cgit v1.3-14-g43fede


From dbf580623d5fee785218d1a47a2bcdf36d85c0e9 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 7 Jul 2017 22:37:26 +0530
Subject: kprobes: Ensure that jprobe probepoints are at function entry

Similar to commit 90ec5e89e393c ("kretprobes: Ensure probe location is
at function entry"), ensure that the jprobe probepoint is at function
entry.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a4525af6c5a42df385efa31251246cf7cca73598.1499443367.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index db3cd3e60bdd..a1606a4224e1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1795,10 +1795,14 @@ int register_jprobe(struct jprobe *jp)
 	unsigned long addr, offset;
 	struct kprobe *kp = &jp->kp;
 
-	/* Verify probepoint is a function entry point */
+	/*
+	 * Verify probepoint as well as the jprobe handler are
+	 * valid function entry points.
+	 */
 	addr = arch_deref_entry_point(jp->entry);
 
-	if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) {
+	if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
+	    kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
 		kp->pre_handler = setjmp_pre_handler;
 		kp->break_handler = longjmp_break_handler;
 		return register_kprobe(kp);
-- 
cgit v1.3-14-g43fede


From 634a81609561f05266e1f625b6f2567c2e0b0419 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 8 Jul 2017 11:26:39 -0400
Subject: fix waitid(2) breakage

We lose the distinction between "found a PID" and "nothing, but that's not
an error" a bit too early in waitid().  Easily fixed, fortunately...

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Fixes: 67d7ddded322 ("waitid(2): leave copyout of siginfo to syscall itself")
Tested-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exit.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2bbc23273e2f..608c9775a37b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1590,9 +1590,6 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 
-	if (ret > 0)
-		ret = 0;
-
 	put_pid(pid);
 	return ret;
 }
@@ -1603,6 +1600,11 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	struct rusage r;
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+	int signo = 0;
+	if (err > 0) {
+		signo = SIGCHLD;
+		err = 0;
+	}
 
 	if (!err) {
 		if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
@@ -1612,7 +1614,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		return err;
 
 	user_access_begin();
-	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
 	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
 	unsafe_put_user(info.pid, &infop->si_pid, Efault);
@@ -1714,6 +1716,11 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	struct rusage ru;
 	struct waitid_info info = {.status = 0};
 	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+	int signo = 0;
+	if (err > 0) {
+		signo = SIGCHLD;
+		err = 0;
+	}
 
 	if (!err && uru) {
 		/* kernel_waitid() overwrites everything in ru */
@@ -1729,7 +1736,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 		return err;
 
 	user_access_begin();
-	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
 	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
 	unsafe_put_user(info.pid, &infop->si_pid, Efault);
-- 
cgit v1.3-14-g43fede


From fca18a47cf3eb8425ec19c2dfc374f3d04f5219f Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Sat, 8 Jul 2017 00:27:30 +0530
Subject: trace/kprobes: Sanitize derived event names

When we derive event names, convert some expected symbols (such as ':'
used to specify module:name and '.' present in some symbols) into
underscores so that the event name is not rejected.

Before this patch:
    # echo 'p kobject_example:foo_store' > kprobe_events
    trace_kprobe: Failed to allocate trace_probe.(-22)
    -sh: write error: Invalid argument

After this patch:
    # echo 'p kobject_example:foo_store' > kprobe_events
    # cat kprobe_events
    p:kprobes/p_kobject_example_foo_store_0 kobject_example:foo_store

Link: http://lkml.kernel.org/r/66c189e09e71361aba91dd4a5bd146a1b62a7a51.1499453040.git.naveen.n.rao@linux.vnet.ibm.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_kprobe.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c129fca6ec99..44fd819aa33d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = {
 	.priority = 1	/* Invoked after kprobe module callback */
 };
 
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+	while (*name++ != '\0')
+		if (*name == ':' || *name == '.')
+			*name = '_';
+}
+
 static int create_trace_kprobe(int argc, char **argv)
 {
 	/*
@@ -740,6 +748,7 @@ static int create_trace_kprobe(int argc, char **argv)
 		else
 			snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
 				 is_return ? 'r' : 'p', addr);
+		sanitize_event_name(buf);
 		event = buf;
 	}
 	tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
-- 
cgit v1.3-14-g43fede


From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 10 Jul 2017 15:48:02 -0700
Subject: mm: make PR_SET_THP_DISABLE immediately active

PR_SET_THP_DISABLE has a rather subtle semantic.  It doesn't affect any
existing mapping because it only updated mm->def_flags which is a
template for new mappings.

The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE
flag set.  This can be quite surprising for all those applications which
do not do prctl(); fork() & exec() and want to control their own THP
behavior.

Another usecase when the immediate semantic of the prctl might be useful
is a combination of pre- and post-copy migration of containers with
CRIU.  In this case CRIU populates a part of a memory region with data
that was saved during the pre-copy stage.  Afterwards, the region is
registered with userfaultfd and CRIU expects to get page faults for the
parts of the region that were not yet populated.  However, khugepaged
collapses the pages and the expected page faults do not occur.

In the more general case, the prctl(PR_SET_THP_DISABLE) could be used as a
temporary mechanism for enabling/disabling THP process-wide.

Implementation wise, a new MMF_DISABLE_THP flag is added.  This flag is
tested when the decision whether to use huge pages is taken either during
page fault or at the time of THP collapse.

It should be noted that the new implementation makes PR_SET_THP_DISABLE
a master override of any per-VMA setting, which was not the case
previously.

Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE")
Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h        | 1 +
 include/linux/khugepaged.h     | 3 ++-
 include/linux/sched/coredump.h | 5 ++++-
 kernel/sys.c                   | 6 +++---
 mm/khugepaged.c                | 3 ++-
 mm/shmem.c                     | 8 +++++---
 6 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d3b3e8fcc717..40d7b7dd2653 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 	   (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) &&			\
 	   ((__vma)->vm_flags & VM_HUGEPAGE))) &&			\
 	 !((__vma)->vm_flags & VM_NOHUGEPAGE) &&			\
+	 !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) &&		\
 	 !is_vma_temporary_stack(__vma))
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 5d9a400af509..f0d7335336cd 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
 		if ((khugepaged_always() ||
 		     (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
-		    !(vm_flags & VM_NOHUGEPAGE))
+		    !(vm_flags & VM_NOHUGEPAGE) &&
+		    !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 			if (__khugepaged_enter(vma->vm_mm))
 				return -ENOMEM;
 	return 0;
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 69eedcef8f03..98ae0d05aa32 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE	23      /* mm has ever used the global huge zero page */
+#define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
+#define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+				 MMF_DISABLE_THP_MASK)
 
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 47d901586b4e..73fc0af147d0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_THP_DISABLE:
 		if (arg2 || arg3 || arg4 || arg5)
 			return -EINVAL;
-		error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+		error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
 		break;
 	case PR_SET_THP_DISABLE:
 		if (arg3 || arg4 || arg5)
@@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (down_write_killable(&me->mm->mmap_sem))
 			return -EINTR;
 		if (arg2)
-			me->mm->def_flags |= VM_NOHUGEPAGE;
+			set_bit(MMF_DISABLE_THP, &me->mm->flags);
 		else
-			me->mm->def_flags &= ~VM_NOHUGEPAGE;
+			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 		up_write(&me->mm->mmap_sem);
 		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index df4ebdb2b10a..c01f177a1120 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 static bool hugepage_vma_check(struct vm_area_struct *vma)
 {
 	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-	    (vma->vm_flags & VM_NOHUGEPAGE))
+	    (vma->vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		return false;
 	if (shmem_file(vma->vm_file)) {
 		if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
diff --git a/mm/shmem.c b/mm/shmem.c
index 9418f5a9bc46..b0aa6075d164 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf)
 	}
 
 	sgp = SGP_CACHE;
-	if (vma->vm_flags & VM_HUGEPAGE)
-		sgp = SGP_HUGE;
-	else if (vma->vm_flags & VM_NOHUGEPAGE)
+
+	if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
 		sgp = SGP_NOHUGE;
+	else if (vma->vm_flags & VM_HUGEPAGE)
+		sgp = SGP_HUGE;
 
 	error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
 				  gfp, vma, vmf, &ret);
-- 
cgit v1.3-14-g43fede


From 9dcdcea11491f6eee65bd1b352293ca01e4b7997 Mon Sep 17 00:00:00 2001
From: Arvind Yadav <arvind.yadav.cs@gmail.com>
Date: Mon, 10 Jul 2017 15:51:14 -0700
Subject: kernel/ksysfs.c: constify attribute_group structures.

attribute_groups are not supposed to change at runtime.  All functions
working with attribute_groups provided by <linux/sysfs.h> work with
const attribute_group.  So mark the non-const structs as const.

File size before:
   text	   data	    bss	    dec	    hex	filename
   1120	    544	     16	   1680	    690	kernel/ksysfs.o

File size After adding 'const':
   text	   data	    bss	    dec	    hex	filename
   1160	    480	     16	   1656	    678	kernel/ksysfs.o

Link: http://lkml.kernel.org/r/aa224b3cc923fdbb3edd0c41b2c639c85408c9e8.1498737347.git.arvind.yadav.cs@gmail.com
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Petr Tesarik <ptesarik@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ksysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 23cd70651238..df1a9aa602a0 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
 	NULL
 };
 
-static struct attribute_group kernel_attr_group = {
+static const struct attribute_group kernel_attr_group = {
 	.attrs = kernel_attrs,
 };
 
-- 
cgit v1.3-14-g43fede


From b7b2562f7252878e18de60c24f320052076f9de8 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 10 Jul 2017 15:51:17 -0700
Subject: kernel/groups.c: use sort library function

setgroups is not exactly a hot path, so we might as well use the library
function instead of open-coding the sorting.  Saves ~150 bytes.

Link: http://lkml.kernel.org/r/1497301378-22739-1-git-send-email-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/groups.c | 35 +++++++++++------------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/groups.c b/kernel/groups.c
index d09727692a2a..434f6665f187 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -5,6 +5,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/security.h>
+#include <linux/sort.h>
 #include <linux/syscalls.h>
 #include <linux/user_namespace.h>
 #include <linux/vmalloc.h>
@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
 	return 0;
 }
 
-/* a simple Shell sort */
+static int gid_cmp(const void *_a, const void *_b)
+{
+	kgid_t a = *(kgid_t *)_a;
+	kgid_t b = *(kgid_t *)_b;
+
+	return gid_gt(a, b) - gid_lt(a, b);
+}
+
 static void groups_sort(struct group_info *group_info)
 {
-	int base, max, stride;
-	int gidsetsize = group_info->ngroups;
-
-	for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
-		; /* nothing */
-	stride /= 3;
-
-	while (stride) {
-		max = gidsetsize - stride;
-		for (base = 0; base < max; base++) {
-			int left = base;
-			int right = left + stride;
-			kgid_t tmp = group_info->gid[right];
-
-			while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
-				group_info->gid[right] = group_info->gid[left];
-				right = left;
-				left -= stride;
-			}
-			group_info->gid[right] = tmp;
-		}
-		stride /= 3;
-	}
+	sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
+	     gid_cmp, NULL);
 }
 
 /* a simple bsearch */
-- 
cgit v1.3-14-g43fede


From 63b23e2cbc8e80de3e40184ecb2c3bfb705776fa Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Mon, 10 Jul 2017 15:51:20 -0700
Subject: kernel/kallsyms.c: replace all_var with
 IS_ENABLED(CONFIG_KALLSYMS_ALL)

'all_var' looks like a variable, but is actually a macro.  Use
IS_ENABLED(CONFIG_KALLSYMS_ALL) for clarification.

Link: http://lkml.kernel.org/r/1497577591-3434-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kallsyms.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6a3b249a2ae1..127e7cfafa55 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -28,12 +28,6 @@
 
 #include <asm/sections.h>
 
-#ifdef CONFIG_KALLSYMS_ALL
-#define all_var 1
-#else
-#define all_var 0
-#endif
-
 /*
  * These will be re-linked against their real values
  * during the second link stage.
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
 
 static int is_ksym_addr(unsigned long addr)
 {
-	if (all_var)
+	if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
 		return is_kernel(addr);
 
 	return is_kernel_text(addr) || is_kernel_inittext(addr);
@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	if (!symbol_end) {
 		if (is_kernel_inittext(addr))
 			symbol_end = (unsigned long)_einittext;
-		else if (all_var)
+		else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
 			symbol_end = (unsigned long)_end;
 		else
 			symbol_end = (unsigned long)_etext;
-- 
cgit v1.3-14-g43fede


From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas@m3y3r.de>
Date: Mon, 10 Jul 2017 15:51:58 -0700
Subject: lib/extable.c: use bsearch() library function in search_extable()

[thomas@m3y3r.de: v3: fix arch specific implementations]
  Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de
Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/kernel/module.c |  3 ++-
 arch/mips/kernel/traps.c  |  3 ++-
 arch/sh/mm/extable_64.c   | 34 ++++++++++++++++++----------------
 arch/sparc/mm/extable.c   | 28 ++++++++++++++--------------
 include/linux/extable.h   |  5 +++--
 kernel/extable.c          |  3 ++-
 kernel/module.c           |  2 +-
 lib/extable.c             | 41 +++++++++++++++++++++--------------------
 8 files changed, 63 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 94627a3a6a0d..50c020c47e54 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr)
 
 	spin_lock_irqsave(&dbe_lock, flags);
 	list_for_each_entry(dbe, &dbe_list, dbe_list) {
-		e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr);
+		e = search_extable(dbe->dbe_start,
+				   dbe->dbe_end - dbe->dbe_start, addr);
 		if (e)
 			break;
 	}
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 38dfa27730ff..b68b4d0726d3 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr)
 {
 	const struct exception_table_entry *e;
 
-	e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr);
+	e = search_extable(__start___dbe_table,
+			   __stop___dbe_table - __start___dbe_table, addr);
 	if (!e)
 		e = search_module_dbetables(addr);
 	return e;
diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c
index b90cdfad2c78..7a3b4d33d2e7 100644
--- a/arch/sh/mm/extable_64.c
+++ b/arch/sh/mm/extable_64.c
@@ -10,6 +10,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
+#include <linux/bsearch.h>
 #include <linux/rwsem.h>
 #include <linux/extable.h>
 #include <linux/uaccess.h>
@@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long
 	return NULL;
 }
 
+static int cmp_ex_search(const void *key, const void *elt)
+{
+	const struct exception_table_entry *_elt = elt;
+	unsigned long _key = *(unsigned long *)key;
+
+	/* avoid overflow */
+	if (_key > _elt->insn)
+		return 1;
+	if (_key < _elt->insn)
+		return -1;
+	return 0;
+}
+
 /* Simple binary search */
 const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-		 const struct exception_table_entry *last,
+search_extable(const struct exception_table_entry *base,
+		 const size_t num,
 		 unsigned long value)
 {
 	const struct exception_table_entry *mid;
@@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first,
 	if (mid)
 		return mid;
 
-        while (first <= last) {
-		long diff;
-
-		mid = (last - first) / 2 + first;
-		diff = mid->insn - value;
-                if (diff == 0)
-                        return mid;
-                else if (diff < 0)
-                        first = mid+1;
-                else
-                        last = mid-1;
-        }
-
-        return NULL;
+	return bsearch(&value, base, num,
+		       sizeof(struct exception_table_entry), cmp_ex_search);
 }
 
 int fixup_exception(struct pt_regs *regs)
diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c
index db214e9931d9..2422511dc8c5 100644
--- a/arch/sparc/mm/extable.c
+++ b/arch/sparc/mm/extable.c
@@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start,
 
 /* Caller knows they are in a range if ret->fixup == 0 */
 const struct exception_table_entry *
-search_extable(const struct exception_table_entry *start,
-	       const struct exception_table_entry *last,
+search_extable(const struct exception_table_entry *base,
+	       const size_t num,
 	       unsigned long value)
 {
-	const struct exception_table_entry *walk;
+	int i;
 
 	/* Single insn entries are encoded as:
 	 *	word 1:	insn address
@@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start,
 	 */
 
 	/* 1. Try to find an exact match. */
-	for (walk = start; walk <= last; walk++) {
-		if (walk->fixup == 0) {
+	for (i = 0; i < num; i++) {
+		if (base[i].fixup == 0) {
 			/* A range entry, skip both parts. */
-			walk++;
+			i++;
 			continue;
 		}
 
 		/* A deleted entry; see trim_init_extable */
-		if (walk->fixup == -1)
+		if (base[i].fixup == -1)
 			continue;
 
-		if (walk->insn == value)
-			return walk;
+		if (base[i].insn == value)
+			return &base[i];
 	}
 
 	/* 2. Try to find a range match. */
-	for (walk = start; walk <= (last - 1); walk++) {
-		if (walk->fixup)
+	for (i = 0; i < (num - 1); i++) {
+		if (base[i].fixup)
 			continue;
 
-		if (walk[0].insn <= value && walk[1].insn > value)
-			return walk;
+		if (base[i].insn <= value && base[i + 1].insn > value)
+			return &base[i];
 
-		walk++;
+		i++;
 	}
 
         return NULL;
diff --git a/include/linux/extable.h b/include/linux/extable.h
index 7effea4b257d..28addad0dda7 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -2,13 +2,14 @@
 #define _LINUX_EXTABLE_H
 
 #include <linux/stddef.h>	/* for NULL */
+#include <linux/types.h>
 
 struct module;
 struct exception_table_entry;
 
 const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-	       const struct exception_table_entry *last,
+search_extable(const struct exception_table_entry *base,
+	       const size_t num,
 	       unsigned long value);
 void sort_extable(struct exception_table_entry *start,
 		  struct exception_table_entry *finish);
diff --git a/kernel/extable.c b/kernel/extable.c
index 223df4a328a4..38c2412401a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
 	const struct exception_table_entry *e;
 
-	e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+	e = search_extable(__start___ex_table,
+			   __stop___ex_table - __start___ex_table, addr);
 	if (!e)
 		e = search_module_extables(addr);
 	return e;
diff --git a/kernel/module.c b/kernel/module.c
index b3dbdde82e80..b0f92a365140 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
 		goto out;
 
 	e = search_extable(mod->extable,
-			   mod->extable + mod->num_exentries - 1,
+			   mod->num_exentries,
 			   addr);
 out:
 	preempt_enable();
diff --git a/lib/extable.c b/lib/extable.c
index 62968daa66a9..f54996fdd0b8 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/bsearch.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sort.h>
@@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size)
  * This is used both for the kernel exception table and for
  * the exception tables of modules that get loaded.
  */
-static int cmp_ex(const void *a, const void *b)
+static int cmp_ex_sort(const void *a, const void *b)
 {
 	const struct exception_table_entry *x = a, *y = b;
 
@@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start,
 		  struct exception_table_entry *finish)
 {
 	sort(start, finish - start, sizeof(struct exception_table_entry),
-	     cmp_ex, swap_ex);
+	     cmp_ex_sort, swap_ex);
 }
 
 #ifdef CONFIG_MODULES
@@ -93,6 +94,20 @@ void trim_init_extable(struct module *m)
 #endif /* !ARCH_HAS_SORT_EXTABLE */
 
 #ifndef ARCH_HAS_SEARCH_EXTABLE
+
+static int cmp_ex_search(const void *key, const void *elt)
+{
+	const struct exception_table_entry *_elt = elt;
+	unsigned long _key = *(unsigned long *)key;
+
+	/* avoid overflow */
+	if (_key > ex_to_insn(_elt))
+		return 1;
+	if (_key < ex_to_insn(_elt))
+		return -1;
+	return 0;
+}
+
 /*
  * Search one exception table for an entry corresponding to the
  * given instruction address, and return the address of the entry,
@@ -101,25 +116,11 @@ void trim_init_extable(struct module *m)
  * already sorted.
  */
 const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-	       const struct exception_table_entry *last,
+search_extable(const struct exception_table_entry *base,
+	       const size_t num,
 	       unsigned long value)
 {
-	while (first <= last) {
-		const struct exception_table_entry *mid;
-
-		mid = ((last - first) >> 1) + first;
-		/*
-		 * careful, the distance between value and insn
-		 * can be larger than MAX_LONG:
-		 */
-		if (ex_to_insn(mid) < value)
-			first = mid + 1;
-		else if (ex_to_insn(mid) > value)
-			last = mid - 1;
-		else
-			return mid;
-	}
-	return NULL;
+	return bsearch(&value, base, num,
+		       sizeof(struct exception_table_entry), cmp_ex_search);
 }
 #endif
-- 
cgit v1.3-14-g43fede


From 4ea77014af0d6205b05503d1c7aac6eace11d473 Mon Sep 17 00:00:00 2001
From: zhongjiang <zhongjiang@huawei.com>
Date: Mon, 10 Jul 2017 15:52:57 -0700
Subject: kernel/signal.c: avoid undefined behaviour in kill_something_info

When running kill(72057458746458112, 0) in userspace I hit the following
issue.

  UBSAN: Undefined behaviour in kernel/signal.c:1462:11
  negation of -2147483648 cannot be represented in type 'int':
  CPU: 226 PID: 9849 Comm: test Tainted: G    B          ---- -------   3.10.0-327.53.58.70.x86_64_ubsan+ #116
  Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, BIOS BLHSV028 11/11/2014
  Call Trace:
    dump_stack+0x19/0x1b
    ubsan_epilogue+0xd/0x50
    __ubsan_handle_negate_overflow+0x109/0x14e
    SYSC_kill+0x43e/0x4d0
    SyS_kill+0xe/0x10
    system_call_fastpath+0x16/0x1b

Add code to avoid the UBSAN detection.

[akpm@linux-foundation.org: tweak comment]
Link: http://lkml.kernel.org/r/1496670008-59084-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhongjiang <zhongjiang@huawei.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 48a59eefd8ad..caed9133ae52 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 		return ret;
 	}
 
+	/* -INT_MIN is undefined.  Exclude this case to avoid a UBSAN warning */
+	if (pid == INT_MIN)
+		return -ESRCH;
+
 	read_lock(&tasklist_lock);
 	if (pid != -1) {
 		ret = __kill_pgrp_info(sig, info,
-- 
cgit v1.3-14-g43fede


From dd83c161fbcc5d8be637ab159c0de015cbff5ba4 Mon Sep 17 00:00:00 2001
From: zhongjiang <zhongjiang@huawei.com>
Date: Mon, 10 Jul 2017 15:53:01 -0700
Subject: kernel/exit.c: avoid undefined behaviour when calling wait4()

wait4(-2147483648, 0x20, 0, 0xdd0000) triggers:
UBSAN: Undefined behaviour in kernel/exit.c:1651:9

The related calltrace is as follows:

  negation of -2147483648 cannot be represented in type 'int':
  CPU: 9 PID: 16482 Comm: zj Tainted: G    B          ---- -------   3.10.0-327.53.58.71.x86_64+ #66
  Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285          /BC11BTSA              , BIOS CTSAV036 04/27/2011
  Call Trace:
    dump_stack+0x19/0x1b
    ubsan_epilogue+0xd/0x50
    __ubsan_handle_negate_overflow+0x109/0x14e
    SyS_wait4+0x1cb/0x1e0
    system_call_fastpath+0x16/0x1b

Exclude the overflow to avoid the UBSAN warning.

Link: http://lkml.kernel.org/r/1497264618-20212-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhongjiang <zhongjiang@huawei.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 608c9775a37b..c5548faa9f37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
 			__WNOTHREAD|__WCLONE|__WALL))
 		return -EINVAL;
 
+	/* -INT_MIN is not defined */
+	if (upid == INT_MIN)
+		return -ESRCH;
+
 	if (upid == -1)
 		type = PIDTYPE_MAX;
 	else if (upid < 0) {
-- 
cgit v1.3-14-g43fede


From 6a8a75f3235724c5941a33e287b2f98966ad14c5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Tue, 11 Jul 2017 10:56:54 +0200
Subject: Revert "perf/core: Drop kernel samples even though :u is specified"

This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829.

This commit introduced a regression that broke rr-project, which uses sampling
events to receive a signal on overflow (but does not care about the contents
of the sample). These signals are critical to the correct operation of rr.

There's been some back and forth about how to fix it - but to not keep
applications in limbo queue up a revert.

Reported-by: Kyle Huey <me@kylehuey.com>
Acked-by: Kyle Huey <me@kylehuey.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..9747e422ab20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7308,21 +7308,6 @@ int perf_event_account_interrupt(struct perf_event *event)
 	return __perf_event_account_interrupt(event, 1);
 }
 
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
-	/*
-	 * Due to interrupt latency (AKA "skid"), we may enter the
-	 * kernel before taking an overflow, even if the PMU is only
-	 * counting user events.
-	 * To avoid leaking information to userspace, we must always
-	 * reject kernel samples when exclude_kernel is set.
-	 */
-	if (event->attr.exclude_kernel && !user_mode(regs))
-		return false;
-
-	return true;
-}
-
 /*
  * Generic event overflow handling, sampling.
  */
@@ -7343,12 +7328,6 @@ static int __perf_event_overflow(struct perf_event *event,
 
 	ret = __perf_event_account_interrupt(event, throttle);
 
-	/*
-	 * For security, drop the skid kernel samples if necessary.
-	 */
-	if (!sample_is_allowed(event, regs))
-		return ret;
-
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * events
-- 
cgit v1.3-14-g43fede


From dea1d0f5f1284e3defee4b8484d9fc230686cd42 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 11 Jul 2017 22:06:24 +0200
Subject: smp/hotplug: Replace BUG_ON and react useful

The move of the unpark functions to the control thread moved the BUG_ON()
there as well. While it made some sense in the idle thread of the upcoming
CPU, it's bogus to crash the control thread on the already online CPU,
especially as the function has a return value and the callsite is prepared
to handle an error return.

Replace it with a WARN_ON_ONCE() and return a proper error code.

Fixes: 9cd4f1a4e7a8 ("smp/hotplug: Move unparking of percpu threads to the control CPU")
Rightfully-ranted-at-by: Linux Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index ab860453841d..eee033134262 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -279,7 +279,8 @@ static int bringup_wait_for_ap(unsigned int cpu)
 
 	/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
 	wait_for_completion(&st->done);
-	BUG_ON(!cpu_online(cpu));
+	if (WARN_ON_ONCE((!cpu_online(cpu))))
+		return -ECANCELED;
 
 	/* Unpark the stopper thread and the hotplug thread of the target cpu */
 	stop_machine_unpark(cpu);
-- 
cgit v1.3-14-g43fede


From b11fb73743fc406204e0749ead18560aeda8b136 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 11 Jul 2017 15:43:24 -0400
Subject: tracing: Fixup trace file header alignment

The addition of TGID to the tracing header added a check to see if TGID
should be displayed or not, and updated the header accordingly.
Unfortunately, it broke the default header.

Also add constant strings to use for spacing. This does remove the
visibility of the header a bit, but cuts it down from the extended lines
much greater than 80 characters.

Before this change:

 # tracer: function
 #
 #                            _-----=> irqs-off
 #                           / _----=> need-resched
 #                          | / _---=> hardirq/softirq
 #                          || / _--=> preempt-depth
 #                          ||| /     delay
 #           TASK-PID   CPU#||||    TIMESTAMP  FUNCTION
 #              | |       | ||||       |         |
        swapper/0-1     [000] ....     0.277830: migration_init <-do_one_initcall
        swapper/0-1     [002] d...    13.861967: Unknown type 1201
        swapper/0-1     [002] d..1    13.861970: Unknown type 1202

After this change:

 # tracer: function
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
        swapper/0-1     [000] ....     0.278245: migration_init <-do_one_initcall
        swapper/0-1     [003] d...    13.861189: Unknown type 1201
        swapper/0-1     [003] d..1    13.861192: Unknown type 1202

Cc: Joel Fernandes <joelaf@google.com>
Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92af8fd1429b..dabd810a10cd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3358,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
 				       unsigned int flags)
 {
 	bool tgid = flags & TRACE_ITER_RECORD_TGID;
-
-	seq_printf(m, "#                          %s  _-----=> irqs-off\n",	    tgid ? "          " : "");
-	seq_printf(m, "#                          %s / _----=> need-resched\n",	    tgid ? "          " : "");
-	seq_printf(m, "#                          %s| / _---=> hardirq/softirq\n",  tgid ? "          " : "");
-	seq_printf(m, "#                          %s|| / _--=> preempt-depth\n",    tgid ? "          " : "");
-	seq_printf(m, "#                          %s||| /     delay\n",		    tgid ? "          " : "");
-	seq_printf(m, "#           TASK-PID   CPU#%s||||    TIMESTAMP  FUNCTION\n", tgid ? "   TGID   " : "");
-	seq_printf(m, "#              | |       | %s||||       |         |\n",	    tgid ? "     |    " : "");
+	const char tgid_space[] = "          ";
+	const char space[] = "  ";
+
+	seq_printf(m, "#                          %s  _-----=> irqs-off\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "#                          %s / _----=> need-resched\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "#                          %s| / _---=> hardirq/softirq\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "#                          %s|| / _--=> preempt-depth\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "#                          %s||| /     delay\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "#           TASK-PID   CPU#%s||||    TIMESTAMP  FUNCTION\n",
+		   tgid ? "   TGID   " : space);
+	seq_printf(m, "#              | |       | %s||||       |         |\n",
+		   tgid ? "     |    " : space);
 }
 
 void
-- 
cgit v1.3-14-g43fede


From bbd1d27d863d5c0acee65ecd0c2e34035e1df5ea Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 11 Jul 2017 19:21:04 -0400
Subject: tracing: Do not expose stack_trace_filter without DYNAMIC_FTRACE

The "stack_trace_filter" file only makes sense if DYNAMIC_FTRACE is
configured in. If it is not, then the user can not filter any functions.

Not only that, the open function causes warnings when DYNAMIC_FTRACE is not
set.

Link: http://lkml.kernel.org/r/20170710110521.600806-1-arnd@arndb.de

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b4a751e8f9d6..a4df67cbc711 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = {
 	.release	= seq_release,
 };
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
 static int
 stack_trace_filter_open(struct inode *inode, struct file *file)
 {
@@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = {
 	.release = ftrace_regex_release,
 };
 
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
 		   void __user *buffer, size_t *lenp,
@@ -477,8 +481,10 @@ static __init int stack_trace_init(void)
 	trace_create_file("stack_trace", 0444, d_tracer,
 			NULL, &stack_trace_fops);
 
+#ifdef CONFIG_DYNAMIC_FTRACE
 	trace_create_file("stack_trace_filter", 0444, d_tracer,
 			  &trace_ops, &stack_trace_filter_fops);
+#endif
 
 	if (stack_trace_filter_buf[0])
 		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
-- 
cgit v1.3-14-g43fede


From 69449bbd65687e8e5fb968a5a0c46089f6af6001 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 10 Jul 2017 10:44:03 +0200
Subject: ftrace: Hide cached module code for !CONFIG_MODULES

When modules are disabled, we get a harmless build warning:

kernel/trace/ftrace.c:4051:13: error: 'process_cached_mods' defined but not used [-Werror=unused-function]

This adds the same #ifdef around the new code that exists around
its caller.

Link: http://lkml.kernel.org/r/20170710084413.1820568-1-arnd@arndb.de

Fixes: d7fbf8df7ca0 ("ftrace: Implement cached modules tracing on module load")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2953d558bbee..4706f0ed193e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3978,6 +3978,7 @@ static int
 ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
 		 int reset, int enable);
 
+#ifdef CONFIG_MODULES
 static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 			     char *mod, bool enable)
 {
@@ -4068,6 +4069,7 @@ static void process_cached_mods(const char *mod_name)
 
 	kfree(mod);
 }
+#endif
 
 /*
  * We register the module command as a template to show others how
-- 
cgit v1.3-14-g43fede


From 19d39a3810e7032f311ef83effdac40339b9d022 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 11 Jul 2017 23:41:52 +0200
Subject: genirq: Keep chip buslock across irq_request/release_resources()

Moving the irq_request/release_resources() callbacks out of the spinlocked,
irq disabled and bus locked region, unearthed an interesting abuse of the
irq_bus_lock/irq_bus_sync_unlock() callbacks.

The OMAP GPIO driver does merily power management inside of them. The
irq_request_resources() callback of this GPIO irqchip calls a function
which reads a GPIO register. That read aborts now because the clock of the
GPIO block is not magically enabled via the irq_bus_lock() callback.

Move the callbacks under the bus lock again to prevent this. In the
free_irq() path this requires to drop the bus_lock before calling
synchronize_irq() and reacquiring it before calling the
irq_release_resources() callback.

The bus lock can't be held because:

   1) The data which has been changed between bus_lock/un_lock is cached in
      the irq chip driver private data and needs to go out to the irq chip
      via the slow bus (usually SPI or I2C) before calling
      synchronize_irq().

      That's the reason why this bus_lock/unlock magic exists in the first
      place, as you cannot do SPI/I2C transactions while holding desc->lock
      with interrupts disabled.

   2) synchronize_irq() will actually deadlock, if there is a handler in
      flight. These chips use threaded handlers for obvious reasons, as
      they allow to do SPI/I2C communication. When the threaded handler
      returns then bus_lock needs to be taken in irq_finalize_oneshot() as
      we need to talk to the actual irq chip once more. After that the
      threaded handler is marked done, which makes synchronize_irq() return.

      So if we hold bus_lock across the synchronize_irq() call, the
      handler cannot mark itself done because it blocks on the bus
      lock. That in turn makes synchronize_irq() wait forever on the
      threaded handler to complete....

Add the missing unlock of desc->request_mutex in the error path of
__free_irq() and add a bunch of comments to explain the locking and
protection rules.

Fixes: 46e48e257360 ("genirq: Move irq resource handling out of spinlocked region")
Reported-and-tested-by: Sebastian Reichel <sebastian.reichel@collabora.co.uk>
Reported-and-tested-by: Tony Lindgren <tony@atomide.com>
Reported-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Not-longer-ranted-at-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Grygorii Strashko <grygorii.strashko@ti.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/manage.c | 63 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5624b2dd6b58..1d1a5b945ab4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
+ *
+ * Locking rules:
+ *
+ * desc->request_mutex	Provides serialization against a concurrent free_irq()
+ *   chip_bus_lock	Provides serialization for slow bus operations
+ *     desc->lock	Provides serialization against hard interrupts
+ *
+ * chip_bus_lock and desc->lock are sufficient for all other management and
+ * interrupt related functions. desc->request_mutex solely serializes
+ * request/free_irq().
  */
 static int
 __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
@@ -1167,20 +1177,35 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
 		new->flags &= ~IRQF_ONESHOT;
 
+	/*
+	 * Protects against a concurrent __free_irq() call which might wait
+	 * for synchronize_irq() to complete without holding the optional
+	 * chip bus lock and desc->lock.
+	 */
 	mutex_lock(&desc->request_mutex);
+
+	/*
+	 * Acquire bus lock as the irq_request_resources() callback below
+	 * might rely on the serialization or the magic power management
+	 * functions which are abusing the irq_bus_lock() callback,
+	 */
+	chip_bus_lock(desc);
+
+	/* First installed action requests resources. */
 	if (!desc->action) {
 		ret = irq_request_resources(desc);
 		if (ret) {
 			pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
 			       new->name, irq, desc->irq_data.chip->name);
-			goto out_mutex;
+			goto out_bus_unlock;
 		}
 	}
 
-	chip_bus_lock(desc);
-
 	/*
 	 * The following block of code has to be executed atomically
+	 * protected against a concurrent interrupt and any of the other
+	 * management calls which are not serialized via
+	 * desc->request_mutex or the optional bus lock.
 	 */
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	old_ptr = &desc->action;
@@ -1286,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 			ret = __irq_set_trigger(desc,
 						new->flags & IRQF_TRIGGER_MASK);
 
-			if (ret) {
-				irq_release_resources(desc);
+			if (ret)
 				goto out_unlock;
-			}
 		}
 
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
@@ -1385,12 +1408,10 @@ mismatch:
 out_unlock:
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
-	chip_bus_sync_unlock(desc);
-
 	if (!desc->action)
 		irq_release_resources(desc);
-
-out_mutex:
+out_bus_unlock:
+	chip_bus_sync_unlock(desc);
 	mutex_unlock(&desc->request_mutex);
 
 out_thread:
@@ -1472,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 			WARN(1, "Trying to free already-free IRQ %d\n", irq);
 			raw_spin_unlock_irqrestore(&desc->lock, flags);
 			chip_bus_sync_unlock(desc);
+			mutex_unlock(&desc->request_mutex);
 			return NULL;
 		}
 
@@ -1498,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 #endif
 
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
+	/*
+	 * Drop bus_lock here so the changes which were done in the chip
+	 * callbacks above are synced out to the irq chips which hang
+	 * behind a slow bus (I2C, SPI) before calling synchronize_irq().
+	 *
+	 * Aside of that the bus_lock can also be taken from the threaded
+	 * handler in irq_finalize_oneshot() which results in a deadlock
+	 * because synchronize_irq() would wait forever for the thread to
+	 * complete, which is blocked on the bus lock.
+	 *
+	 * The still held desc->request_mutex() protects against a
+	 * concurrent request_irq() of this irq so the release of resources
+	 * and timing data is properly serialized.
+	 */
 	chip_bus_sync_unlock(desc);
 
 	unregister_handler_proc(irq, action);
@@ -1530,8 +1566,15 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		}
 	}
 
+	/* Last action releases resources */
 	if (!desc->action) {
+		/*
+		 * Reacquire bus lock as irq_release_resources() might
+		 * require it to deallocate resources over the slow bus.
+		 */
+		chip_bus_lock(desc);
 		irq_release_resources(desc);
+		chip_bus_sync_unlock(desc);
 		irq_remove_timings(desc);
 	}
 
-- 
cgit v1.3-14-g43fede


From ab2f7cf141aa6734c4ca7525132d8cc236efee77 Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla <markivx@codeaurora.org>
Date: Thu, 6 Jul 2017 10:53:20 -0700
Subject: cpufreq: schedutil: Fix sugov_start() versus sugov_update_shared()
 race

With a shared policy in place, when one of the CPUs in the policy is
hotplugged out and then brought back online, sugov_stop() and
sugov_start() are called in order.

sugov_stop() removes utilization hooks for each CPU in the policy and
does nothing else in the for_each_cpu() loop. sugov_start() on the
other hand iterates through the CPUs in the policy and re-initializes
the per-cpu structure _and_ adds the utilization hook.  This implies
that the scheduler is allowed to invoke a CPU's utilization update
hook when the rest of the per-cpu structures have yet to be
re-inited.

Apart from some strange values in tracepoints this doesn't cause a
problem, but if we do end up accessing a pointer from the per-cpu
sugov_cpu structure somewhere in the sugov_update_shared() path,
we will likely see crashes since the memset for another CPU in the
policy is free to race with sugov_update_shared from the CPU that is
ready to go.  So let's fix this now to first init all per-cpu
structures, and then add the per-cpu utilization update hooks all at
once.

Signed-off-by: Vikram Mulukutla <markivx@codeaurora.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..29a397067ffa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy)
 		sg_cpu->sg_policy = sg_policy;
 		sg_cpu->flags = SCHED_CPUFREQ_RT;
 		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+	}
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
 		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
 					     policy_is_shared(policy) ?
 							sugov_update_shared :
-- 
cgit v1.3-14-g43fede


From 44925dfff05fd1a897992d278b15a6b6b55e79a7 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 12 Jul 2017 10:33:40 +0300
Subject: ftrace: Remove an unneeded NULL check

"func" can't be NULL and it doesn't make sense to check because we've
already dereferenced it.

Link: http://lkml.kernel.org/r/20170712073340.4enzeojeoupuds5a@mwanda

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4706f0ed193e..5fb5b40b3ae8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3950,7 +3950,7 @@ static int cache_mod(struct trace_array *tr,
 				continue;
 
 			/* no func matches all */
-			if (!func || strcmp(func, "*") == 0 ||
+			if (strcmp(func, "*") == 0 ||
 			    (ftrace_mod->func &&
 			     strcmp(ftrace_mod->func, func) == 0)) {
 				ret = 0;
-- 
cgit v1.3-14-g43fede


From 2e028c4fe12907f226b8221815f16c2486ad3aa7 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 12 Jul 2017 10:35:57 +0300
Subject: ftrace: Fix uninitialized variable in match_records()

My static checker complains that if "func" is NULL then "clear_filter"
is uninitialized.  This seems like it could be true, although it's
possible something subtle is happening that I haven't seen.

    kernel/trace/ftrace.c:3844 match_records()
    error: uninitialized symbol 'clear_filter'.

Link: http://lkml.kernel.org/r/20170712073556.h6tkpjcdzjaozozs@mwanda

Cc: stable@vger.kernel.org
Fixes: f0a3b154bd7 ("ftrace: Clarify code for mod command")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5fb5b40b3ae8..53f6b6401cf0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3816,7 +3816,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
 	int exclude_mod = 0;
 	int found = 0;
 	int ret;
-	int clear_filter;
+	int clear_filter = 0;
 
 	if (func) {
 		func_g.type = filter_parse_regex(func, len, &func_g.search,
-- 
cgit v1.3-14-g43fede


From 58c7ffc0747a3a9145629d4966291f0586703767 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 12 Jul 2017 04:59:45 +0100
Subject: fix a braino in compat_sys_getrlimit()

Reported-and-tested-by: Meelis Roos <mroos@linux.ee>
Fixes: commit d9e968cb9f84 "getrlimit()/setrlimit(): move compat to native"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 73fc0af147d0..2855ee73acd0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
 
 	ret = do_prlimit(current, resource, NULL, &r);
 	if (!ret) {
-		struct rlimit r32;
+		struct compat_rlimit r32;
 		if (r.rlim_cur > COMPAT_RLIM_INFINITY)
 			r32.rlim_cur = COMPAT_RLIM_INFINITY;
 		else
-- 
cgit v1.3-14-g43fede


From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Wed, 12 Jul 2017 14:33:11 -0700
Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts

The reason to disable interrupts seems to be to avoid switching to a
different processor while handling per cpu data using individual loads and
stores.  If we use per cpu RMW primitives we will not have to disable
interrupts.

Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org
Signed-off-by: Christoph Lameter <cl@linux.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0f69a3e5281e..d2b9d7c31eaf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 	void *stack;
 	int i;
 
-	local_irq_disable();
 	for (i = 0; i < NR_CACHED_STACKS; i++) {
-		struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+		struct vm_struct *s;
+
+		s = this_cpu_xchg(cached_stacks[i], NULL);
 
 		if (!s)
 			continue;
-		this_cpu_write(cached_stacks[i], NULL);
 
 		tsk->stack_vm_area = s;
-		local_irq_enable();
 		return s->addr;
 	}
-	local_irq_enable();
 
 	stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
 				     VMALLOC_START, VMALLOC_END,
@@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk)
 {
 #ifdef CONFIG_VMAP_STACK
 	if (task_stack_vm_area(tsk)) {
-		unsigned long flags;
 		int i;
 
-		local_irq_save(flags);
 		for (i = 0; i < NR_CACHED_STACKS; i++) {
-			if (this_cpu_read(cached_stacks[i]))
+			if (this_cpu_cmpxchg(cached_stacks[i],
+					NULL, tsk->stack_vm_area) != NULL)
 				continue;
 
-			this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
-			local_irq_restore(flags);
 			return;
 		}
-		local_irq_restore(flags);
 
 		vfree_atomic(tsk->stack);
 		return;
-- 
cgit v1.3-14-g43fede


From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Wed, 12 Jul 2017 14:33:14 -0700
Subject: kexec: move vmcoreinfo out of the kernel's .bss section

As Eric said,
 "what we need to do is move the variable vmcoreinfo_note out of the
  kernel's .bss section. And modify the code to regenerate and keep this
  information in something like the control page.

  Definitely something like this needs a page all to itself, and ideally
  far away from any other kernel data structures. I clearly was not
  watching closely the data someone decided to keep this silly thing in
  the kernel's .bss section."

This patch allocates extra pages for these vmcoreinfo_XXX variables.  One
advantage is that it makes vmcoreinfo somewhat safer, because vmcoreinfo
is now kept far away from other kernel data structures.

Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Tested-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Suggested-by: Eric Biederman <ebiederm@xmission.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/machine_kexec.c |  5 -----
 arch/s390/kernel/machine_kexec.c |  1 +
 arch/s390/kernel/setup.c         |  6 ------
 arch/x86/kernel/crash.c          |  2 +-
 arch/x86/xen/mmu_pv.c            |  4 ++--
 include/linux/crash_core.h       |  4 ++--
 kernel/crash_core.c              | 26 ++++++++++++++++++++++----
 kernel/ksysfs.c                  |  2 +-
 8 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c
index 599507bcec91..c14815dca747 100644
--- a/arch/ia64/kernel/machine_kexec.c
+++ b/arch/ia64/kernel/machine_kexec.c
@@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 }
 
-phys_addr_t paddr_vmcoreinfo_note(void)
-{
-	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 49a6bd45957b..3d0b14afa232 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void)
 	VMCOREINFO_SYMBOL(lowcore_ptr);
 	VMCOREINFO_SYMBOL(high_memory);
 	VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
+	mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
 }
 
 void machine_shutdown(void)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 3ae756c0db3d..3d1d808ea8a9 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -496,11 +496,6 @@ static void __init setup_memory_end(void)
 	pr_notice("The maximum memory size is %luMB\n", memory_end >> 20);
 }
 
-static void __init setup_vmcoreinfo(void)
-{
-	mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note());
-}
-
 #ifdef CONFIG_CRASH_DUMP
 
 /*
@@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	setup_resources();
-	setup_vmcoreinfo();
 	setup_lowcore();
 	smp_fill_possible_mask();
 	cpu_detect_mhz_feature();
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 22217ece26c8..44404e2307bb 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced,
 	bufp += sizeof(Elf64_Phdr);
 	phdr->p_type = PT_NOTE;
 	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
-	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+	phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE;
 	(ehdr->e_phnum)++;
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1d7a7213a310..cab28cf2cffb 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
 phys_addr_t paddr_vmcoreinfo_note(void)
 {
 	if (xen_pv_domain())
-		return virt_to_machine(&vmcoreinfo_note).maddr;
+		return virt_to_machine(vmcoreinfo_note).maddr;
 	else
-		return __pa_symbol(&vmcoreinfo_note);
+		return __pa(vmcoreinfo_note);
 }
 #endif /* CONFIG_KEXEC_CORE */
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 4090a42578a8..87506a02e914 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -19,7 +19,7 @@
 				     CRASH_CORE_NOTE_NAME_BYTES +	\
 				     CRASH_CORE_NOTE_DESC_BYTES)
 
-#define VMCOREINFO_BYTES	   (4096)
+#define VMCOREINFO_BYTES	   PAGE_SIZE
 #define VMCOREINFO_NOTE_NAME	   "VMCOREINFO"
 #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
 #define VMCOREINFO_NOTE_SIZE	   ((CRASH_CORE_NOTE_HEAD_BYTES * 2) +	\
@@ -56,7 +56,7 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 #define VMCOREINFO_CONFIG(name) \
 	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
-extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
 extern size_t vmcoreinfo_max_size;
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fcbd568f1e95..2837d6164db8 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,10 +14,10 @@
 #include <asm/sections.h>
 
 /* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
+u32 *vmcoreinfo_note;
 
 /*
  * parsing the "crashkernel" commandline
@@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void)
 
 void crash_save_vmcoreinfo(void)
 {
+	if (!vmcoreinfo_note)
+		return;
+
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
 }
@@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void)
 
 phys_addr_t __weak paddr_vmcoreinfo_note(void)
 {
-	return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+	return __pa(vmcoreinfo_note);
 }
 
 static int __init crash_save_vmcoreinfo_init(void)
 {
+	vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+	if (!vmcoreinfo_data) {
+		pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+		return -ENOMEM;
+	}
+
+	vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!vmcoreinfo_note) {
+		free_page((unsigned long)vmcoreinfo_data);
+		vmcoreinfo_data = NULL;
+		pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+		return -ENOMEM;
+	}
+
 	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
 	VMCOREINFO_PAGESIZE(PAGE_SIZE);
 
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index df1a9aa602a0..46ba853656f6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
 {
 	phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
 	return sprintf(buf, "%pa %x\n", &vmcore_base,
-		       (unsigned int)sizeof(vmcoreinfo_note));
+			(unsigned int)VMCOREINFO_NOTE_SIZE);
 }
 KERNEL_ATTR_RO(vmcoreinfo);
 
-- 
cgit v1.3-14-g43fede


From 5203f4995d9a87952a83c2ce7866adbbe8f97bb5 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Wed, 12 Jul 2017 14:33:17 -0700
Subject: powerpc/fadump: use the correct VMCOREINFO_NOTE_SIZE for phdr

vmcoreinfo_max_size stands for the vmcoreinfo_data, the correct one we
should use is vmcoreinfo_note whose total size is VMCOREINFO_NOTE_SIZE.

Like explained in commit 77019967f06b ("kdump: fix exported size of
vmcoreinfo note"), it should not affect the actual function, but we
better fix it, also this change should be safe and backward compatible.

After this, we can get rid of variable vmcoreinfo_max_size, let's use
the corresponding macros directly, fewer variables means more safety for
vmcoreinfo operation.

[xlpang@redhat.com: fix build warning]
  Link: http://lkml.kernel.org/r/1494830606-27736-1-git-send-email-xlpang@redhat.com
Link: http://lkml.kernel.org/r/1493281021-20737-2-git-send-email-xlpang@redhat.com
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Reviewed-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Reviewed-by: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/fadump.c | 3 +--
 include/linux/crash_core.h   | 1 -
 kernel/crash_core.c          | 3 +--
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 3079518f2245..dc0c49cfd90a 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -999,8 +999,7 @@ static int fadump_create_elfcore_headers(char *bufp)
 
 	phdr->p_paddr	= fadump_relocate(paddr_vmcoreinfo_note());
 	phdr->p_offset	= phdr->p_paddr;
-	phdr->p_memsz	= vmcoreinfo_max_size;
-	phdr->p_filesz	= vmcoreinfo_max_size;
+	phdr->p_memsz	= phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
 
 	/* Increment number of program headers. */
 	(elf->e_phnum)++;
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 87506a02e914..e5df1b3cf072 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -58,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 
 extern u32 *vmcoreinfo_note;
 extern size_t vmcoreinfo_size;
-extern size_t vmcoreinfo_max_size;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 2837d6164db8..315adbf9cb68 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -16,7 +16,6 @@
 /* vmcoreinfo stuff */
 static unsigned char *vmcoreinfo_data;
 size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = VMCOREINFO_BYTES;
 u32 *vmcoreinfo_note;
 
 /*
@@ -343,7 +342,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
 	r = vscnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 
-	r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+	r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
 
 	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
 
-- 
cgit v1.3-14-g43fede


From 1229384f5b856d83698c38f9dedfd836e26711cb Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Wed, 12 Jul 2017 14:33:21 -0700
Subject: kdump: protect vmcoreinfo data under the crash memory

Currently vmcoreinfo data is updated at boot time in a subsys_initcall(),
so it is at risk of being modified by buggy code while the system is
running.

As a result, vmcore dumped may contain the wrong vmcoreinfo.  Later on,
when using "crash", "makedumpfile", etc utility to parse this vmcore, we
probably will get "Segmentation fault" or other unexpected errors.

E.g.  1) wrong code overwrites vmcoreinfo_data; 2) further crashes the
system; 3) trigger kdump, then we obviously will fail to recognize the
crash context correctly due to the corrupted vmcoreinfo.

Now except for vmcoreinfo, all the crash data is well
protected(including the cpu note which is fully updated in the crash
path, thus its correctness is guaranteed).  Given that vmcoreinfo data
is a large chunk prepared for kdump, we better protect it as well.

To solve this, we relocate and copy vmcoreinfo_data to the crash memory
when kdump is loading via kexec syscalls.  Because the whole crash
memory will be protected by existing arch_kexec_protect_crashkres()
mechanism, we naturally protect vmcoreinfo_data from write(even read)
access under kernel direct mapping after kdump is loaded.

Since kdump is usually loaded at the very early stage after boot, we can
trust the correctness of the vmcoreinfo data copied.

On the other hand, we still need to access the vmcoreinfo safe copy when
a crash happens in order to generate vmcoreinfo_note again.  For that we
rely on vmap() to map it at a new kernel virtual address, and switch to
that address in the subsequent crash_save_vmcoreinfo().

BTW, we do not touch vmcoreinfo_note, because it will be fully updated
using the protected vmcoreinfo_data after crash which is surely correct
just like the cpu crash note.

Link: http://lkml.kernel.org/r/1493281021-20737-3-git-send-email-xlpang@redhat.com
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Tested-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Young <dyoung@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/crash_core.h |  2 +-
 include/linux/kexec.h      |  2 ++
 kernel/crash_core.c        | 17 ++++++++++++++++-
 kernel/kexec.c             |  8 ++++++++
 kernel/kexec_core.c        | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/kexec_file.c        |  8 ++++++++
 6 files changed, 74 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index e5df1b3cf072..2df2118fbe13 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -28,6 +28,7 @@
 
 typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
 
+void crash_update_vmcoreinfo_safecopy(void *ptr);
 void crash_save_vmcoreinfo(void);
 void arch_crash_save_vmcoreinfo(void);
 __printf(1, 2)
@@ -57,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void);
 	vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
 
 extern u32 *vmcoreinfo_note;
-extern size_t vmcoreinfo_size;
 
 Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
 			  void *data, size_t data_len);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 65888418fb69..dd056fab9e35 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -172,6 +172,7 @@ struct kimage {
 	unsigned long start;
 	struct page *control_code_page;
 	struct page *swap_page;
+	void *vmcoreinfo_data_copy; /* locates in the crash memory */
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
@@ -241,6 +242,7 @@ extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 int kexec_crash_loaded(void);
 void crash_save_cpu(struct pt_regs *regs, int cpu);
+extern int kimage_crash_copy_vmcoreinfo(struct kimage *image);
 
 extern struct kimage *kexec_image;
 extern struct kimage *kexec_crash_image;
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 315adbf9cb68..6db80fc0810b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -15,9 +15,12 @@
 
 /* vmcoreinfo stuff */
 static unsigned char *vmcoreinfo_data;
-size_t vmcoreinfo_size;
+static size_t vmcoreinfo_size;
 u32 *vmcoreinfo_note;
 
+/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */
+static unsigned char *vmcoreinfo_data_safecopy;
+
 /*
  * parsing the "crashkernel" commandline
  *
@@ -323,11 +326,23 @@ static void update_vmcoreinfo_note(void)
 	final_note(buf);
 }
 
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+	if (ptr)
+		memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+	vmcoreinfo_data_safecopy = ptr;
+}
+
 void crash_save_vmcoreinfo(void)
 {
 	if (!vmcoreinfo_note)
 		return;
 
+	/* Use the safe copy to generate vmcoreinfo note if have */
+	if (vmcoreinfo_data_safecopy)
+		vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
 	vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
 	update_vmcoreinfo_note();
 }
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a90ee6..e62ec4dc6620 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (ret)
 		goto out;
 
+	/*
+	 * Some architecture(like S390) may touch the crash memory before
+	 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
+	 */
+	ret = kimage_crash_copy_vmcoreinfo(image);
+	if (ret)
+		goto out;
+
 	for (i = 0; i < nr_segments; i++) {
 		ret = kimage_load_segment(image, &image->segment[i]);
 		if (ret)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 154ffb489b93..1ae7c41c33c1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
 	return pages;
 }
 
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+	struct page *vmcoreinfo_page;
+	void *safecopy;
+
+	if (image->type != KEXEC_TYPE_CRASH)
+		return 0;
+
+	/*
+	 * For kdump, allocate one vmcoreinfo safe copy from the
+	 * crash memory. as we have arch_kexec_protect_crashkres()
+	 * after kexec syscall, we naturally protect it from write
+	 * (even read) access under kernel direct mapping. But on
+	 * the other hand, we still need to operate it when crash
+	 * happens to generate vmcoreinfo note, hereby we rely on
+	 * vmap for this purpose.
+	 */
+	vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+	if (!vmcoreinfo_page) {
+		pr_warn("Could not allocate vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+	safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+	if (!safecopy) {
+		pr_warn("Could not vmap vmcoreinfo buffer\n");
+		return -ENOMEM;
+	}
+
+	image->vmcoreinfo_data_copy = safecopy;
+	crash_update_vmcoreinfo_safecopy(safecopy);
+
+	return 0;
+}
+
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
 	if (*image->entry != 0)
@@ -569,6 +603,11 @@ void kimage_free(struct kimage *image)
 	if (!image)
 		return;
 
+	if (image->vmcoreinfo_data_copy) {
+		crash_update_vmcoreinfo_safecopy(NULL);
+		vunmap(image->vmcoreinfo_data_copy);
+	}
+
 	kimage_free_extra_pages(image);
 	for_each_kimage_entry(image, ptr, entry) {
 		if (entry & IND_INDIRECTION) {
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 766e7e4d3ad9..c8f7f77e9fa9 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -298,6 +298,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 	if (ret)
 		goto out;
 
+	/*
+	 * Some architecture(like S390) may touch the crash memory before
+	 * machine_kexec_prepare(), we must copy vmcoreinfo data after it.
+	 */
+	ret = kimage_crash_copy_vmcoreinfo(image);
+	if (ret)
+		goto out;
+
 	ret = kexec_calculate_store_digests(image);
 	if (ret)
 		goto out;
-- 
cgit v1.3-14-g43fede


From a19ac3374995382a994653ff372b98ea7cbad548 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 12 Jul 2017 14:33:30 -0700
Subject: sysctl: kdoc'ify sysctl_writes_strict

Document the different sysctl_writes_strict modes in code.

Link: http://lkml.kernel.org/r/20170519033554.18592-3-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..02725178694a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -174,11 +174,32 @@ extern int no_unaligned_warning;
 
 #ifdef CONFIG_PROC_SYSCTL
 
-#define SYSCTL_WRITES_LEGACY	-1
-#define SYSCTL_WRITES_WARN	 0
-#define SYSCTL_WRITES_STRICT	 1
+/**
+ * enum sysctl_writes_mode - supported sysctl write modes
+ *
+ * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
+ * 	to be written, and multiple writes on the same sysctl file descriptor
+ * 	will rewrite the sysctl value, regardless of file position. No warning
+ * 	is issued when the initial position is not 0.
+ * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
+ * 	not 0.
+ * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
+ * 	file position 0 and the value must be fully contained in the buffer
+ * 	sent to the write syscall. If dealing with strings respect the file
+ * 	position, but restrict this to the max length of the buffer, anything
+ * 	past the max length will be ignored. Multiple writes will append
+ * 	to the buffer.
+ *
+ * These write modes control how current file position affects the behavior of
+ * updating sysctl values through the proc interface on each write.
+ */
+enum sysctl_writes_mode {
+	SYSCTL_WRITES_LEGACY		= -1,
+	SYSCTL_WRITES_WARN		= 0,
+	SYSCTL_WRITES_STRICT		= 1,
+};
 
-static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
 
 static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
-- 
cgit v1.3-14-g43fede


From d383d48470819e86fe30eb72f0e9494e1ee0e2af Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 12 Jul 2017 14:33:33 -0700
Subject: sysctl: fold sysctl_writes_strict checks into helper

The mode sysctl_writes_strict positional checks keep being copy and pasted
as we add new proc handlers.  Just add a helper to avoid code duplication.

Link: http://lkml.kernel.org/r/20170519033554.18592-4-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Suggested-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 56 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 02725178694a..6f3bb1f099fa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1970,6 +1970,32 @@ static void warn_sysctl_write(struct ctl_table *table)
 		current->comm, table->procname);
 }
 
+/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+					   struct ctl_table *table)
+{
+	if (!*ppos)
+		return false;
+
+	switch (sysctl_writes_strict) {
+	case SYSCTL_WRITES_STRICT:
+		return true;
+	case SYSCTL_WRITES_WARN:
+		warn_sysctl_write(table);
+		return false;
+	default:
+		return false;
+	}
+}
+
 /**
  * proc_dostring - read a string sysctl
  * @table: the sysctl table
@@ -1990,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table)
 int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
-		warn_sysctl_write(table);
+	if (write)
+		proc_first_pos_non_zero_ignore(ppos, table);
 
 	return _proc_do_string((char *)(table->data), table->maxlen, write,
 			       (char __user *)buffer, lenp, ppos);
@@ -2193,17 +2219,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		conv = do_proc_dointvec_conv;
 
 	if (write) {
-		if (*ppos) {
-			switch (sysctl_writes_strict) {
-			case SYSCTL_WRITES_STRICT:
-				goto out;
-			case SYSCTL_WRITES_WARN:
-				warn_sysctl_write(table);
-				break;
-			default:
-				break;
-			}
-		}
+		if (proc_first_pos_non_zero_ignore(ppos, table))
+			goto out;
 
 		if (left > PAGE_SIZE - 1)
 			left = PAGE_SIZE - 1;
@@ -2468,17 +2485,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 	left = *lenp;
 
 	if (write) {
-		if (*ppos) {
-			switch (sysctl_writes_strict) {
-			case SYSCTL_WRITES_STRICT:
-				goto out;
-			case SYSCTL_WRITES_WARN:
-				warn_sysctl_write(table);
-				break;
-			default:
-				break;
-			}
-		}
+		if (proc_first_pos_non_zero_ignore(ppos, table))
+			goto out;
 
 		if (left > PAGE_SIZE - 1)
 			left = PAGE_SIZE - 1;
-- 
cgit v1.3-14-g43fede


From 4f2fec00afa60aa8e5d1b7f2a8e0526900f55623 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 12 Jul 2017 14:33:36 -0700
Subject: sysctl: simplify unsigned int support

Commit e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32
fields") added proc_douintvec() to start adding support for unsigned
int; this, however, was only half the work needed.  Two fixes have come
in since then for the following issues:

  o Printing the values shows a negative value; this happens because
    proc_douintvec() goes through do_proc_dointvec(), which uses
    proc_put_long()

This was fixed by commit 5380e5644afbba9 ("sysctl: don't print negative
flag for proc_douintvec").

  o We can easily wrap around the int values: UINT_MAX is 4294967295, if
    we echo in 4294967295 + 1 we end up with 0, using 4294967295 + 2 we
    end up with 1.
  o We echo negative values in and they are accepted

This was fixed by commit 425fffd886ba ("sysctl: report EINVAL if value
is larger than UINT_MAX for proc_douintvec").

It still also failed to be added to sysctl_check_table()...  Instead of
adding it with the current implementation, just provide proper and
simplified unsigned int support, with no array support and no negative
value support at all.

Historically sysctl proc helpers have supported arrays.  Due to the
complexity this adds, though, we've taken a step back to evaluate array
users to determine if it's worth maintaining for unsigned int.  An
evaluation using Coccinelle has been done to perform a grammatical
search to ask ourselves:

  o How many sysctl proc_dointvec() (int) users exist which likely
    should be moved over to proc_douintvec() (unsigned int) ?
	Answer: about 8
	- Of these how many are array users ?
		Answer: Probably only 1
  o How many sysctl array users exist ?
	Answer: about 12

This last question gives us an idea of just how popular arrays are: they
are not.  Array support should probably just be kept for strings.

The identified uint ports are:

  drivers/infiniband/core/ucma.c - max_backlog
  drivers/infiniband/core/iwcm.c - default_backlog
  net/core/sysctl_net_core.c - rps_sock_flow_sysctl()
  net/netfilter/nf_conntrack_timestamp.c - nf_conntrack_timestamp -- bool
  net/netfilter/nf_conntrack_acct.c nf_conntrack_acct -- bool
  net/netfilter/nf_conntrack_ecache.c - nf_conntrack_events -- bool
  net/netfilter/nf_conntrack_helper.c - nf_conntrack_helper -- bool
  net/phonet/sysctl.c proc_local_port_range()

The only possible array user is proc_local_port_range(), but it does not
seem worth it to add array support just for this given that range support
works just as well.  Unsigned int support is mostly desirable when you
*need* more than INT_MAX, or when int min/max support does not suffice
for your ranges.

If you forget and by mistake happen to register an unsigned int proc
entry with an array, the driver will fail and you will get something as
follows:

sysctl table check failed: debug/test_sysctl//uint_0002 array now allowed
CPU: 2 PID: 1342 Comm: modprobe Tainted: G        W   E <etc>
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS <etc>
Call Trace:
 dump_stack+0x63/0x81
 __register_sysctl_table+0x350/0x650
 ? kmem_cache_alloc_trace+0x107/0x240
 __register_sysctl_paths+0x1b3/0x1e0
 ? 0xffffffffc005f000
 register_sysctl_table+0x1f/0x30
 test_sysctl_init+0x10/0x1000 [test_sysctl]
 do_one_initcall+0x52/0x1a0
 ? kmem_cache_alloc_trace+0x107/0x240
 do_init_module+0x5f/0x200
 load_module+0x1867/0x1bd0
 ? __symbol_put+0x60/0x60
 SYSC_finit_module+0xdf/0x110
 SyS_finit_module+0xe/0x10
 entry_SYSCALL_64_fastpath+0x1e/0xad
RIP: 0033:0x7f042b22d119
<etc>

Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields")
Link: http://lkml.kernel.org/r/20170519033554.18592-5-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Suggested-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Liping Zhang <zlpnobody@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Heinrich Schuchardt <xypron.glpk@gmx.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c |  14 +++++
 kernel/sysctl.c       | 153 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 160 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 32c9c5630507..ee6feba8b6c0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1061,6 +1061,18 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
 	return -EINVAL;
 }
 
+static int sysctl_check_table_array(const char *path, struct ctl_table *table)
+{
+	int err = 0;
+
+	if (table->proc_handler == proc_douintvec) {
+		if (table->maxlen != sizeof(unsigned int))
+			err |= sysctl_err(path, table, "array now allowed");
+	}
+
+	return err;
+}
+
 static int sysctl_check_table(const char *path, struct ctl_table *table)
 {
 	int err = 0;
@@ -1081,6 +1093,8 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 				err |= sysctl_err(path, table, "No data");
 			if (!table->maxlen)
 				err |= sysctl_err(path, table, "No maxlen");
+			else
+				err |= sysctl_check_table_array(path, table);
 		}
 		if (!table->proc_handler)
 			err |= sysctl_err(path, table, "No proc_handler");
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6f3bb1f099fa..d12078fc215f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2175,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
-static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
-				 int *valp,
-				 int write, void *data)
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+				  unsigned int *valp,
+				  int write, void *data)
 {
 	if (write) {
-		if (*negp)
+		if (*lvalp > UINT_MAX)
 			return -EINVAL;
 		if (*lvalp > UINT_MAX)
 			return -EINVAL;
 		*valp = *lvalp;
 	} else {
 		unsigned int val = *valp;
-		*negp = false;
 		*lvalp = (unsigned long)val;
 	}
 	return 0;
@@ -2287,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 			buffer, lenp, ppos, conv, data);
 }
 
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+			       struct ctl_table *table,
+			       void __user *buffer,
+			       size_t *lenp, loff_t *ppos,
+			       int (*conv)(unsigned long *lvalp,
+					   unsigned int *valp,
+					   int write, void *data),
+			       void *data)
+{
+	unsigned long lval;
+	int err = 0;
+	size_t left;
+	bool neg;
+	char *kbuf = NULL, *p;
+
+	left = *lenp;
+
+	if (proc_first_pos_non_zero_ignore(ppos, table))
+		goto bail_early;
+
+	if (left > PAGE_SIZE - 1)
+		left = PAGE_SIZE - 1;
+
+	p = kbuf = memdup_user_nul(buffer, left);
+	if (IS_ERR(kbuf))
+		return -EINVAL;
+
+	left -= proc_skip_spaces(&p);
+	if (!left) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	err = proc_get_long(&p, &left, &lval, &neg,
+			     proc_wspace_sep,
+			     sizeof(proc_wspace_sep), NULL);
+	if (err || neg) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	if (conv(&lval, tbl_data, 1, data)) {
+		err = -EINVAL;
+		goto out_free;
+	}
+
+	if (!err && left)
+		left -= proc_skip_spaces(&p);
+
+out_free:
+	kfree(kbuf);
+	if (err)
+		return -EINVAL;
+
+	return 0;
+
+	/* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+	*ppos += *lenp;
+	return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
+			       size_t *lenp, loff_t *ppos,
+			       int (*conv)(unsigned long *lvalp,
+					   unsigned int *valp,
+					   int write, void *data),
+			       void *data)
+{
+	unsigned long lval;
+	int err = 0;
+	size_t left;
+
+	left = *lenp;
+
+	if (conv(&lval, tbl_data, 0, data)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = proc_put_long(&buffer, &left, lval, false);
+	if (err || !left)
+		goto out;
+
+	err = proc_put_char(&buffer, &left, '\n');
+
+out:
+	*lenp -= left;
+	*ppos += *lenp;
+
+	return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+			       int write, void __user *buffer,
+			       size_t *lenp, loff_t *ppos,
+			       int (*conv)(unsigned long *lvalp,
+					   unsigned int *valp,
+					   int write, void *data),
+			       void *data)
+{
+	unsigned int *i, vleft;
+
+	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	i = (unsigned int *) tbl_data;
+	vleft = table->maxlen / sizeof(*i);
+
+	/*
+	 * Arrays are not supported, keep this simple. *Do not* add
+	 * support for them.
+	 */
+	if (vleft != 1) {
+		*lenp = 0;
+		return -EINVAL;
+	}
+
+	if (!conv)
+		conv = do_proc_douintvec_conv;
+
+	if (write)
+		return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+					   conv, data);
+	return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp, loff_t *ppos,
+			     int (*conv)(unsigned long *lvalp,
+					 unsigned int *valp,
+					 int write, void *data),
+			     void *data)
+{
+	return __do_proc_douintvec(table->data, table, write,
+				   buffer, lenp, ppos, conv, data);
+}
+
 /**
  * proc_dointvec - read a vector of integers
  * @table: the sysctl table
@@ -2322,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write,
 int proc_douintvec(struct ctl_table *table, int write,
 		     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	return do_proc_dointvec(table, write, buffer, lenp, ppos,
-				do_proc_douintvec_conv, NULL);
+	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+				 do_proc_douintvec_conv, NULL);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 61d9b56a89208d8cccd0b4cfec7e6959717e16e3 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Wed, 12 Jul 2017 14:33:40 -0700
Subject: sysctl: add unsigned int range support

To keep parity with the regular int interfaces, provide an unsigned int
proc_douintvec_minmax() which allows you to specify a range of allowed
valid numbers.

Adding proc_douintvec_minmax_sysadmin() is easy but we can wait for an
actual user for that.

Link: http://lkml.kernel.org/r/20170519033554.18592-6-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Cc: Heinrich Schuchardt <xypron.glpk@gmx.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/proc_sysctl.c  |  4 ++-
 include/linux/sysctl.h |  3 +++
 kernel/sysctl.c        | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index ee6feba8b6c0..8f9d564d0969 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1065,7 +1065,8 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
 {
 	int err = 0;
 
-	if (table->proc_handler == proc_douintvec) {
+	if ((table->proc_handler == proc_douintvec) ||
+	    (table->proc_handler == proc_douintvec_minmax)) {
 		if (table->maxlen != sizeof(unsigned int))
 			err |= sysctl_err(path, table, "array now allowed");
 	}
@@ -1083,6 +1084,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 		if ((table->proc_handler == proc_dostring) ||
 		    (table->proc_handler == proc_dointvec) ||
 		    (table->proc_handler == proc_douintvec) ||
+		    (table->proc_handler == proc_douintvec_minmax) ||
 		    (table->proc_handler == proc_dointvec_minmax) ||
 		    (table->proc_handler == proc_dointvec_jiffies) ||
 		    (table->proc_handler == proc_dointvec_userhz_jiffies) ||
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 80d07816def0..225001d437ae 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -47,6 +47,9 @@ extern int proc_douintvec(struct ctl_table *, int,
 			 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_minmax(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
+extern int proc_douintvec_minmax(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp,
+				 loff_t *ppos);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d12078fc215f..df9f2a367882 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2567,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 				do_proc_dointvec_minmax_conv, &param);
 }
 
+struct do_proc_douintvec_minmax_conv_param {
+	unsigned int *min;
+	unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+					 unsigned int *valp,
+					 int write, void *data)
+{
+	struct do_proc_douintvec_minmax_conv_param *param = data;
+
+	if (write) {
+		unsigned int val = *lvalp;
+
+		if ((param->min && *param->min > val) ||
+		    (param->max && *param->max < val))
+			return -ERANGE;
+
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+		*valp = val;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long) val;
+	}
+
+	return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes a single unsigned integer value from/to the user buffer,
+ * treated as an ASCII string (arrays are not supported). Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_douintvec_minmax_conv_param param = {
+		.min = (unsigned int *) table->extra1,
+		.max = (unsigned int *) table->extra2,
+	};
+	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+				 do_proc_douintvec_minmax_conv, &param);
+}
+
 static void validate_coredump_safety(void)
 {
 #ifdef CONFIG_COREDUMP
@@ -3066,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3108,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec);
 EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
 EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
-- 
cgit v1.3-14-g43fede


From 9380fa60b10ebd6ee7c3fcdb2cf162f4d7cf9fc5 Mon Sep 17 00:00:00 2001
From: Mateusz Jurczyk <mjurczyk@google.com>
Date: Wed, 12 Jul 2017 14:34:01 -0700
Subject: kernel/sysctl_binary.c: check name array length in
 deprecated_sysctl_warning()

Prevent use of uninitialized memory (originating from the stack frame of
do_sysctl()) by verifying that the name array is filled with sufficient
input data before comparing its specific entries with integer constants.

Through timing measurement or analyzing the kernel debug logs, a
user-mode program could potentially infer the results of comparisons
against the uninitialized memory, and acquire some (very limited)
information about the state of the kernel stack.  The change also
eliminates possible future warnings by tools such as KMSAN and other
code checkers / instrumentations.

Link: http://lkml.kernel.org/r/20170524122139.21333-1-mjurczyk@google.com
Signed-off-by: Mateusz Jurczyk <mjurczyk@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Matthew Whitehead <tedheadster@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl_binary.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 939a158eab11..02e1859f2ca8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
 	 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
 	 * ever go away.
 	 */
-	if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
+	if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
 		return;
 
 	if (printk_ratelimit()) {
-- 
cgit v1.3-14-g43fede


From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Wed, 12 Jul 2017 14:34:28 -0700
Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files

With the current epoll architecture, target files are addressed by
file_struct and file descriptor number, where the latter is not unique.
Moreover, files can be transferred from another process via a unix
socket, added to the queue and then closed, so we won't find this
descriptor in the task fdinfo list.

Thus to checkpoint and restore such processes CRIU needs to find out
where exactly the target file is present to add it into epoll queue.
For this sake one can use kcmp call where some particular target file
from the queue is compared with arbitrary file passed as an argument.

Because epoll target files can have same file descriptor number but
different file_struct a caller should explicitly specify the offset
within.

To test if some particular file matches an entry inside epoll, one has
to

 - fill kcmp_epoll_slot structure with epoll file descriptor,
   target file number and target file offset (in case if only
   one target is present then it should be 0)

 - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
    - the kernel fetches the file pointer matching file descriptor @fd of pid1
    - looks up the file struct in the epoll queue of pid2 and returns the
      traditional 0,1,2 result for sorting purposes

Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrey Vagin <avagin@openvz.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c            | 42 ++++++++++++++++++++++++++++++++++
 include/linux/eventpoll.h |  3 +++
 include/uapi/linux/kcmp.h | 10 +++++++++
 kernel/kcmp.c             | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+)

(limited to 'kernel')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 322904c3ebdf..e7e9901c3790 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1077,6 +1077,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (epi->ffd.fd == tfd) {
+			if (toff == 0)
+				return epi;
+			else
+				toff--;
+		}
+		cond_resched();
+	}
+
+	return NULL;
+}
+
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+				     unsigned long toff)
+{
+	struct file *file_raw;
+	struct eventpoll *ep;
+	struct epitem *epi;
+
+	if (!is_file_epoll(file))
+		return ERR_PTR(-EINVAL);
+
+	ep = file->private_data;
+
+	mutex_lock(&ep->mtx);
+	epi = ep_find_tfd(ep, tfd, toff);
+	if (epi)
+		file_raw = epi->ffd.file;
+	else
+		file_raw = ERR_PTR(-ENOENT);
+	mutex_unlock(&ep->mtx);
+
+	return file_raw;
+}
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 6daf6d4971f6..d8625d214ea7 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -14,6 +14,7 @@
 #define _LINUX_EVENTPOLL_H
 
 #include <uapi/linux/eventpoll.h>
+#include <uapi/linux/kcmp.h>
 
 
 /* Forward declarations to avoid compiler errors */
@@ -22,6 +23,8 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
+
 /* Used to initialize the epoll bits inside the "struct file" */
 static inline void eventpoll_init_file(struct file *file)
 {
diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h
index 84df14b37360..481e103da78e 100644
--- a/include/uapi/linux/kcmp.h
+++ b/include/uapi/linux/kcmp.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI_LINUX_KCMP_H
 #define _UAPI_LINUX_KCMP_H
 
+#include <linux/types.h>
+
 /* Comparison type */
 enum kcmp_type {
 	KCMP_FILE,
@@ -10,8 +12,16 @@ enum kcmp_type {
 	KCMP_SIGHAND,
 	KCMP_IO,
 	KCMP_SYSVSEM,
+	KCMP_EPOLL_TFD,
 
 	KCMP_TYPES,
 };
 
+/* Slot for KCMP_EPOLL_TFD */
+struct kcmp_epoll_slot {
+	__u32 efd;		/* epoll file descriptor */
+	__u32 tfd;		/* target file number */
+	__u32 toff;		/* target offset within same numbered sequence */
+};
+
 #endif /* _UAPI_LINUX_KCMP_H */
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 3a47fa998fe0..ea34ed8bb952 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
 #include <linux/bug.h>
 #include <linux/err.h>
 #include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
 
 #include <asm/unistd.h>
 
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
 	return err;
 }
 
+#ifdef CONFIG_EPOLL
+/* Compare fd @idx1 of @task1 with the epoll target @uslot names in @task2. */
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2, unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	struct file *filp, *filp_epoll, *filp_tgt;
+	struct kcmp_epoll_slot slot;
+	struct files_struct *files;
+
+	if (copy_from_user(&slot, uslot, sizeof(slot)))
+		return -EFAULT;
+
+	filp = get_file_raw_ptr(task1, idx1);
+	if (!filp)
+		return -EBADF;
+
+	files = get_files_struct(task2);
+	if (!files)
+		return -EBADF;
+
+	/* Take a reference on the epoll file before file_lock is dropped. */
+	spin_lock(&files->file_lock);
+	filp_epoll = fcheck_files(files, slot.efd);
+	if (filp_epoll)
+		get_file(filp_epoll);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (!filp_epoll)
+		return -EBADF;
+
+	filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+	fput(filp_epoll);
+	/* The lookup can fail (-EINVAL, -ENOENT): never compare an ERR_PTR. */
+	if (IS_ERR(filp_tgt))
+		return PTR_ERR(filp_tgt);
+
+	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		unsigned long, idx1, unsigned long, idx2)
 {
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		ret = -EOPNOTSUPP;
 #endif
 		break;
+	case KCMP_EPOLL_TFD:
+		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Wed, 12 Jul 2017 14:34:35 -0700
Subject: fault-inject: support systematic fault injection

Add /proc/self/task/<current-tid>/fail-nth file that allows failing
0-th, 1-st, 2-nd and so on calls systematically.
Excerpt from the added documentation:

 "Write to this file of integer N makes N-th call in the current task
  fail (N is 0-based). Read from this file returns a single char 'Y' or
  'N' that says if the fault setup with a previous write to this file
  was injected or not, and disables the fault if it wasn't yet injected.
  Note that this file enables all types of faults (slab, futex, etc).
  This setting takes precedence over all other generic settings like
  probability, interval, times, etc. But per-capability settings (e.g.
  fail_futex/ignore-private) take precedence over it. This feature is
  intended for systematic testing of faults in a single system call. See
  an example below"

Why add a new setting:
1. Existing settings are global rather than per-task.
   So parallel testing is not possible.
2. attr->interval is close but it depends on attr->count
   which is not reset to 0, so interval does not work as expected.
3. Trying to model this with existing settings requires manipulations
   of all of probability, interval, times, space, task-filter and
   unexposed count and per-task make-it-fail files.
4. Existing settings are per-failure-type, and the set of failure
   types is potentially expanding.
5. make-it-fail can't be changed by unprivileged user and aggressive
   stress testing better be done from an unprivileged user.
   Similarly, this would require opening the debugfs files to the
   unprivileged user, as he would need to reopen at least times file
   (not possible to pre-open before dropping privs).

The proposed interface solves all of the above (see the example).

We want to integrate this into syzkaller fuzzer.  A prototype has found
10 bugs in kernel in first day of usage:

  https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance

I've made the current interface work with all types of our sandboxes.
For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to
make /proc entries non-root owned.  So I am fine with the current
version of the code.

[akpm@linux-foundation.org: fix build]
Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++
 fs/proc/base.c                                    | 52 +++++++++++++++
 include/linux/sched.h                             |  1 +
 kernel/fork.c                                     |  4 ++
 lib/fault-inject.c                                |  7 ++
 5 files changed, 142 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 415484f3d59a..192d8cbcc5f9 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -134,6 +134,22 @@ use the boot option:
 	fail_futex=
 	mmc_core.fail_request=<interval>,<probability>,<space>,<times>
 
+o proc entries
+
+- /proc/self/task/<current-tid>/fail-nth:
+
+	Write to this file of integer N makes N-th call in the current task fail
+	(N is 0-based). Read from this file returns a single char 'Y' or 'N'
+	that says if the fault setup with a previous write to this file was
+	injected or not, and disables the fault if it wasn't yet injected.
+	Note that this file enables all types of faults (slab, futex, etc).
+	This setting takes precedence over all other generic debugfs settings
+	like probability, interval, times, etc. But per-capability settings
+	(e.g. fail_futex/ignore-private) take precedence over it.
+
+	This feature is intended for systematic testing of faults in a single
+	system call. See an example below.
+
 How to add new fault injection capability
 -----------------------------------------
 
@@ -278,3 +294,65 @@ allocation failure.
 	# env FAILCMD_TYPE=fail_page_alloc \
 		./tools/testing/fault-injection/failcmd.sh --times=100 \
                 -- make -C tools/testing/selftests/ run_tests
+
+Systematic faults using fail-nth
+---------------------------------
+
+The following code systematically faults the 0-th, 1-st, 2-nd, and so on,
+capabilities in the socketpair() system call.
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+
+int main()
+{
+	int i, err, res, fail_nth, fds[2];
+	char buf[128];
+
+	system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait");
+	sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid));
+	fail_nth = open(buf, O_RDWR);
+	for (i = 0;; i++) {
+		sprintf(buf, "%d", i);
+		write(fail_nth, buf, strlen(buf));
+		res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds);
+		err = errno;
+		read(fail_nth, buf, 1);
+		if (res == 0) {
+			close(fds[0]);
+			close(fds[1]);
+		}
+		printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err);
+		if (buf[0] != 'Y')
+			break;
+	}
+	return 0;
+}
+
+An example output:
+
+0-th fault Y: res=-1/23
+1-th fault Y: res=-1/23
+2-th fault Y: res=-1/23
+3-th fault Y: res=-1/12
+4-th fault Y: res=-1/12
+5-th fault Y: res=-1/23
+6-th fault Y: res=-1/23
+7-th fault Y: res=-1/23
+8-th fault Y: res=-1/12
+9-th fault Y: res=-1/12
+10-th fault Y: res=-1/12
+11-th fault Y: res=-1/12
+12-th fault Y: res=-1/12
+13-th fault Y: res=-1/12
+14-th fault Y: res=-1/12
+15-th fault Y: res=-1/12
+16-th fault N: res=0/12
diff --git a/fs/proc/base.c b/fs/proc/base.c
index f1e1927ccd48..88b773f318cd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1355,6 +1355,53 @@ static const struct file_operations proc_fault_inject_operations = {
 	.write		= proc_fault_inject_write,
 	.llseek		= generic_file_llseek,
 };
+
+static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	int err, n;
+
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		return -ESRCH;
+	put_task_struct(task);
+	if (task != current)
+		return -EPERM;
+	err = kstrtoint_from_user(buf, count, 10, &n);
+	if (err)
+		return err;
+	if (n < 0 || n == INT_MAX)
+		return -EINVAL;
+	current->fail_nth = n + 1;
+	return count;
+}
+
+static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	int err;
+
+	task = get_proc_task(file_inode(file));
+	if (!task)
+		return -ESRCH;
+	put_task_struct(task);
+	if (task != current)
+		return -EPERM;
+	if (count < 1)
+		return -EINVAL;
+	err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf);
+	if (err)
+		return err;
+	current->fail_nth = 0;
+	return 1;
+}
+
+static const struct file_operations proc_fail_nth_operations = {
+	.read		= proc_fail_nth_read,
+	.write		= proc_fail_nth_write,
+};
 #endif
 
 
@@ -3311,6 +3358,11 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
+	/*
+	 * Operations on the file check that the task is current, so we
+	 * create it with 0666 to support testing as an unprivileged user.
+	 */
+	REG("fail-nth", 0666, proc_fail_nth_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	ONE("io",	S_IRUSR, proc_tid_io_accounting),
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20814b7d7d70..3822d749fc9e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -974,6 +974,7 @@ struct task_struct {
 
 #ifdef CONFIG_FAULT_INJECTION
 	int				make_it_fail;
+	int fail_nth;
 #endif
 	/*
 	 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
diff --git a/kernel/fork.c b/kernel/fork.c
index d2b9d7c31eaf..ade237a96308 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 	kcov_task_init(tsk);
 
+#ifdef CONFIG_FAULT_INJECTION
+	tsk->fail_nth = 0;
+#endif
+
 	return tsk;
 
 free_stack:
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index 4ff157159a0d..09ac73c177fd 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr)
 
 bool should_fail(struct fault_attr *attr, ssize_t size)
 {
+	if (in_task() && current->fail_nth) {
+		if (--current->fail_nth == 0)
+			goto fail;
+		return false;
+	}
+
 	/* No need to check any other properties if the probability is 0 */
 	if (attr->probability == 0)
 		return false;
@@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size)
 	if (!fail_stacktrace(attr))
 		return false;
 
+fail:
 	fail_dump(attr);
 
 	if (atomic_read(&attr->times) != -1)
-- 
cgit v1.3-14-g43fede


From f2e0cff85ed111a3cf24d894c3fa11697dfae628 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 12 Jul 2017 14:35:43 -0700
Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog()

For architectures that define HAVE_NMI_WATCHDOG, instead of having them
provide the complete touch_nmi_watchdog() function, just have them
provide arch_touch_nmi_watchdog().

This gives the generic code more flexibility in implementing this
function, and arch implementations don't miss out on touching the
softlockup watchdog or other generic details.

Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Reviewed-by: Babu Moger <babu.moger@oracle.com>
Tested-by: Babu Moger <babu.moger@oracle.com>	[sparc]
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/blackfin/include/asm/nmi.h            |  2 ++
 arch/blackfin/kernel/nmi.c                 |  2 +-
 arch/mn10300/include/asm/nmi.h             |  2 ++
 arch/mn10300/kernel/mn10300-watchdog-low.S |  8 ++++----
 arch/mn10300/kernel/mn10300-watchdog.c     |  2 +-
 arch/sparc/include/asm/nmi.h               |  1 +
 arch/sparc/kernel/nmi.c                    |  6 ++----
 include/linux/nmi.h                        | 27 ++++++++++++++++-----------
 kernel/watchdog_hld.c                      |  5 ++---
 9 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/arch/blackfin/include/asm/nmi.h b/arch/blackfin/include/asm/nmi.h
index b9caac4fcfd8..107d23705f46 100644
--- a/arch/blackfin/include/asm/nmi.h
+++ b/arch/blackfin/include/asm/nmi.h
@@ -9,4 +9,6 @@
 
 #include <linux/nmi.h>
 
+extern void arch_touch_nmi_watchdog(void);
+
 #endif
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 633c37083e87..1e714329fe8a 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -190,7 +190,7 @@ static int __init init_nmi_wdt(void)
 }
 device_initcall(init_nmi_wdt);
 
-void touch_nmi_watchdog(void)
+void arch_touch_nmi_watchdog(void)
 {
 	atomic_set(&nmi_touched[smp_processor_id()], 1);
 }
diff --git a/arch/mn10300/include/asm/nmi.h b/arch/mn10300/include/asm/nmi.h
index f3671cbbc117..b05627597b1b 100644
--- a/arch/mn10300/include/asm/nmi.h
+++ b/arch/mn10300/include/asm/nmi.h
@@ -11,4 +11,6 @@
 #ifndef _ASM_NMI_H
 #define _ASM_NMI_H
 
+extern void arch_touch_nmi_watchdog(void);
+
 #endif /* _ASM_NMI_H */
diff --git a/arch/mn10300/kernel/mn10300-watchdog-low.S b/arch/mn10300/kernel/mn10300-watchdog-low.S
index f2f5c9cfaabd..34f8773de7d0 100644
--- a/arch/mn10300/kernel/mn10300-watchdog-low.S
+++ b/arch/mn10300/kernel/mn10300-watchdog-low.S
@@ -50,9 +50,9 @@ watchdog_handler:
 #   we can't inline it)
 #
 ###############################################################################
-	.globl	touch_nmi_watchdog
-	.type	touch_nmi_watchdog,@function
-touch_nmi_watchdog:
+	.globl	arch_touch_nmi_watchdog
+	.type	arch_touch_nmi_watchdog,@function
+arch_touch_nmi_watchdog:
 	clr	d0
 	clr	d1
 	mov	watchdog_alert_counter, a0
@@ -63,4 +63,4 @@ touch_nmi_watchdog:
 	lne
 	ret	[],0
 
-	.size	touch_nmi_watchdog,.-touch_nmi_watchdog
+	.size	arch_touch_nmi_watchdog,.-arch_touch_nmi_watchdog
diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c
index a2d8e6938d67..0d5641beadf5 100644
--- a/arch/mn10300/kernel/mn10300-watchdog.c
+++ b/arch/mn10300/kernel/mn10300-watchdog.c
@@ -31,7 +31,7 @@ static unsigned int watchdog;
 static unsigned int watchdog_hz = 1;
 unsigned int watchdog_alert_counter[NR_CPUS];
 
-EXPORT_SYMBOL(touch_nmi_watchdog);
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
 /*
  * the best way to detect whether a CPU has a 'hard lockup' problem
diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h
index 26ad2b2607c6..284eac3ffaf2 100644
--- a/arch/sparc/include/asm/nmi.h
+++ b/arch/sparc/include/asm/nmi.h
@@ -7,6 +7,7 @@ void nmi_adjust_hz(unsigned int new_hz);
 
 extern atomic_t nmi_active;
 
+void arch_touch_nmi_watchdog(void);
 void start_nmi_watchdog(void *unused);
 void stop_nmi_watchdog(void *unused);
 
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index 95e73c63c99d..048ad783ea3f 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -51,7 +51,7 @@ static DEFINE_PER_CPU(unsigned int, last_irq_sum);
 static DEFINE_PER_CPU(long, alert_counter);
 static DEFINE_PER_CPU(int, nmi_touch);
 
-void touch_nmi_watchdog(void)
+void arch_touch_nmi_watchdog(void)
 {
 	if (atomic_read(&nmi_active)) {
 		int cpu;
@@ -61,10 +61,8 @@ void touch_nmi_watchdog(void)
 				per_cpu(nmi_touch, cpu) = 1;
 		}
 	}
-
-	touch_softlockup_watchdog();
 }
-EXPORT_SYMBOL(touch_nmi_watchdog);
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
 static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
 {
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5e2e57536d98..bd387ef8bccd 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -6,6 +6,9 @@
 
 #include <linux/sched.h>
 #include <asm/irq.h>
+#if defined(CONFIG_HAVE_NMI_WATCHDOG)
+#include <asm/nmi.h>
+#endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 extern void touch_softlockup_watchdog_sched(void);
@@ -58,6 +61,18 @@ static inline void reset_hung_task_detector(void)
 #define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
 #define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)
 
+#if defined(CONFIG_HARDLOCKUP_DETECTOR)
+extern void hardlockup_detector_disable(void);
+#else
+static inline void hardlockup_detector_disable(void) {}
+#endif
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+extern void arch_touch_nmi_watchdog(void);
+#else
+static inline void arch_touch_nmi_watchdog(void) {}
+#endif
+
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
  * 
@@ -65,21 +80,11 @@ static inline void reset_hung_task_detector(void)
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
  */
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
-#include <asm/nmi.h>
-extern void touch_nmi_watchdog(void);
-#else
 static inline void touch_nmi_watchdog(void)
 {
+	arch_touch_nmi_watchdog();
 	touch_softlockup_watchdog();
 }
-#endif
-
-#if defined(CONFIG_HARDLOCKUP_DETECTOR)
-extern void hardlockup_detector_disable(void);
-#else
-static inline void hardlockup_detector_disable(void) {}
-#endif
 
 /*
  * Create trigger_all_cpu_backtrace() out of the arch-provided
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 54a427d1f344..90d688df6ce1 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str)
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
 
-void touch_nmi_watchdog(void)
+void arch_touch_nmi_watchdog(void)
 {
 	/*
 	 * Using __raw here because some code paths have
@@ -66,9 +66,8 @@ void touch_nmi_watchdog(void)
 	 * going off.
 	 */
 	raw_cpu_write(watchdog_nmi_touch, true);
-	touch_softlockup_watchdog();
 }
-EXPORT_SYMBOL(touch_nmi_watchdog);
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
-- 
cgit v1.3-14-g43fede


From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 12 Jul 2017 14:35:46 -0700
Subject: kernel/watchdog: split up config options

Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split
HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR.

LOCKUP_DETECTOR implies the general boot, sysctl, and programming
interfaces for the lockup detectors.

An architecture that wants to use a hard lockup detector must define
HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH.

Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the
minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and
does not implement the LOCKUP_DETECTOR interfaces.

sparc is unusual in that it has started to implement some of the
interfaces, but not fully yet.  It should probably be converted to a full
HAVE_HARDLOCKUP_DETECTOR_ARCH.

[npiggin@gmail.com: fix]
  Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com
Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Reviewed-by: Babu Moger <babu.moger@oracle.com>
Tested-by: Babu Moger <babu.moger@oracle.com>	[sparc]
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                   |  25 ++++-
 arch/powerpc/Kconfig           |   1 +
 arch/powerpc/kernel/setup_64.c |   2 +-
 arch/x86/Kconfig               |   1 +
 arch/x86/kernel/apic/hw_nmi.c  |   2 +-
 include/linux/nmi.h            |  29 +++--
 kernel/Makefile                |   2 +-
 kernel/sysctl.c                |  31 +++---
 kernel/watchdog.c              | 243 +++++++++++++++++++++++++++--------------
 kernel/watchdog_hld.c          |  32 ------
 lib/Kconfig.debug              |  45 +++++---
 11 files changed, 251 insertions(+), 162 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index cae0958a2298..fb9bd7d36b05 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE
 config HAVE_NMI
 	bool
 
-config HAVE_NMI_WATCHDOG
-	depends on HAVE_NMI
-	bool
 #
 # An arch should select this if it provides all these things:
 #
@@ -288,6 +285,28 @@ config HAVE_PERF_EVENTS_NMI
 	  subsystem.  Also has support for calculating CPU cycle events
 	  to determine how many clock cycles in a given period.
 
+config HAVE_HARDLOCKUP_DETECTOR_PERF
+	bool
+	depends on HAVE_PERF_EVENTS_NMI
+	help
+	  The arch chooses to use the generic perf-NMI-based hardlockup
+	  detector. Must define HAVE_PERF_EVENTS_NMI.
+
+config HAVE_NMI_WATCHDOG
+	depends on HAVE_NMI
+	bool
+	help
+	  The arch provides a low level NMI watchdog. It provides
+	  asm/nmi.h, and defines its own arch_touch_nmi_watchdog().
+
+config HAVE_HARDLOCKUP_DETECTOR_ARCH
+	bool
+	select HAVE_NMI_WATCHDOG
+	help
+	  The arch chooses to provide its own hardlockup detector, which is
+	  a superset of HAVE_NMI_WATCHDOG. It also conforms to the config
+	  interfaces and parameters provided by the hardlockup detector subsystem.
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7177a3f4f418..63ed758e1d20 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -197,6 +197,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4640f6d64f8b..074a075a9cdb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io;
 EXPORT_SYMBOL(ppc_pci_io);
 #endif
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
 	return ppc_proc_freq * watchdog_thresh;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 94a18681353d..3d2b8ce54e00 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -162,6 +162,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c73c9fb281e1..d6f387780849 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,7 +19,7 @@
 #include <linux/init.h>
 #include <linux/delay.h>
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
 	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index bd387ef8bccd..8aa01fd859fb 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -11,13 +11,21 @@
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
+void lockup_detector_init(void);
+#else
+static inline void lockup_detector_init(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
 extern void touch_softlockup_watchdog_sched(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-extern unsigned int  hardlockup_panic;
-void lockup_detector_init(void);
+extern int soft_watchdog_enabled;
+extern atomic_t watchdog_park_in_progress;
 #else
 static inline void touch_softlockup_watchdog_sched(void)
 {
@@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void)
 static inline void touch_all_softlockup_watchdogs(void)
 {
 }
-static inline void lockup_detector_init(void)
-{
-}
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
@@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void)
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+extern unsigned int hardlockup_panic;
 #else
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
 #else
+#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
+#endif
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
@@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
 }
 #endif
 
-#ifdef CONFIG_LOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
+#endif
+
+#ifdef CONFIG_LOCKUP_DETECTOR
 extern int nmi_watchdog_enabled;
-extern int soft_watchdog_enabled;
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
+extern struct cpumask watchdog_cpumask;
 extern unsigned long *watchdog_cpumask_bits;
-extern atomic_t watchdog_park_in_progress;
+extern int __read_mostly watchdog_suspended;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..4cb8e8b23c6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index df9f2a367882..6648fbbb8157 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &zero,
 #endif
 	},
+	{
+		.procname	= "watchdog_cpumask",
+		.data		= &watchdog_cpumask_bits,
+		.maxlen		= NR_CPUS,
+		.mode		= 0644,
+		.proc_handler	= proc_watchdog_cpumask,
+	},
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
 	{
 		.procname       = "soft_watchdog",
 		.data           = &soft_watchdog_enabled,
@@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-	{
-		.procname	= "watchdog_cpumask",
-		.data		= &watchdog_cpumask_bits,
-		.maxlen		= NR_CPUS,
-		.mode		= 0644,
-		.proc_handler	= proc_watchdog_cpumask,
-	},
 	{
 		.procname	= "softlockup_panic",
 		.data		= &softlockup_panic,
@@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_SMP
 	{
-		.procname	= "hardlockup_panic",
-		.data		= &hardlockup_panic,
+		.procname	= "softlockup_all_cpu_backtrace",
+		.data		= &sysctl_softlockup_all_cpu_backtrace,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#endif /* CONFIG_SMP */
 #endif
-#ifdef CONFIG_SMP
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 	{
-		.procname	= "softlockup_all_cpu_backtrace",
-		.data		= &sysctl_softlockup_all_cpu_backtrace,
+		.procname	= "hardlockup_panic",
+		.data		= &hardlockup_panic,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "hardlockup_all_cpu_backtrace",
 		.data		= &sysctl_hardlockup_all_cpu_backtrace,
@@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = {
 	},
 #endif /* CONFIG_SMP */
 #endif
+#endif
+
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.procname       = "unknown_nmi_panic",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 03e0b69bb5bf..1fba9c3d66dc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,15 +29,58 @@
 #include <linux/kvm_para.h>
 #include <linux/kthread.h>
 
+/* Watchdog configuration */
 static DEFINE_MUTEX(watchdog_proc_mutex);
 
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+int __read_mostly nmi_watchdog_enabled;
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
+						NMI_WATCHDOG_ENABLED;
 #else
 unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
 #endif
-int __read_mostly nmi_watchdog_enabled;
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/* boot commands */
+/*
+ * Should we panic when a hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+	if (!strncmp(str, "panic", 5))
+		hardlockup_panic = 1;
+	else if (!strncmp(str, "nopanic", 7))
+		hardlockup_panic = 0;
+	else if (!strncmp(str, "0", 1))
+		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+	else if (!strncmp(str, "1", 1))
+		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+	return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+#endif
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
 int __read_mostly soft_watchdog_enabled;
+#endif
+
 int __read_mostly watchdog_user_enabled;
 int __read_mostly watchdog_thresh = 10;
 
@@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10;
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
 #endif
-static struct cpumask watchdog_cpumask __read_mostly;
+struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
-	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
 /*
  * The 'watchdog_running' variable is set to 1 when the watchdog threads
  * are registered/started and is set to 0 when the watchdog threads are
@@ -72,7 +109,27 @@ static int __read_mostly watchdog_running;
  * of 'watchdog_running' cannot change while the watchdog is deactivated
  * temporarily (see related code in 'proc' handlers).
  */
-static int __read_mostly watchdog_suspended;
+int __read_mostly watchdog_suspended;
+
+/*
+ * These functions can be overridden if an architecture implements its
+ * own hardlockup detector.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+	return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
 
 static u64 __read_mostly sample_period;
 
@@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
 	return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 {
 	sysctl_hardlockup_all_cpu_backtrace =
@@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 }
 __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
+#endif
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void)
 	__this_cpu_write(watchdog_touch_ts, 0);
 }
 
-/* watchdog detector functions */
-bool is_hardlockup(void)
-{
-	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
-	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
-		return true;
-
-	__this_cpu_write(hrtimer_interrupts_saved, hrint);
-	return false;
-}
-
 static int is_softlockup(unsigned long touch_ts)
 {
 	unsigned long now = get_timestamp();
@@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts)
 	return 0;
 }
 
-static void watchdog_interrupt_count(void)
+/* watchdog detector functions */
+bool is_hardlockup(void)
 {
-	__this_cpu_inc(hrtimer_interrupts);
-}
+	unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
 
-/*
- * These two functions are mostly architecture specific
- * defining them as weak here.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
-{
-	return 0;
+	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+		return true;
+
+	__this_cpu_write(hrtimer_interrupts_saved, hrint);
+	return false;
 }
-void __weak watchdog_nmi_disable(unsigned int cpu)
+
+static void watchdog_interrupt_count(void)
 {
+	__this_cpu_inc(hrtimer_interrupts);
 }
 
 static int watchdog_enable_all_cpus(void);
@@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void)
 		kthread_unpark(per_cpu(softlockup_watchdog, cpu));
 }
 
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
- */
-int lockup_detector_suspend(void)
-{
-	int ret = 0;
-
-	get_online_cpus();
-	mutex_lock(&watchdog_proc_mutex);
-	/*
-	 * Multiple suspend requests can be active in parallel (counted by
-	 * the 'watchdog_suspended' variable). If the watchdog threads are
-	 * running, the first caller takes care that they will be parked.
-	 * The state of 'watchdog_running' cannot change while a suspend
-	 * request is active (see related code in 'proc' handlers).
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		ret = watchdog_park_threads();
-
-	if (ret == 0)
-		watchdog_suspended++;
-	else {
-		watchdog_disable_all_cpus();
-		pr_err("Failed to suspend lockup detectors, disabled\n");
-		watchdog_enabled = 0;
-	}
-
-	mutex_unlock(&watchdog_proc_mutex);
-
-	return ret;
-}
-
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
- */
-void lockup_detector_resume(void)
-{
-	mutex_lock(&watchdog_proc_mutex);
-
-	watchdog_suspended--;
-	/*
-	 * The watchdog threads are unparked if they were previously running
-	 * and if there is no more active suspend request.
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		watchdog_unpark_threads();
-
-	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
-}
-
 static int update_watchdog_all_cpus(void)
 {
 	int ret;
@@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void)
 	}
 }
 
+#else /* SOFTLOCKUP */
+static int watchdog_park_threads(void)
+{
+	return 0;
+}
+
+static void watchdog_unpark_threads(void)
+{
+}
+
+static int watchdog_enable_all_cpus(void)
+{
+	return 0;
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+}
+
+static void set_sample_period(void)
+{
+}
+#endif /* SOFTLOCKUP */
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+	int ret = 0;
+
+	get_online_cpus();
+	mutex_lock(&watchdog_proc_mutex);
+	/*
+	 * Multiple suspend requests can be active in parallel (counted by
+	 * the 'watchdog_suspended' variable). If the watchdog threads are
+	 * running, the first caller takes care that they will be parked.
+	 * The state of 'watchdog_running' cannot change while a suspend
+	 * request is active (see related code in 'proc' handlers).
+	 */
+	if (watchdog_running && !watchdog_suspended)
+		ret = watchdog_park_threads();
+
+	if (ret == 0)
+		watchdog_suspended++;
+	else {
+		watchdog_disable_all_cpus();
+		pr_err("Failed to suspend lockup detectors, disabled\n");
+		watchdog_enabled = 0;
+	}
+
+	mutex_unlock(&watchdog_proc_mutex);
+
+	return ret;
+}
+
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
+{
+	mutex_lock(&watchdog_proc_mutex);
+
+	watchdog_suspended--;
+	/*
+	 * The watchdog threads are unparked if they were previously running
+	 * and if there is no more active suspend request.
+	 */
+	if (watchdog_running && !watchdog_suspended)
+		watchdog_unpark_threads();
+
+	mutex_unlock(&watchdog_proc_mutex);
+	put_online_cpus();
+}
+
 #ifdef CONFIG_SYSCTL
 
 /*
@@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 			 * a temporary cpumask, so we are likely not in a
 			 * position to do much else to make things better.
 			 */
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
 			if (smpboot_update_cpumask_percpu_thread(
 				    &watchdog_threads, &watchdog_cpumask) != 0)
 				pr_err("cpumask update failed\n");
+#endif
 		}
 	}
 out:
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 90d688df6ce1..295a0d84934c 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-unsigned int __read_mostly hardlockup_panic =
-			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
 static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
-	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
-	if (!strncmp(str, "panic", 5))
-		hardlockup_panic = 1;
-	else if (!strncmp(str, "nopanic", 7))
-		hardlockup_panic = 0;
-	else if (!strncmp(str, "0", 1))
-		watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-	else if (!strncmp(str, "1", 1))
-		watchdog_enabled |= NMI_WATCHDOG_ENABLED;
-	return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
 
 void arch_touch_nmi_watchdog(void)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f28f4252e54a..b0d01c6d4e03 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -778,34 +778,45 @@ config DEBUG_SHIRQ
 menu "Debug Lockups and Hangs"
 
 config LOCKUP_DETECTOR
-	bool "Detect Hard and Soft Lockups"
+	bool
+
+config SOFTLOCKUP_DETECTOR
+	bool "Detect Soft Lockups"
 	depends on DEBUG_KERNEL && !S390
+	select LOCKUP_DETECTOR
 	help
 	  Say Y here to enable the kernel to act as a watchdog to detect
-	  hard and soft lockups.
+	  soft lockups.
 
 	  Softlockups are bugs that cause the kernel to loop in kernel
 	  mode for more than 20 seconds, without giving other tasks a
 	  chance to run.  The current stack trace is displayed upon
 	  detection and the system will stay locked up.
 
+config HARDLOCKUP_DETECTOR_PERF
+	bool
+	select SOFTLOCKUP_DETECTOR
+
+#
+# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard
+# lockup detector rather than the perf based detector.
+#
+config HARDLOCKUP_DETECTOR
+	bool "Detect Hard Lockups"
+	depends on DEBUG_KERNEL && !S390
+	depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select LOCKUP_DETECTOR
+	select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF
+	select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH
+	help
+	  Say Y here to enable the kernel to act as a watchdog to detect
+	  hard lockups.
+
 	  Hardlockups are bugs that cause the CPU to loop in kernel mode
 	  for more than 10 seconds, without letting other interrupts have a
 	  chance to run.  The current stack trace is displayed upon detection
 	  and the system will stay locked up.
 
-	  The overhead should be minimal.  A periodic hrtimer runs to
-	  generate interrupts and kick the watchdog task every 4 seconds.
-	  An NMI is generated every 10 seconds or so to check for hardlockups.
-
-	  The frequency of hrtimer and NMI events and the soft and hard lockup
-	  thresholds can be controlled through the sysctl watchdog_thresh.
-
-config HARDLOCKUP_DETECTOR
-	def_bool y
-	depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG
-	depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI
-
 config BOOTPARAM_HARDLOCKUP_PANIC
 	bool "Panic (Reboot) On Hard Lockups"
 	depends on HARDLOCKUP_DETECTOR
@@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE
 
 config BOOTPARAM_SOFTLOCKUP_PANIC
 	bool "Panic (Reboot) On Soft Lockups"
-	depends on LOCKUP_DETECTOR
+	depends on SOFTLOCKUP_DETECTOR
 	help
 	  Say Y here to enable the kernel to panic on "soft lockups",
 	  which are bugs that cause the kernel to loop in kernel
@@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC
 
 config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
 	int
-	depends on LOCKUP_DETECTOR
+	depends on SOFTLOCKUP_DETECTOR
 	range 0 1
 	default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
 	default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
@@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
 config DETECT_HUNG_TASK
 	bool "Detect Hung Tasks"
 	depends on DEBUG_KERNEL
-	default LOCKUP_DETECTOR
+	default SOFTLOCKUP_DETECTOR
 	help
 	  Say Y here to enable the kernel to detect "hung tasks",
 	  which are bugs that cause the task to be stuck in
-- 
cgit v1.3-14-g43fede


From a10a842ff81a7e3810817b3b04e4c432b6191e21 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 12 Jul 2017 14:35:49 -0700
Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch
 watchdogs

After reconfiguring watchdog sysctls etc., architecture specific
watchdogs may not get all their parameters updated.

watchdog_nmi_reconfigure() can be implemented to pull the new values in
and set the arch NMI watchdog.

[npiggin@gmail.com: add code comments]
  Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com
[arnd@arndb.de: hide unused function]
  Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de
Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Tested-by: Babu Moger <babu.moger@oracle.com>	[sparc]
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1fba9c3d66dc..cabe3e9fb620 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended;
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
+ *
+ * watchdog_nmi_enable/disable can be implemented to start and stop when
+ * softlockup watchdog threads start and stop. The arch must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
@@ -123,6 +127,22 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 {
 }
 
+/*
+ * watchdog_nmi_reconfigure can be implemented to be notified after any
+ * watchdog configuration change. The arch hardlockup watchdog should
+ * respond to the following variables:
+ * - nmi_watchdog_enabled
+ * - watchdog_thresh
+ * - watchdog_cpumask
+ * - sysctl_hardlockup_all_cpu_backtrace
+ * - hardlockup_panic
+ * - watchdog_suspended
+ */
+void __weak watchdog_nmi_reconfigure(void)
+{
+}
+
+
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
 /* Helper for online, unparked cpus. */
@@ -600,6 +620,14 @@ static void watchdog_disable_all_cpus(void)
 	}
 }
 
+#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+	return smpboot_update_cpumask_percpu_thread(
+		    &watchdog_threads, &watchdog_cpumask);
+}
+#endif
+
 #else /* SOFTLOCKUP */
 static int watchdog_park_threads(void)
 {
@@ -619,6 +647,13 @@ static void watchdog_disable_all_cpus(void)
 {
 }
 
+#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+	return 0;
+}
+#endif
+
 static void set_sample_period(void)
 {
 }
@@ -651,6 +686,8 @@ int lockup_detector_suspend(void)
 		watchdog_enabled = 0;
 	}
 
+	watchdog_nmi_reconfigure();
+
 	mutex_unlock(&watchdog_proc_mutex);
 
 	return ret;
@@ -671,6 +708,8 @@ void lockup_detector_resume(void)
 	if (watchdog_running && !watchdog_suspended)
 		watchdog_unpark_threads();
 
+	watchdog_nmi_reconfigure();
+
 	mutex_unlock(&watchdog_proc_mutex);
 	put_online_cpus();
 }
@@ -696,6 +735,8 @@ static int proc_watchdog_update(void)
 	else
 		watchdog_disable_all_cpus();
 
+	watchdog_nmi_reconfigure();
+
 	return err;
 
 }
@@ -881,12 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 			 * a temporary cpumask, so we are likely not in a
 			 * position to do much else to make things better.
 			 */
-#ifdef CONFIG_SOFTLOCKUP_DETECTOR
-			if (smpboot_update_cpumask_percpu_thread(
-				    &watchdog_threads, &watchdog_cpumask) != 0)
+			if (watchdog_update_cpus() != 0)
 				pr_err("cpumask update failed\n");
-#endif
 		}
+
+		watchdog_nmi_reconfigure();
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
-- 
cgit v1.3-14-g43fede


From e2ae8ab4b571e2e4094a28acb60649bc2732c67f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 12 Jul 2017 14:35:58 -0700
Subject: kexec_file: adjust declaration of kexec_purgatory

Defining kexec_purgatory as a zero-length char array upsets compile time
size checking.  Since this is built on a per-arch basis, define it as an
unsized char array (like is done for other similar things, e.g.  linker
sections).  This silences the warning generated by the future
CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte"
array.  This drops the __weak and uses an extern instead, since both
users define kexec_purgatory.

Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_file.c     | 7 -------
 kernel/kexec_internal.h | 2 ++
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c8f7f77e9fa9..9f48f4412297 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,13 +26,6 @@
 #include <linux/vmalloc.h>
 #include "kexec_internal.h"
 
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
 static int kexec_calculate_store_digests(struct kimage *image);
 
 /* Architectures can provide this probe function */
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 799a8a452187..50dfcb039a41 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -17,6 +17,8 @@ extern struct mutex kexec_mutex;
 #ifdef CONFIG_KEXEC_FILE
 #include <linux/purgatory.h>
 void kimage_file_post_load_cleanup(struct kimage *image);
+extern char kexec_purgatory[];
+extern size_t kexec_purgatory_size;
 #else /* CONFIG_KEXEC_FILE */
 static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
 #endif /* CONFIG_KEXEC_FILE */
-- 
cgit v1.3-14-g43fede


From 7cd815bce828220deffd1654265f0ef891567774 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Wed, 12 Jul 2017 14:36:20 -0700
Subject: fork,random: use get_random_canary() to set tsk->stack_canary

Use the ascii-armor canary to prevent unterminated C string overflows
from being able to successfully overwrite the canary, even if they
somehow obtain the canary value.

Inspired by execshield ascii-armor and Daniel Micay's linux-hardened
tree.

Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Daniel Micay <danielmicay@gmail.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index ade237a96308..17921b0390b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	set_task_stack_end_magic(tsk);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-	tsk->stack_canary = get_random_long();
+	tsk->stack_canary = get_random_canary();
 #endif
 
 	/*
-- 
cgit v1.3-14-g43fede


From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@linaro.org>
Date: Thu, 13 Jul 2017 14:18:24 +0800
Subject: locking/rtmutex: Remove unnecessary priority adjustment

We don't need to adjust priority before adding a new pi_waiter, the
priority only needs to be updated after pi_waiter change or task
priority change.

Steven Rostedt pointed out:

  "Interesting, I did some git mining and this was added with the original
   entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I
   don't see the purpose of adjusting the task prio here. It is done
   before anything changes in the task."

Signed-off-by: Alex Shi <alex.shi@linaro.org>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org
[ Enhance the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 78069895032a..649dc9d3951a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		return -EDEADLK;
 
 	raw_spin_lock(&task->pi_lock);
-	rt_mutex_adjust_prio(task);
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
-- 
cgit v1.3-14-g43fede


From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Sun, 9 Jul 2017 00:40:28 -0700
Subject: sched/cputime: Don't use smp_processor_id() in preemptible context

Recent kernels trigger this warning:

 BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181
 caller is debug_smp_processor_id+0x17/0x19
 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1
 Call Trace:
  dump_stack+0x82/0xb8
  check_preemption_disabled()
  debug_smp_processor_id()
  vtime_delta()
  task_cputime()
  thread_group_cputime()
  thread_group_cputime_adjusted()
  wait_consider_task()
  do_wait()
  SYSC_wait4()
  do_syscall_64()
  entry_SYSCALL64_slow_path()

As Frederic pointed out:

| Although those sched_clock_cpu() things seem to only matter when the
| sched_clock() is unstable. And that stability is a condition for nohz_full
| to work anyway. So probably sched_clock() alone would be enough.

This patch fixes it by replacing sched_clock_cpu() with sched_clock() to
avoid calling smp_processor_id() in a preemptible context.

Reported-by: Xiaolong Ye <xiaolong.ye@intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com
[ Prettified the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6e3ea4ac1bda..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime)
 {
 	unsigned long long clock;
 
-	clock = sched_clock_cpu(smp_processor_id());
+	clock = sched_clock();
 	if (clock < vtime->starttime)
 		return 0;
 
@@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = sched_clock_cpu(smp_processor_id());
+	vtime->starttime = sched_clock();
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = sched_clock_cpu(cpu);
+	vtime->starttime = sched_clock();
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
-- 
cgit v1.3-14-g43fede


From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Wed, 12 Jul 2017 19:24:29 -0700
Subject: sched/deadline: Fix confusing comments about selection of top
 pi-waiter

This comment in the code is incomplete, and I believe it begs a definition of
dl_boosted to make sense of the condition that follows. Rewrite the comment and
also rearrange the condition that follows to reflect the first condition "we
have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a
typo that follows.

Signed-off-by: Joel Fernandes <joelaf@google.com>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Acked-by: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a84299f44b5d..755bd3f1a1a9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_dl_entity *pi_se = &p->dl;
 
 	/*
-	 * Use the scheduling parameters of the top pi-waiter
-	 * task if we have one and its (absolute) deadline is
-	 * smaller than our one... OTW we keep our runtime and
-	 * deadline.
+	 * Use the scheduling parameters of the top pi-waiter task if:
+	 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+	 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+	 *   smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+	 *   boosted due to a SCHED_DEADLINE pi-waiter).
+	 * Otherwise we keep our runtime and deadline.
 	 */
-	if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
+	if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
 		pi_se = &pi_task->dl;
 	} else if (!dl_prio(p->normal_prio)) {
 		/*
 		 * Special case in which we have a !SCHED_DEADLINE task
-		 * that is going to be deboosted, but exceedes its
+		 * that is going to be deboosted, but exceeds its
 		 * runtime while doing so. No point in replenishing
 		 * it, as it's going to return back to its original
 		 * scheduling class after this.
-- 
cgit v1.3-14-g43fede


From 5f92a7b0fcd627fbd06ceb1cee3bbe5d08d13356 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 14 Jul 2017 14:49:46 -0700
Subject: kernel/watchdog.c: use better pr_fmt prefix

After commit 73ce0511c436 ("kernel/watchdog.c: move hardlockup
detector to separate file"), 'NMI watchdog' is inappropriate in
kernel/watchdog.c, using 'watchdog' only.

Link: http://lkml.kernel.org/r/1499928642-48983-1-git-send-email-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Babu Moger <babu.moger@oracle.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cabe3e9fb620..06d3389bca0d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,7 +9,7 @@
  * to those contributors as well.
  */
 
-#define pr_fmt(fmt) "NMI watchdog: " fmt
+#define pr_fmt(fmt) "watchdog: " fmt
 
 #include <linux/mm.h>
 #include <linux/cpu.h>
-- 
cgit v1.3-14-g43fede


From 6d7964a722afc8e4f880b947f174009063028c99 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez" <mcgrof@kernel.org>
Date: Fri, 14 Jul 2017 14:50:11 -0700
Subject: kmod: throttle kmod thread limit

If we reach the limit of modprobe_limit threads running, the next
request_module() call will fail.  The original reason for adding a kill
was to do away with possible issues in old circumstances which would
create a recursive series of request_module() calls.

We can do better than just be super aggressive and reject calls once we've
reached the limit by simply making pending callers wait until the
threshold has been reduced, and then throttling them in, one by one.

This throttling enables requests over the kmod concurrent limit to be
processed once a pending request completes.  Only the first item queued up
to wait is woken up.  The assumption here is once a task is woken it will
have no other option but to also kick the queue to check if there are more
pending tasks -- regardless of whether or not it was successful.

By throttling and processing only max kmod concurrent tasks we ensure we
avoid unexpected fatal request_module() calls, and we keep memory
consumption on module loading to a minimum.

With x86_64 qemu, with 4 cores, 4 GiB of RAM it takes the following run
time to run both tests:

time ./kmod.sh -t 0008
real    0m16.366s
user    0m0.883s
sys     0m8.916s

time ./kmod.sh -t 0009
real    0m50.803s
user    0m0.791s
sys     0m9.852s

Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org
Signed-off-by: Luis R. Rodriguez <mcgrof@kernel.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Michal Marek <mmarek@suse.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kmod.c                        | 16 +++++++---------
 tools/testing/selftests/kmod/kmod.sh | 24 ++----------------------
 2 files changed, 9 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff68198fe83b..6d016c5d97c8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem);
  */
 #define MAX_KMOD_CONCURRENT 50
 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
+static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 
 /*
 	modprobe_path is set via /proc/sys.
@@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...)
 	va_list args;
 	char module_name[MODULE_NAME_LEN];
 	int ret;
-	static int kmod_loop_msg;
 
 	/*
 	 * We don't allow synchronous module loading from async.  Module
@@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...)
 		return ret;
 
 	if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
-		/* We may be blaming an innocent here, but unlikely */
-		if (kmod_loop_msg < 5) {
-			printk(KERN_ERR
-			       "request_module: runaway loop modprobe %s\n",
-			       module_name);
-			kmod_loop_msg++;
-		}
-		return -ENOMEM;
+		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
+				    atomic_read(&kmod_concurrent_max),
+				    MAX_KMOD_CONCURRENT, module_name);
+		wait_event_interruptible(kmod_wq,
+					 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
 	}
 
 	trace_module_request(module_name, wait, _RET_IP_);
@@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...)
 	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 
 	atomic_inc(&kmod_concurrent_max);
+	wake_up(&kmod_wq);
 
 	return ret;
 }
diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh
index 10196a62ed09..8cecae9a8bca 100644
--- a/tools/testing/selftests/kmod/kmod.sh
+++ b/tools/testing/selftests/kmod/kmod.sh
@@ -59,28 +59,8 @@ ALL_TESTS="$ALL_TESTS 0004:1:1"
 ALL_TESTS="$ALL_TESTS 0005:10:1"
 ALL_TESTS="$ALL_TESTS 0006:10:1"
 ALL_TESTS="$ALL_TESTS 0007:5:1"
-
-# Disabled tests:
-#
-# 0008 x 150 -  multithreaded - push kmod_concurrent over max_modprobes for request_module()"
-# Current best-effort failure interpretation:
-# Enough module requests get loaded in place fast enough to reach over the
-# max_modprobes limit and trigger a failure -- before we're even able to
-# start processing pending requests.
-ALL_TESTS="$ALL_TESTS 0008:150:0"
-
-# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()"
-# Current best-effort failure interpretation:
-#
-# get_fs_type() requests modules using aliases as such the optimization in
-# place today to look for already loaded modules will not take effect and
-# we end up requesting a new module to load, this bumps the kmod_concurrent,
-# and in certain circumstances can lead to pushing the kmod_concurrent over
-# the max_modprobe limit.
-#
-# This test fails much easier than test 0008 since the alias optimizations
-# are not in place.
-ALL_TESTS="$ALL_TESTS 0009:150:0"
+ALL_TESTS="$ALL_TESTS 0008:150:1"
+ALL_TESTS="$ALL_TESTS 0009:150:1"
 
 test_modprobe()
 {
-- 
cgit v1.3-14-g43fede


From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Mon, 17 Jul 2017 19:47:02 +0200
Subject: genirq/PM: Properly pretend disabled state when force resuming
 interrupts

Interrupts with the IRQF_FORCE_RESUME flag set have also the
IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but
must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond
the suspension of device irqs. Force resume works by pretending that the
interrupt was disabled and then calling __irq_enable().

Incrementing the disabled depth counter was enough to do that, but with the
recent changes which use state flags to avoid unnecessary hardware access,
this is no longer sufficient. If the state flags are not set, then the
hardware callbacks are not invoked and the interrupt line stays disabled in
"hardware".

Set the disabled and masked state when pretending that an interrupt got
disabled by suspend.

Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: xen-devel@lists.xenproject.org
Cc: boris.ostrovsky@oracle.com
Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com
---
 kernel/irq/chip.c      | 10 ----------
 kernel/irq/internals.h | 10 ++++++++++
 kernel/irq/pm.c        |  2 ++
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d171bc57e1e0..a3cc37c0c85e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
 	irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
 }
 
-static void irq_state_set_disabled(struct irq_desc *desc)
-{
-	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
-}
-
 static void irq_state_clr_masked(struct irq_desc *desc)
 {
 	irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
 }
 
-static void irq_state_set_masked(struct irq_desc *desc)
-{
-	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
-}
-
 static void irq_state_clr_started(struct irq_desc *desc)
 {
 	irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index dbfba9933ed2..a2c48058354c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
 	return __irqd_to_state(d) & mask;
 }
 
+static inline void irq_state_set_disabled(struct irq_desc *desc)
+{
+	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static inline void irq_state_set_masked(struct irq_desc *desc)
+{
+	irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
 #undef __irqd_to_state
 
 static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..6bd9b58429cc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc)
 
 	/* Pretend that it got disabled ! */
 	desc->depth++;
+	irq_state_set_disabled(desc);
+	irq_state_set_masked(desc);
 resume:
 	desc->istate &= ~IRQS_SUSPENDED;
 	__enable_irq(desc);
-- 
cgit v1.3-14-g43fede


From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Wed, 12 Jul 2017 19:14:16 -0700
Subject: tracing/ring_buffer: Try harder to allocate

ftrace can fail to allocate per-CPU ring buffers on systems with a large
number of CPUs coupled with large amounts of memory being used by the
page cache. Currently the ring buffer allocation doesn't retry in the VM
implementation even if direct-reclaim made some progress but still
wasn't able to find a free page. On retrying I see that the allocations
almost always succeed. The retry doesn't happen because __GFP_NORETRY is
used in the tracer to prevent the case where we might OOM, however if we
drop __GFP_NORETRY, we risk destabilizing the system if OOM killer is
triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag
introduced recently [1].

Tested the following still succeeds without destabilizing a system with
1GB memory.
echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb

[1] https://marc.info/?l=linux-mm&m=149820805124906&w=2

Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com

Cc: Tim Murray <timmurray@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ae268e687fe..529cc50d7243 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		/*
-		 * __GFP_NORETRY flag makes sure that the allocation fails
-		 * gracefully without invoking oom-killer and the system is
-		 * not destabilized.
+		 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+		 * gracefully without invoking oom-killer and the system is not
+		 * destabilized.
 		 */
 		bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
-				    GFP_KERNEL | __GFP_NORETRY,
+				    GFP_KERNEL | __GFP_RETRY_MAYFAIL,
 				    cpu_to_node(cpu));
 		if (!bpage)
 			goto free_pages;
@@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 		list_add(&bpage->list, pages);
 
 		page = alloc_pages_node(cpu_to_node(cpu),
-					GFP_KERNEL | __GFP_NORETRY, 0);
+					GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
 		if (!page)
 			goto free_pages;
 		bpage->page = page_address(page);
-- 
cgit v1.3-14-g43fede


From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001
From: Shu Wang <shuwang@redhat.com>
Date: Tue, 18 Jul 2017 14:37:24 +0800
Subject: audit: fix memleak in auditd_send_unicast_skb.

Found this issue via a kmemleak report: auditd_send_unicast_skb
did not free the skb if rcu_dereference(auditd_conn) returned NULL.

unreferenced object 0xffff88082568ce00 (size 256):
comm "auditd", pid 1119, jiffies 4294708499
backtrace:
[<ffffffff8176166a>] kmemleak_alloc+0x4a/0xa0
[<ffffffff8121820c>] kmem_cache_alloc_node+0xcc/0x210
[<ffffffff8161b99d>] __alloc_skb+0x5d/0x290
[<ffffffff8113c614>] audit_make_reply+0x54/0xd0
[<ffffffff8113dfa7>] audit_receive_msg+0x967/0xd70
----------------
(gdb) list *audit_receive_msg+0x967
0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133).
1132    skb = audit_make_reply(0, AUDIT_REPLACE, 0,
                                0, &pvnr, sizeof(pvnr));
---------------
[<ffffffff8113e402>] audit_receive+0x52/0xa0
[<ffffffff8166c561>] netlink_unicast+0x181/0x240
[<ffffffff8166c8e2>] netlink_sendmsg+0x2c2/0x3b0
[<ffffffff816112e8>] sock_sendmsg+0x38/0x50
[<ffffffff816117a2>] SYSC_sendto+0x102/0x190
[<ffffffff81612f4e>] SyS_sendto+0xe/0x10
[<ffffffff8176d337>] entry_SYSCALL_64_fastpath+0x1a/0xa5
[<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Shu Wang <shuwang@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 7cad70214b81..07def5e49cc9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
 	ac = rcu_dereference(auditd_conn);
 	if (!ac) {
 		rcu_read_unlock();
+		kfree_skb(skb);
 		rc = -ECONNREFUSED;
 		goto err;
 	}
-- 
cgit v1.3-14-g43fede


From 3bda69c1c3993a2bddbae01397d12bfef6054011 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 18 Jul 2017 14:08:34 +0300
Subject: perf/core: Fix scheduling regression of pinned groups

Vince Weaver reported:

> I was tracking down some regressions in my perf_event_test testsuite.
> Some of the tests broke in the 4.11-rc1 timeframe.
>
> I've bisected one of them, this report is about
>	tests/overflow/simul_oneshot_group_overflow
> This test creates an event group containing two sampling events, set
> to overflow to a signal handler (which disables and then refreshes the
> event).
>
> On a good kernel you get the following:
> 	Event perf::instructions with period 1000000
> 	Event perf::instructions with period 2000000
> 		fd 3 overflows: 946 (perf::instructions/1000000)
> 		fd 4 overflows: 473 (perf::instructions/2000000)
> 	Ending counts:
> 		Count 0: 946379875
> 		Count 1: 946365218
>
> With the broken kernels you get:
> 	Event perf::instructions with period 1000000
> 	Event perf::instructions with period 2000000
> 		fd 3 overflows: 938 (perf::instructions/1000000)
> 		fd 4 overflows: 318 (perf::instructions/2000000)
> 	Ending counts:
> 		Count 0: 946373080
> 		Count 1: 653373058

The root cause of the bug is that the following commit:

  487f05e18a ("perf/core: Optimize event rescheduling on active contexts")

erroneously assumed that event's 'pinned' setting determines whether the
event belongs to a pinned group or not, but in fact, it's the group
leader's pinned state that matters.

This was discovered by Vince in the test case described above, where two instruction
counters are grouped, the group leader is pinned, but the other event is not;
in the regressed case the counters were off by 33% (the difference between events'
periods), but should be the same within the error margin.

Fix the problem by looking at the group leader's pinning.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts")
Link: http://lkml.kernel.org/r/87lgnmvw7h.fsf@ashishki-desk.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9747e422ab20..c9cdbd396770 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event)
 
 	lockdep_assert_held(&ctx->lock);
 
+	/*
+	 * It's 'group type', really, because if our group leader is
+	 * pinned, so are we.
+	 */
+	if (event->group_leader != event)
+		event = event->group_leader;
+
 	event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
 	if (!ctx->task)
 		event_type |= EVENT_CPU;
-- 
cgit v1.3-14-g43fede


From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Thu, 20 Jul 2017 18:36:09 +0800
Subject: tracing: Fix kmemleak in instance_rmdir

Hit a kmemleak warning when executing instance_rmdir: it forgot to
release the memory of tracing_cpumask. With this fix, the warning no
longer appears.

unreferenced object 0xffff93a8dfaa7c18 (size 8):
  comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s)
  hex dump (first 8 bytes):
    ff ff ff ff ff ff ff ff                          ........
  backtrace:
    [<ffffffff88b6567a>] kmemleak_alloc+0x4a/0xa0
    [<ffffffff8861ea41>] __kmalloc_node+0xf1/0x280
    [<ffffffff88b505d3>] alloc_cpumask_var_node+0x23/0x30
    [<ffffffff88b5060e>] alloc_cpumask_var+0xe/0x10
    [<ffffffff88571ab0>] instance_mkdir+0x90/0x240
    [<ffffffff886e5100>] tracefs_syscall_mkdir+0x40/0x70
    [<ffffffff886565c9>] vfs_mkdir+0x109/0x1b0
    [<ffffffff8865b1d0>] SyS_mkdir+0xd0/0x100
    [<ffffffff88403857>] do_syscall_64+0x67/0x150
    [<ffffffff88b710e7>] return_from_SYSCALL_64+0x0/0x6a
    [<ffffffffffffffff>] 0xffffffffffffffff

Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com

Cc: stable@vger.kernel.org
Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2d0ffcc49dba..42b9355033d4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name)
 	}
 	kfree(tr->topts);
 
+	free_cpumask_var(tr->tracing_cpumask);
 	kfree(tr->name);
 	kfree(tr);
 
-- 
cgit v1.3-14-g43fede


From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001
From: Chunyan Zhang <zhang.chunyan@linaro.org>
Date: Wed, 7 Jun 2017 16:12:51 +0800
Subject: trace: fix the errors caused by incompatible type of RCU variables

Variables that are processed by RCU functions should be annotated
with __rcu, otherwise sparse will report errors like the one below:

"error: incompatible types in comparison expression (different
address spaces)"

Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org

Signed-off-by: Chunyan Zhang <zhang.chunyan@linaro.org>
[ Updated to not be 100% 80 column strict ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h       |  6 +++---
 include/linux/trace_events.h |  2 +-
 kernel/trace/ftrace.c        | 41 +++++++++++++++++++++++++++--------------
 kernel/trace/trace.h         |  6 +++---
 4 files changed, 34 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5857390ac35a..6383115e9d2c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -145,8 +145,8 @@ enum {
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* The hash used to know what functions callbacks trace */
 struct ftrace_ops_hash {
-	struct ftrace_hash		*notrace_hash;
-	struct ftrace_hash		*filter_hash;
+	struct ftrace_hash __rcu	*notrace_hash;
+	struct ftrace_hash __rcu	*filter_hash;
 	struct mutex			regex_lock;
 };
 
@@ -168,7 +168,7 @@ static inline void ftrace_free_init_mem(void) { }
  */
 struct ftrace_ops {
 	ftrace_func_t			func;
-	struct ftrace_ops		*next;
+	struct ftrace_ops __rcu		*next;
 	unsigned long			flags;
 	void				*private;
 	ftrace_func_t			saved_func;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index f73cedfa2e0b..536c80ff7ad9 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -338,7 +338,7 @@ enum {
 struct trace_event_file {
 	struct list_head		list;
 	struct trace_event_call		*event_call;
-	struct event_filter		*filter;
+	struct event_filter __rcu	*filter;
 	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct trace_subsystem_dir	*system;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 53f6b6401cf0..02004ae91860 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly;
 
 static DEFINE_MUTEX(ftrace_lock);
 
-static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 static struct ftrace_ops global_ops;
 
@@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void)
 
 	mutex_lock(&ftrace_lock);
 
-	for (ops = ftrace_ops_list;
-	     ops != &ftrace_list_end; ops = ops->next)
+	for (ops = rcu_dereference_protected(ftrace_ops_list,
+					     lockdep_is_held(&ftrace_lock));
+	     ops != &ftrace_list_end;
+	     ops = rcu_dereference_protected(ops->next,
+					     lockdep_is_held(&ftrace_lock)))
 		cnt++;
 
 	mutex_unlock(&ftrace_lock);
@@ -275,10 +278,11 @@ static void update_ftrace_function(void)
 	 * If there's only one ftrace_ops registered, the ftrace_ops_list
 	 * will point to the ops we want.
 	 */
-	set_function_trace_op = ftrace_ops_list;
+	set_function_trace_op = rcu_dereference_protected(ftrace_ops_list,
+						lockdep_is_held(&ftrace_lock));
 
 	/* If there's no ftrace_ops registered, just call the stub function */
-	if (ftrace_ops_list == &ftrace_list_end) {
+	if (set_function_trace_op == &ftrace_list_end) {
 		func = ftrace_stub;
 
 	/*
@@ -286,7 +290,8 @@ static void update_ftrace_function(void)
 	 * recursion safe and not dynamic and the arch supports passing ops,
 	 * then have the mcount trampoline call the function directly.
 	 */
-	} else if (ftrace_ops_list->next == &ftrace_list_end) {
+	} else if (rcu_dereference_protected(ftrace_ops_list->next,
+			lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
 		func = ftrace_ops_get_list_func(ftrace_ops_list);
 
 	} else {
@@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void)
 	return ftrace_trace_function == ftrace_ops_list_func;
 }
 
-static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static void add_ftrace_ops(struct ftrace_ops __rcu **list,
+			   struct ftrace_ops *ops)
 {
-	ops->next = *list;
+	rcu_assign_pointer(ops->next, *list);
+
 	/*
 	 * We are entering ops into the list but another
 	 * CPU might be walking that list. We need to make sure
@@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
 	rcu_assign_pointer(*list, ops);
 }
 
-static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
+			     struct ftrace_ops *ops)
 {
 	struct ftrace_ops **p;
 
@@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
 	 * If we are removing the last function, then simply point
 	 * to the ftrace_stub.
 	 */
-	if (*list == ops && ops->next == &ftrace_list_end) {
+	if (rcu_dereference_protected(*list,
+			lockdep_is_held(&ftrace_lock)) == ops &&
+	    rcu_dereference_protected(ops->next,
+			lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
 		*list = &ftrace_list_end;
 		return 0;
 	}
@@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
 		return 0;
 #endif
 
-	hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
-	hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+	rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash);
+	rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash);
 
 	if (hash_contains_ip(ip, &hash))
 		ret = 1;
@@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	 * If there's no more ops registered with ftrace, run a
 	 * sanity check to make sure all rec flags are cleared.
 	 */
-	if (ftrace_ops_list == &ftrace_list_end) {
+	if (rcu_dereference_protected(ftrace_ops_list,
+			lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
 		struct ftrace_page *pg;
 		struct dyn_ftrace *rec;
 
@@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	if (ftrace_enabled) {
 
 		/* we are starting ftrace again */
-		if (ftrace_ops_list != &ftrace_list_end)
+		if (rcu_dereference_protected(ftrace_ops_list,
+			lockdep_is_held(&ftrace_lock)) != &ftrace_list_end)
 			update_ftrace_function();
 
 		ftrace_startup_sysctl();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6ade1c55cc3a..490ba229931d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1210,9 +1210,9 @@ struct ftrace_event_field {
 struct event_filter {
 	int			n_preds;	/* Number assigned */
 	int			a_preds;	/* allocated */
-	struct filter_pred	*preds;
-	struct filter_pred	*root;
-	char			*filter_string;
+	struct filter_pred __rcu	*preds;
+	struct filter_pred __rcu	*root;
+	char				*filter_string;
 };
 
 struct event_subsystem {
-- 
cgit v1.3-14-g43fede


From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Jul 2017 00:00:21 +0200
Subject: bpf: fix mixed signed/unsigned derived min/max value bounds

Edward reported that there's an issue in min/max value bounds
tracking when signed and unsigned compares both provide hints
on limits when having unknown variables. E.g. a program such
as the following should have been rejected:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff8a94cda93400
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  11: (65) if r1 s> 0x1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

What happens is that in the first part ...

   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3

... r1 carries an unsigned value, and is compared as unsigned
against a register carrying an immediate. Verifier deduces in
reg_set_min_max() that since the compare is unsigned and operation
is greater than (>), that in the fall-through/false case, r1's
minimum bound must be 0 and maximum bound must be r2. Latter is
larger than the bound and thus max value is reset back to being
'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now
'R1=inv,min_value=0'. The subsequent test ...

  11: (65) if r1 s> 0x1 goto pc+2

... is a signed compare of r1 with immediate value 1. Here,
verifier deduces in reg_set_min_max() that since the compare
is signed this time and operation is greater than (>), that
in the fall-through/false case, we can deduce that r1's maximum
bound must be 1, meaning with prior test, we result in r1 having
the following state: R1=inv,min_value=0,max_value=1. Given that
the actual value this holds is -8, the bounds are wrongly deduced.
When this is being added to r0 which holds the map_value(_adj)
type, then subsequent store access in above case will go through
check_mem_access() which invokes check_map_access_adj(), that
will then probe whether the map memory is in bounds based
on the min_value and max_value as well as access size since
the actual unknown value is min_value <= x <= max_value; commit
fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{,
_adj} register types") provides some more explanation on the
semantics.

It's worth noting in this context that in the current code,
min_value and max_value tracking are used for two things, i)
dynamic map value access via check_map_access_adj() and since
commit 06c1c049721a ("bpf: allow helpers access to variable memory")
ii) also enforced at check_helper_mem_access() when passing a
memory address (pointer to packet, map value, stack) and length
pair to a helper and the length in this case is an unknown value
defining an access range through min_value/max_value in that
case. The min_value/max_value tracking is /not/ used in the
direct packet access case to track ranges. However, the issue
also affects case ii), for example, the following crafted program
based on the same principle must be rejected as well:

   0: (b7) r2 = 0
   1: (bf) r3 = r10
   2: (07) r3 += -512
   3: (7a) *(u64 *)(r10 -16) = -8
   4: (79) r4 = *(u64 *)(r10 -16)
   5: (b7) r6 = -1
   6: (2d) if r4 > r6 goto pc+5
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
   7: (65) if r4 s> 0x1 goto pc+4
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1
  R10=fp
   8: (07) r4 += 1
   9: (b7) r5 = 0
  10: (6a) *(u16 *)(r10 -512) = 0
  11: (85) call bpf_skb_load_bytes#26
  12: (b7) r0 = 0
  13: (95) exit

Meaning, while we initialize the max_value stack slot that the
verifier thinks we access in the [1,2] range, in reality we
pass -7 as length which is interpreted as u32 in the helper.
Thus, this issue is relevant also for the case of helper ranges.
Resetting both bounds in check_reg_overflow() in case only one
of them exceeds limits is also not enough as a similar test can be
created that uses values which are within range, thus also here
learned min value in r1 is incorrect when mixed with later signed
test to create a range:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff880ad081fa00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (65) if r1 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

This leaves us with two options for fixing this: i) to invalidate
all prior learned information once we switch signed context, ii)
to track min/max signed and unsigned boundaries separately as
done in [0]. (Given latter introduces major changes throughout
the whole verifier, it's rather net-next material, thus this
patch follows option i), meaning we can derive bounds either
from only signed tests or only unsigned tests.) There is still the
case of adjust_reg_min_max_vals(), where we adjust bounds on ALU
operations, meaning programs like the following where boundaries
on the reg get mixed in context later on when bounds are merged
on the dst reg must get rejected, too:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff89b2bf87ce00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+6
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (b7) r7 = 1
  12: (65) if r7 s> 0x0 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp
  13: (b7) r0 = 0
  14: (95) exit

  from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp
  15: (0f) r7 += r1
  16: (65) if r7 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  17: (0f) r0 += r7
  18: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  19: (b7) r0 = 0
  20: (95) exit

Meaning, in adjust_reg_min_max_vals() we must also reset range
values on the dst when src/dst registers have mixed signed/
unsigned derived min/max value bounds with one unbounded value
as otherwise they can be added together deducing false boundaries.
Once both boundaries are established from either ALU ops or
compare operations w/o mixing signed/unsigned insns, then they
can safely be added to other regs also having both boundaries
established. Adding regs with one unbounded side to a map value
where the bounded side has been learned w/o mixing ops is
possible, but the resulting map value won't recover from that,
meaning such op is considered invalid on the time of actual
access. Invalid bounds are set on the dst reg in case i) src reg,
or ii) in case dst reg already had them. The only way to recover
would be to perform i) ALU ops but only 'add' is allowed on map
value types or ii) comparisons, but these are disallowed on
pointers in case they span a range. This is fine as only BPF_JEQ
and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers
which potentially turn them into PTR_TO_MAP_VALUE type depending
on the branch, so only here min/max value cannot be invalidated
for them.

In terms of state pruning, value_from_signed is considered
as well in states_equal() when dealing with adjusted map values.
With regard to breaking existing programs, there is a small
risk, but the use-cases where this could occur are rather narrow,
and mixing compares is probably unlikely.

Joint work with Josef and Edward.

  [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Reported-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |   1 +
 kernel/bpf/verifier.c        | 108 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 95 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 621076f56251..8e5d31f6faef 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -43,6 +43,7 @@ struct bpf_reg_state {
 	u32 min_align;
 	u32 aux_off;
 	u32 aux_off_align;
+	bool value_from_signed;
 };
 
 enum bpf_stack_slot_type {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a86723c5b64..af9e84a4944e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
 {
 	regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
 	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+	regs[regno].value_from_signed = false;
 	regs[regno].min_align = 0;
 }
 
@@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 	return -EACCES;
 }
 
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+			       const struct bpf_reg_state *reg)
 {
-	if (env->allow_ptr_leaks)
+	if (allow_ptr_leaks)
 		return false;
 
-	switch (env->cur_state.regs[regno].type) {
+	switch (reg->type) {
 	case UNKNOWN_VALUE:
 	case CONST_IMM:
 		return false;
@@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
@@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	dst_align = dst_reg->min_align;
 
 	/* We don't know anything about what was done to this register, mark it
-	 * as unknown.
+	 * as unknown. Also, if both derived bounds came from signed/unsigned
+	 * mixed compares and one side is unbounded, we cannot really do anything
+	 * with them as boundaries cannot be trusted. Thus, arithmetic of two
+	 * regs of such kind will get invalidated bounds on the dst side.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE &&
-	    max_val == BPF_REGISTER_MAX_RANGE) {
+	if ((min_val == BPF_REGISTER_MIN_RANGE &&
+	     max_val == BPF_REGISTER_MAX_RANGE) ||
+	    (BPF_SRC(insn->code) == BPF_X &&
+	     ((min_val != BPF_REGISTER_MIN_RANGE &&
+	       max_val == BPF_REGISTER_MAX_RANGE) ||
+	      (min_val == BPF_REGISTER_MIN_RANGE &&
+	       max_val != BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+	     regs[insn->dst_reg].value_from_signed !=
+	     regs[insn->src_reg].value_from_signed)) {
 		reset_reg_range_values(regs, insn->dst_reg);
 		return;
 	}
@@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			regs[insn->dst_reg].max_value = insn->imm;
 			regs[insn->dst_reg].min_value = insn->imm;
 			regs[insn->dst_reg].min_align = calc_align(insn->imm);
+			regs[insn->dst_reg].value_from_signed = false;
 		}
 
 	} else if (opcode > BPF_END) {
@@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
 		 */
 		false_reg->max_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val + 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the mimimum value is val.
 		 */
 		false_reg->max_value = val - 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
 				u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/*
 		 * If this is false, then the val is <= the register, if it is
 		 * true the register <= to the val.
 		 */
 		false_reg->min_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val - 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/* If this is false then constant < register, if it is true then
 		 * the register < constant.
 		 */
 		false_reg->min_value = val + 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-- 
cgit v1.3-14-g43fede


From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 20 Jul 2017 16:14:55 +0200
Subject: perf/core: Fix locking for children siblings group read

We're missing the ctx lock when iterating children siblings
within the perf_read path for group reading. The following
race and crash can happen:

User space doing read syscall on event group leader:

T1:
  perf_read
    lock event->ctx->mutex
    perf_read_group
      lock leader->child_mutex
      __perf_read_group_add(child)
        list_for_each_entry(sub, &leader->sibling_list, group_entry)

---->   sub might be invalid at this point, because it could
        get removed via perf_event_exit_task_context in T2

Child exiting and cleaning up its events:

T2:
  perf_event_exit_task_context
    lock ctx->mutex
    list_for_each_entry_safe(child_event, next, &child_ctx->event_list,...
      perf_event_exit_event(child)
        lock ctx->lock
        perf_group_detach(child)
        unlock ctx->lock

---->   child is removed from sibling_list without any sync
        with T1 path above

        ...
        free_event(child)

Before the child is removed from the leader's child_list
(and thus is omitted from perf_read_group processing), we
need to ensure that perf_read_group touches the child's
siblings under its ctx->lock.

Peter further notes:

| One additional note; this bug got exposed by commit:
|
|   ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP")
|
| which made it possible to actually trigger this code-path.

Tested-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP")
Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9cdbd396770..c17c0881fd36 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
 static int __perf_read_group_add(struct perf_event *leader,
 					u64 read_format, u64 *values)
 {
+	struct perf_event_context *ctx = leader->ctx;
 	struct perf_event *sub;
+	unsigned long flags;
 	int n = 1; /* skip @nr */
 	int ret;
 
@@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		values[n++] += perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 	}
 
+	raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede