From 625ed2bf049d5a352c1bcca962d6e133454eaaff Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 26 Apr 2017 08:27:56 +0200 Subject: sched/cfs: Make util/load_avg more stable In the current implementation of load/util_avg, we assume that the ongoing time segment has fully elapsed, and util/load_sum is divided by LOAD_AVG_MAX, even if part of the time segment still remains to run. As a consequence, this remaining part is considered as idle time and generates unexpected variations of util_avg of a busy CPU in the range [1002..1024[ whereas util_avg should stay at 1023. In order to keep the metric stable, we should not consider the ongoing time segment when computing load/util_avg but only the segments that have already fully elapsed. But to not consider the current time segment adds unwanted latency in the load/util_avg responsivness especially when the time is scaled instead of the contribution. Instead of waiting for the current time segment to have fully elapsed before accounting it in load/util_avg, we can already account the elapsed part but change the range used to compute load/util_avg accordingly. At the very beginning of a new time segment, the past segments have been decayed and the max value is LOAD_AVG_MAX*y. At the very end of the current time segment, the max value becomes: LOAD_AVG_MAX*y + 1024(us) (== LOAD_AVG_MAX) In fact, the max value is: LOAD_AVG_MAX*y + sa->period_contrib at any time in the time segment. Taking advantage of the fact that: LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024 the range becomes [0..LOAD_AVG_MAX-1024+sa->period_contrib]. As the elapsed part is already accounted in load/util_sum, we update the max value according to the current position in the time segment instead of removing its contribution. Suggested-by: Peter Zijlstra Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1493188076-2767-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..4f1825d60937 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2916,12 +2916,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* * Step 2: update *_avg. */ - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); if (cfs_rq) { cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; + sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); return 1; } -- cgit v1.3-14-g43fede From cf15ca8deda86b27b66e27848b4b0fe58098fc0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:11:53 +0200 Subject: sched/clock: Initialize all per-CPU state before switching (back) to unstable In preparation for not keeping the sched_clock_tick() active for stable TSC, we need to explicitly initialize all per-CPU state before switching back to unstable. Note: this patch looses the __gtod_offset calculation; it will be restored in the next one. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 60 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..dc650851935f 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -124,6 +124,12 @@ int sched_clock_stable(void) return static_branch_likely(&__sched_clock_stable); } +static void __scd_stamp(struct sched_clock_data *scd) +{ + scd->tick_gtod = ktime_get_ns(); + scd->tick_raw = sched_clock(); +} + static void __set_sched_clock_stable(void) { struct sched_clock_data *scd = this_scd(); @@ -141,8 +147,37 @@ static void __set_sched_clock_stable(void) tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } +/* + * If we ever get here, we're screwed, because we found out -- typically after + * the fact -- that TSC wasn't good. This means all our clocksources (including + * ktime) could have reported wrong values. + * + * What we do here is an attempt to fix up and continue sort of where we left + * off in a coherent manner. + * + * The only way to fully avoid random clock jumps is to boot with: + * "tsc=unstable". + */ static void __sched_clock_work(struct work_struct *work) { + struct sched_clock_data *scd; + int cpu; + + /* take a current timestamp and set 'now' */ + preempt_disable(); + scd = this_scd(); + __scd_stamp(scd); + scd->clock = scd->tick_gtod + __gtod_offset; + preempt_enable(); + + /* clone to all CPUs */ + for_each_possible_cpu(cpu) + per_cpu(sched_clock_data, cpu) = *scd; + + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, __gtod_offset, + scd->tick_raw, __sched_clock_offset); + static_branch_disable(&__sched_clock_stable); } @@ -150,27 +185,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); static void __clear_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); - - /* - * Attempt to make the stable->unstable transition continuous. - * - * Trouble is, this is typically called from the TSC watchdog - * timer, which is late per definition. This means the tick - * values can already be screwy. - * - * Still do what we can. - */ - __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); - - printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", - scd->tick_gtod, __gtod_offset, - scd->tick_raw, __sched_clock_offset); + if (!sched_clock_stable()) + return; tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); - - if (sched_clock_stable()) - schedule_work(&sched_clock_work); + schedule_work(&sched_clock_work); } void clear_sched_clock_stable(void) @@ -357,8 +376,7 @@ void sched_clock_tick(void) * XXX arguably we can skip this if we expose tsc_clocksource_reliable */ scd = this_scd(); - scd->tick_raw = sched_clock(); - scd->tick_gtod = ktime_get_ns(); + __scd_stamp(scd); if (!sched_clock_stable() && likely(sched_clock_running)) sched_clock_local(scd); -- cgit v1.3-14-g43fede From b421b22b00b0011f6a2ce3561176c4e79e640c49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:14:13 +0200 Subject: x86/tsc, sched/clock, clocksource: Use clocksource watchdog to provide stable sync points Currently we keep sched_clock_tick() active for stable TSC in order to keep the per-CPU state semi up-to-date. The (obvious) problem is that by the time we detect TSC is borked, our per-CPU state is also borked. So hook into the clocksource watchdog and call a method after we've found it to still be stable. There's the obvious race where the TSC goes wonky between finding it stable and us running the callback, but closing that is too much work and not really worth it, since we're already detecting TSC wobbles after the fact, so we cannot, per definition, fully avoid funny clock values. And since the watchdog runs less often than the tick, this is also an optimization. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 10 ++++++++++ include/linux/clocksource.h | 1 + include/linux/sched/clock.h | 2 +- kernel/sched/clock.c | 36 +++++++++++++++++++++++++++--------- kernel/time/clocksource.c | 3 +++ 5 files changed, 42 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 66015195bd18..c1b16b328abe 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1033,6 +1033,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs) pr_info("Marking TSC unstable due to clocksource watchdog\n"); } +static void tsc_cs_tick_stable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + + if (using_native_sched_clock()) + sched_clock_tick_stable(); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1046,6 +1055,7 @@ static struct clocksource clocksource_tsc = { .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, .mark_unstable = tsc_cs_mark_unstable, + .tick_stable = tsc_cs_tick_stable, }; void mark_tsc_unstable(char *reason) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index f2b10d9ebd04..81490456c242 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -96,6 +96,7 @@ struct clocksource { void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); void (*mark_unstable)(struct clocksource *cs); + void (*tick_stable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 34fe92ce1ebd..978cbb0af5f3 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -63,8 +63,8 @@ extern void clear_sched_clock_stable(void); */ extern u64 __sched_clock_offset; - extern void sched_clock_tick(void); +extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index dc650851935f..f861637f7fdc 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -366,20 +366,38 @@ void sched_clock_tick(void) { struct sched_clock_data *scd; + if (sched_clock_stable()) + return; + + if (unlikely(!sched_clock_running)) + return; + WARN_ON_ONCE(!irqs_disabled()); - /* - * Update these values even if sched_clock_stable(), because it can - * become unstable at any point in time at which point we need some - * values to fall back on. - * - * XXX arguably we can skip this if we expose tsc_clocksource_reliable - */ scd = this_scd(); __scd_stamp(scd); + sched_clock_local(scd); +} + +void sched_clock_tick_stable(void) +{ + u64 gtod, clock; - if (!sched_clock_stable() && likely(sched_clock_running)) - sched_clock_local(scd); + if (!sched_clock_stable()) + return; + + /* + * Called under watchdog_lock. + * + * The watchdog just found this TSC to (still) be stable, so now is a + * good moment to update our __gtod_offset. Because once we find the + * TSC to be unstable, any computation will be computing crap. + */ + local_irq_disable(); + gtod = ktime_get_ns(); + clock = sched_clock(); + __gtod_offset = (clock + __sched_clock_offset) - gtod; + local_irq_enable(); } /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { -- cgit v1.3-14-g43fede From ac1e843f0900bea92fcb47f6205e1f9ffb0d469c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:26:23 +0200 Subject: sched/clock: Remove unused argument to sched_clock_idle_wakeup_event() The argument to sched_clock_idle_wakeup_event() has not been used in a long time. Remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- include/linux/sched/clock.h | 4 ++-- kernel/sched/clock.c | 4 ++-- kernel/time/tick-sched.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c1b16b328abe..a3b544264360 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -182,7 +182,7 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 978cbb0af5f3..9c36f0722966 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -39,7 +39,7 @@ static inline void sched_clock_idle_sleep_event(void) { } -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +static inline void sched_clock_idle_wakeup_event(void) { } @@ -66,7 +66,7 @@ extern u64 __sched_clock_offset; extern void sched_clock_tick(void); extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); +extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f861637f7fdc..750a92c9db7e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -410,9 +410,9 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. (called with irqs disabled): */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { if (timekeeping_suspended) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..9c2dc64e31d8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) -- cgit v1.3-14-g43fede From 3067a33d5fec856bb297d58e7f03411d060ccdee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:42:03 +0200 Subject: sched/clock: Remove watchdog touching Commit: 2bacec8c318c ("sched: touch softlockup watchdog after idling") introduced the touch_softlockup_watchdog_sched() call without justification and I feel sched_clock management is not the right place, it should only be concerned with producing semi coherent time. If this causes watchdog thingies, we can find a better place. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 750a92c9db7e..c30c05f05d6f 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -418,7 +418,6 @@ void sched_clock_idle_wakeup_event(void) return; sched_clock_tick(); - touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.3-14-g43fede From f9fccdb9efef60dbcf84d493514b475c41aa866f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:43:59 +0200 Subject: cpuidle: Fix idle time tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ville reported that on his Core2, which has TSC stop in idle, we would always report very short idle durations. He tracked this down to commit: e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()") which replaces ktime_get() with local_clock(). Add a sched_clock_idle_wakeup_event() call, which will re-sync the clock with ktime_get_ns() when TSC is unstable and no-op otherwise. Reported-by: Ville Syrjälä Tested-by: Ville Syrjälä Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()") Signed-off-by: Ingo Molnar --- drivers/cpuidle/cpuidle.c | 1 + kernel/sched/clock.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 2706be7ed334..60bb64f4329d 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -220,6 +220,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, entered_state = target_state->enter(dev, drv, index); start_critical_timings(); + sched_clock_idle_wakeup_event(); time_end = ns_to_ktime(local_clock()); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c30c05f05d6f..d4c2f89fac92 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -410,14 +410,21 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled; resync with ktime. (called with irqs disabled): + * We just idled; resync with ktime. */ void sched_clock_idle_wakeup_event(void) { - if (timekeeping_suspended) + unsigned long flags; + + if (sched_clock_stable()) + return; + + if (unlikely(timekeeping_suspended)) return; + local_irq_save(flags); sched_clock_tick(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -- cgit v1.3-14-g43fede From 2e44b7ddf8ab01cf98106c68388f87af15fbde73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:46:57 +0200 Subject: sched/clock: Use late_initcall() instead of sched_init_smp() Core2 marks its TSC unstable in ACPI Processor Idle, which is probed after sched_init_smp(). Luckily it appears both acpi_processor and intel_idle (which has a similar check) are mandatory built-in. This means we can delay switching to stable until after these drivers have ran (if they were modules, this would be impossible). Delay the stable switch to late_initcall() to allow these drivers to mark TSC unstable and avoid difficult stable->unstable transitions. Reported-by: Lofstedt, Marta Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched/clock.h | 5 ----- kernel/sched/clock.c | 10 +++++++++- kernel/sched/core.c | 2 -- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 9c36f0722966..a55600ffdf4b 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -23,10 +23,6 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline void sched_clock_init_late(void) -{ -} - static inline void sched_clock_tick(void) { } @@ -53,7 +49,6 @@ static inline u64 local_clock(void) return sched_clock(); } #else -extern void sched_clock_init_late(void); extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index d4c2f89fac92..a2f847c6ada8 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -64,6 +64,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -202,7 +203,11 @@ void clear_sched_clock_stable(void) __clear_sched_clock_stable(); } -void sched_clock_init_late(void) +/* + * We run this as late_initcall() such that it runs after all built-in drivers, + * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. + */ +static int __init sched_clock_init_late(void) { sched_clock_running = 2; /* @@ -216,7 +221,10 @@ void sched_clock_init_late(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); + + return 0; } +late_initcall(sched_clock_init_late); /* * min, max except they take wrapping into account diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..5794f4acad15 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5984,7 +5984,6 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); - sched_clock_init_late(); sched_smp_initialized = true; } @@ -6000,7 +5999,6 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); - sched_clock_init_late(); } #endif /* CONFIG_SMP */ -- cgit v1.3-14-g43fede From 7708d5f04de4dd5d2110df3244372b1e3f61bc7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:52:52 +0200 Subject: sched/clock: Print a warning recommending 'tsc=unstable' With our switch to stable delayed until late_initcall(), the most likely cause of hitting mark_tsc_unstable() is the watchdog. The watchdog typically only triggers when creative BIOS'es fiddle with the TSC to hide SMI latency. Since the watchdog can only detect TSC fiddling after the fact all TSC clocks (including userspace GTOD) can already have reported funny values. The only way to fully avoid this, is manually marking the TSC unstable at boot. Suggest people do this on their broken systems. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a2f847c6ada8..1a0d389d2f2b 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -175,6 +175,7 @@ static void __sched_clock_work(struct work_struct *work) for_each_possible_cpu(cpu) per_cpu(sched_clock_data, cpu) = *scd; + printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, scd->tick_raw, __sched_clock_offset); -- cgit v1.3-14-g43fede From 8c0334697dc37eb3d6d7632304d3a3662248daac Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 13 Apr 2017 10:56:07 -0300 Subject: sched/topology: Refactor function build_overlap_sched_groups() Create functions build_group_from_child_sched_domain() and init_overlap_sched_group(). No functional change. Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1492091769-19879-2-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 62 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..d786d45c44d9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -513,6 +513,47 @@ int group_balance_cpu(struct sched_group *sg) return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); } +static struct sched_group * +build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) +{ + struct sched_group *sg; + struct cpumask *sg_span; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(cpu)); + + if (!sg) + return NULL; + + sg_span = sched_group_cpus(sg); + if (sd->child) + cpumask_copy(sg_span, sched_domain_span(sd->child)); + else + cpumask_copy(sg_span, sched_domain_span(sd)); + + return sg; +} + +static void init_overlap_sched_group(struct sched_domain *sd, + struct sched_group *sg, int cpu) +{ + struct sd_data *sdd = sd->private; + struct cpumask *sg_span; + + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + if (atomic_inc_return(&sg->sgc->ref) == 1) + build_group_mask(sd, sg); + + /* + * Initialize sgc->capacity such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg_span = sched_group_cpus(sg); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -537,31 +578,14 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(cpu)); - + sg = build_group_from_child_sched_domain(sibling, cpu); if (!sg) goto fail; sg_span = sched_group_cpus(sg); - if (sibling->child) - cpumask_copy(sg_span, sched_domain_span(sibling->child)); - else - cpumask_set_cpu(i, sg_span); - cpumask_or(covered, covered, sg_span); - sg->sgc = *per_cpu_ptr(sdd->sgc, i); - if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); - - /* - * Initialize sgc->capacity such that even if we mess up the - * domains and no possible iteration will get us here, we won't - * die on a /0 trap. - */ - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + init_overlap_sched_group(sd, sg, i); /* * Make sure the first group of this domain contains the -- cgit v1.3-14-g43fede From c743f0a5c50f2fcbc628526279cfa24f3dabe182 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 14:20:05 +0200 Subject: sched/fair, cpumask: Export for_each_cpu_wrap() More users for for_each_cpu_wrap() have appeared. Promote the construct to generic cpumask interface. The implementation is slightly modified to reduce arguments. Signed-off-by: Peter Zijlstra (Intel) Cc: Lauro Ramos Venancio Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: lwang@redhat.com Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 17 +++++++++++++++++ kernel/sched/fair.c | 45 ++++----------------------------------------- lib/cpumask.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2404ad238c0b..a21b1fb9a968 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsigned int i, int node); (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) +extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); + +/** + * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location + * @cpu: the (optionally unsigned) integer iterator + * @mask: the cpumask poiter + * @start: the start location + * + * The implementation does not assume any bit in @mask is set (including @start). + * + * After the loop, cpu is >= nr_cpu_ids. + */ +#define for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ + (cpu) < nr_cpumask_bits; \ + (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) + /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f1825d60937..f80c825e2b43 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5640,43 +5640,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } -/* - * Implement a for_each_cpu() variant that starts the scan at a given cpu - * (@start), and wraps around. - * - * This is used to scan for idle CPUs; such that not all CPUs looking for an - * idle CPU find the same CPU. The down-side is that tasks tend to cycle - * through the LLC domain. - * - * Especially tbench is found sensitive to this. - */ - -static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) -{ - int next; - -again: - next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); - - if (*wrapped) { - if (next >= start) - return nr_cpumask_bits; - } else { - if (next >= nr_cpumask_bits) { - *wrapped = 1; - n = -1; - goto again; - } - } - - return next; -} - -#define for_each_cpu_wrap(cpu, mask, start, wrap) \ - for ((wrap) = 0, (cpu) = (start)-1; \ - (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ - (cpu) < nr_cpumask_bits; ) - #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -5736,7 +5699,7 @@ unlock: static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu, wrap; + int core, cpu; if (!static_branch_likely(&sched_smt_present)) return -1; @@ -5746,7 +5709,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); - for_each_cpu_wrap(core, cpus, target, wrap) { + for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -5812,7 +5775,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; - int cpu, wrap; + int cpu; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5829,7 +5792,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { + for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) diff --git a/lib/cpumask.c b/lib/cpumask.c index 81dedaab36cc..4731a0895760 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) } EXPORT_SYMBOL(cpumask_any_but); +/** + * cpumask_next_wrap - helper to implement for_each_cpu_wrap + * @n: the cpu prior to the place to search + * @mask: the cpumask pointer + * @start: the start point of the iteration + * @wrap: assume @n crossing @start terminates the iteration + * + * Returns >= nr_cpu_ids on completion + * + * Note: the @wrap argument is required for the start condition when + * we cannot assume @start is set in @mask. + */ +int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) +{ + int next; + +again: + next = cpumask_next(n, mask); + + if (wrap && n < start && next >= start) { + return nr_cpumask_bits; + + } else if (next >= nr_cpumask_bits) { + wrap = true; + n = -1; + goto again; + } + + return next; +} +EXPORT_SYMBOL(cpumask_next_wrap); + /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** -- cgit v1.3-14-g43fede From 0372dd2736e02672ac6e189c31f7d8c02ad543cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:24:02 +0200 Subject: sched/topology: Fix building of overlapping sched-groups When building the overlapping groups, we very obviously should start with the previous domain of _this_ @cpu, not CPU-0. This can be readily demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 Where (for example) CPU1 ends up generating the following nonsensical groups: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 2 0 [] domain 1: span 0-3 level NUMA [] groups: 1-3 (cpu_capacity = 3072) 0-1,3 (cpu_capacity = 3072) Where the fact that domain 1 doesn't include a group with span 0-2 is the obvious fail. With patch this looks like: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 0 2 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (cpu_capacity = 3072) 0,2-3 (cpu_capacity = 3072) Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d786d45c44d9..921dedde2ee1 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -566,7 +566,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct cpumask *sg_span; if (cpumask_test_cpu(i, covered)) -- cgit v1.3-14-g43fede From 91eaed0d61319f58a9f8e43d41a8cbb069b4f73d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:32:07 +0200 Subject: sched/topology: Simplify build_overlap_sched_groups() Now that the first group will always be the previous domain of this @cpu this can be simplified. In fact, writing the code now removed should've been a big clue I was doing it wrong :/ Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 921dedde2ee1..6b10e0a956c7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -557,7 +557,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { - struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + struct sched_group *first = NULL, *last = NULL, *sg; const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; @@ -587,15 +587,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) init_overlap_sched_group(sd, sg, i); - /* - * Make sure the first group of this domain contains the - * canonical balance CPU. Otherwise the sched_domain iteration - * breaks. See update_sg_lb_stats(). - */ - if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - group_balance_cpu(sg) == cpu) - groups = sg; - if (!first) first = sg; if (last) @@ -603,7 +594,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) last = sg; last->next = first; } - sd->groups = groups; + sd->groups = first; return 0; -- cgit v1.3-14-g43fede From b0151c25548cacc50771a7930475727c6c8ee869 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 17:29:16 +0200 Subject: sched/debug: Print the scheduler topology group mask In order to determine the balance_cpu (for should_we_balance()) we need the sched_group_mask() for overlapping domains. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6b10e0a956c7..3d50ee38b8fb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -82,12 +82,22 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); + + if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { + printk(KERN_CONT " (mask: %*pbl)", + cpumask_pr_args(sched_group_mask(group))); + } + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %lu)", + printk(KERN_CONT " (cpu_capacity: %lu)", group->sgc->capacity); } group = group->next; + + if (group != sd->groups) + printk(KERN_CONT ","); + } while (group != sd->groups); printk(KERN_CONT "\n"); -- cgit v1.3-14-g43fede From a420b0630362c2c451060e6187e36d72df827134 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Apr 2017 18:20:48 +0200 Subject: sched/topology: Verify the first group matches the child domain We want sched_groups to be sibling child domains (or individual CPUs when there are no child domains). Furthermore, since the first group of a domain should include the CPU of that domain, the first group of each domain should match the child domain. Verify this is indeed so. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3d50ee38b8fb..81c82031ed95 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -93,6 +93,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->capacity); } + if (group == sd->groups && sd->child && + !cpumask_equal(sched_domain_span(sd->child), + sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); + } + group = group->next; if (group != sd->groups) -- cgit v1.3-14-g43fede From f32d782e31bf079f600dcec126ed117b0577e85c Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 20 Apr 2017 16:51:40 -0300 Subject: sched/topology: Optimize build_group_mask() The group mask is always used in intersection with the group CPUs. So, when building the group mask, we don't have to care about CPUs that are not part of the group. Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lwang@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1492717903-5195-2-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 81c82031ed95..5a4d9aeda258 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -506,12 +506,12 @@ enum s_alloc { */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; -- cgit v1.3-14-g43fede From c20e1ea4b61c3d99a354d912f2d74822fd2a001d Mon Sep 17 00:00:00 2001 From: Lauro Ramos Venancio Date: Thu, 20 Apr 2017 16:51:42 -0300 Subject: sched/topology: Move comment about asymmetric node setups Signed-off-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lwang@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1492717903-5195-4-git-send-email-lvenanci@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5a4d9aeda258..c10f44a1ab2d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -495,14 +495,6 @@ enum s_alloc { /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. - * - * Asymmetric node setups can result in situations where the domain tree is of - * unequal depth, make sure to skip domains that already cover the entire - * range. - * - * In that case build_sched_domains() will have terminated the iteration early - * and our sibling sd spans will be empty. Domains should always include the - * CPU they're built on, so check that. */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { @@ -590,7 +582,16 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sibling = *per_cpu_ptr(sdd->sd, i); - /* See the comment near build_group_mask(). */ + /* + * Asymmetric node setups can result in situations where the + * domain tree is of unequal depth, make sure to skip domains + * that already cover the entire range. + * + * In that case build_sched_domains() will have terminated the + * iteration early and our sibling sd spans will be empty. + * Domains should always include the CPU they're built on, so + * check that. + */ if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; -- cgit v1.3-14-g43fede From af85596c74de2fd9abb87501ae280038ac28a3f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Apr 2017 17:36:41 +0200 Subject: sched/topology: Remove FORCE_SD_OVERLAP Its an obsolete debug mechanism and future code wants to rely on properties this undermines. Namely, it would be good to assume that SD_OVERLAP domains have children, but if we build the entire hierarchy with SD_OVERLAP this is obviously false. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 1 - kernel/sched/topology.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..dc4d1483b038 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -76,7 +76,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c10f44a1ab2d..21efacf547d4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1439,7 +1439,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = build_sched_domain(tl, cpu_map, attr, sd, i); if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + if (tl->flags & SDTL_OVERLAP) sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; -- cgit v1.3-14-g43fede From 73bb059f9b8a00c5e1bf2f7ca83138c05d05e600 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 14:00:49 +0200 Subject: sched/topology: Fix overlapping sched_group_mask The point of sched_group_mask is to select those CPUs from sched_group_cpus that can actually arrive at this balance domain. The current code gets it wrong, as can be readily demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 Where (for example) domain 1 on CPU1 ends up with a mask that includes CPU0: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 (mask: 1), 2, 0 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (mask: 0-2) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072) This causes sched_balance_cpu() to compute the wrong CPU and consequently should_we_balance() will terminate early resulting in missed load-balance opportunities. The fixed topology looks like: [] CPU1 attaching sched-domain: [] domain 0: span 0-2 level NUMA [] groups: 1 (mask: 1), 2, 0 [] domain 1: span 0-3 level NUMA [] groups: 0-2 (mask: 1) (cpu_capacity: 3072), 0,2-3 (cpu_capacity: 3072) (note: this relies on OVERLAP domains to always have children, this is true because the regular topology domains are still here -- this is before degenerate trimming) Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 21efacf547d4..09563c1d1d5b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -495,6 +495,9 @@ enum s_alloc { /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. + * + * Only CPUs that can arrive at this group should be considered to continue + * balancing. */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { @@ -505,11 +508,24 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) + continue; + + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); } /* -- cgit v1.3-14-g43fede From 8d5dc5126bb2bbcebf0b1e061cca2fc02c935620 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 15:29:40 +0200 Subject: sched/topology: Small cleanup Move the allocation of topology specific cpumasks into the topology code. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +--- kernel/sched/sched.h | 4 +--- kernel/sched/topology.c | 7 +++++-- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5794f4acad15..dde5d1e860f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5958,7 +5958,6 @@ void __init sched_init_smp(void) cpumask_var_t non_isolated_cpus; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - alloc_cpumask_var(&fallback_doms, GFP_KERNEL); sched_init_numa(); @@ -5968,7 +5967,7 @@ void __init sched_init_smp(void) * happen. */ mutex_lock(&sched_domains_mutex); - init_sched_domains(cpu_active_mask); + sched_init_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -6197,7 +6196,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..6e1eae717a24 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -606,11 +606,9 @@ struct root_domain { extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; -extern cpumask_var_t fallback_doms; -extern cpumask_var_t sched_domains_tmpmask; extern void init_defrootdomain(void); -extern int init_sched_domains(const struct cpumask *cpu_map); +extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); #endif /* CONFIG_SMP */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 09563c1d1d5b..a4b868c76f3c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1526,7 +1526,7 @@ static struct sched_domain_attr *dattr_cur; * cpumask) fails, then fallback to a single sched domain, * as determined by the single cpumask fallback_doms. */ -cpumask_var_t fallback_doms; +static cpumask_var_t fallback_doms; /* * arch_update_cpu_topology lets virtualized architectures update the @@ -1568,10 +1568,13 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * For now this just excludes isolated CPUs, but could be used to * exclude other special cases in the future. */ -int init_sched_domains(const struct cpumask *cpu_map) +int sched_init_domains(const struct cpumask *cpu_map) { int err; + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = alloc_sched_domains(ndoms_cur); -- cgit v1.3-14-g43fede From 005f874dd2843116e2ea079e3679f4f318f12fee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Apr 2017 17:35:35 +0200 Subject: sched/topology: Add sched_group_capacity debugging Add sgc::id to easier spot domain construction issues. Take the opportunity to slightly rework the group printing, because adding more "(id: %d)" strings makes the entire thing very hard to read. Also the individual groups are very hard to separate, so add explicit visual grouping, which allows replacing all the "(%s: %d)" format things with shorter "%s=%d" variants. Then fix up some inconsistencies in surrounding prints for domains. The end result looks like: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++++ kernel/sched/topology.c | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6e1eae717a24..4312b2adfb02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1023,6 +1023,10 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ +#ifdef CONFIG_SCHED_DEBUG + int id; +#endif + unsigned long cpumask[0]; /* iteration mask */ }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a4b868c76f3c..12af4b157928 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -35,7 +35,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_clear(groupmask); - printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); @@ -45,7 +45,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %*pbl level %s\n", + printk(KERN_CONT "span=%*pbl level=%s\n", cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { @@ -80,18 +80,17 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - printk(KERN_CONT " %*pbl", - cpumask_pr_args(sched_group_cpus(group))); + printk(KERN_CONT " %d:{ span=%*pbl", + group->sgc->id, + cpumask_pr_args(sched_group_cpus(group))); if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { - printk(KERN_CONT " (mask: %*pbl)", + printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(sched_group_mask(group))); } - if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity: %lu)", - group->sgc->capacity); - } + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) + printk(KERN_CONT " cap=%lu", group->sgc->capacity); if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), @@ -99,6 +98,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } + printk(KERN_CONT " }"); + group = group->next; if (group != sd->groups) @@ -129,7 +130,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) return; } - printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); for (;;) { if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) @@ -1356,6 +1357,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; +#ifdef CONFIG_SCHED_DEBUG + sgc->id = j; +#endif + *per_cpu_ptr(sdd->sgc, j) = sgc; } } -- cgit v1.3-14-g43fede From 1676330ecfa840113a37b25a49afda068380d19c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2017 14:31:11 +0200 Subject: sched/topology: Fix overlapping sched_group_capacity When building the overlapping groups we need to attach a consistent sched_group_capacity structure. That is, all 'identical' sched_group's should have the _same_ sched_group_capacity. This can (once again) be demonstrated with a topology like: node 0 1 2 3 0: 10 20 30 20 1: 20 10 20 30 2: 30 20 10 20 3: 20 30 20 10 But we need at least 2 CPUs per node for this to show up, after all, if there is only one CPU per node, our CPU @i is per definition a unique CPU that reaches this domain (aka balance-cpu). Given the above NUMA topo and 2 CPUs per node: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } [] CPU1 attaching sched-domain(s): [] domain-0: span=1,5 level=DIE [] groups: 1:{ span=1 }, 5:{ span=5 } [] domain-1: span=0-2,4-6 level=NUMA [] groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 4:{ span=0,4 mask=0,4 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 } Observe how CPU0-domain1-group0 and CPU1-domain1-group4 are the 'same' but have a different id (0 vs 4). To fix this, use the group balance CPU to select the SGC. This means we have to compute the full mask for each CPU and require a second temporary mask to store the group mask in (it otherwise lives in the SGC). The fixed topology looks like: [] CPU0 attaching sched-domain(s): [] domain-0: span=0,4 level=DIE [] groups: 0:{ span=0 }, 4:{ span=4 } [] domain-1: span=0-1,3-5,7 level=NUMA [] groups: 0:{ span=0,4 mask=0,4 cap=2048 }, 1:{ span=1,5 mask=1,5 cap=2048 }, 3:{ span=3,7 mask=3,7 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 0:{ span=0-1,3-5,7 mask=0,4 cap=6144 }, 2:{ span=1-3,5-7 mask=2,6 cap=6144 } [] CPU1 attaching sched-domain(s): [] domain-0: span=1,5 level=DIE [] groups: 1:{ span=1 }, 5:{ span=5 } [] domain-1: span=0-2,4-6 level=NUMA [] groups: 1:{ span=1,5 mask=1,5 cap=2048 }, 2:{ span=2,6 mask=2,6 cap=2048 }, 0:{ span=0,4 mask=0,4 cap=2048 } [] domain-2: span=0-7 level=NUMA [] groups: 1:{ span=0-2,4-6 mask=1,5 cap=6144 }, 3:{ span=0,2-4,6-7 mask=3,7 cap=6144 } Debugged-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 12af4b157928..4f6fa7553d92 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ cpumask_var_t sched_domains_tmpmask; +cpumask_var_t sched_domains_tmpmask2; #ifdef CONFIG_SCHED_DEBUG @@ -500,13 +501,16 @@ enum s_alloc { * Only CPUs that can arrive at this group should be considered to continue * balancing. */ -static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +static void +build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; + cpumask_clear(mask); + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); @@ -522,11 +526,11 @@ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; - cpumask_set_cpu(i, sched_group_mask(sg)); + cpumask_set_cpu(i, mask); } /* We must not have empty masks here */ - WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); + WARN_ON_ONCE(cpumask_empty(mask)); } /* @@ -560,14 +564,19 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) } static void init_overlap_sched_group(struct sched_domain *sd, - struct sched_group *sg, int cpu) + struct sched_group *sg) { + struct cpumask *mask = sched_domains_tmpmask2; struct sd_data *sdd = sd->private; struct cpumask *sg_span; + int cpu; + + build_group_mask(sd, sg, mask); + cpu = cpumask_first_and(sched_group_cpus(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) - build_group_mask(sd, sg); + cpumask_copy(sched_group_mask(sg), mask); /* * Initialize sgc->capacity such that even if we mess up the @@ -619,7 +628,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) sg_span = sched_group_cpus(sg); cpumask_or(covered, covered, sg_span); - init_overlap_sched_group(sd, sg, i); + init_overlap_sched_group(sd, sg); if (!first) first = sg; @@ -1578,6 +1587,7 @@ int sched_init_domains(const struct cpumask *cpu_map) int err; zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); + zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); arch_update_cpu_topology(); -- cgit v1.3-14-g43fede From 35a566e6e8a18c3bc16229abeac146a707b8f216 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Apr 2017 10:54:26 +0200 Subject: sched/topology: Add a few comments Try and describe what this code is about.. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 193 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4f6fa7553d92..b2790830e184 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -494,12 +494,128 @@ enum s_alloc { sa_none, }; +/* + * Return the canonical balance CPU for this group, this is the first CPU + * of this group that's also in the iteration mask. + * + * The iteration mask are all those CPUs that could actually end up at this + * group. See build_group_mask(). + * + * Also see should_we_balance(). + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + + +/* + * NUMA topology (first read the regular topology blurb below) + * + * Given a node-distance table, for example: + * + * node 0 1 2 3 + * 0: 10 20 30 20 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 20 30 20 10 + * + * which represents a 4 node ring topology like: + * + * 0 ----- 1 + * | | + * | | + * | | + * 3 ----- 2 + * + * We want to construct domains and groups to represent this. The way we go + * about doing this is to build the domains on 'hops'. For each NUMA level we + * construct the mask of all nodes reachable in @level hops. + * + * For the above NUMA topology that gives 3 levels: + * + * NUMA-2 0-3 0-3 0-3 0-3 + * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} + * + * NUMA-1 0-1,3 0-2 1-3 0,2-3 + * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} + * + * NUMA-0 0 1 2 3 + * + * + * As can be seen; things don't nicely line up as with the regular topology. + * When we iterate a domain in child domain chunks some nodes can be + * represented multiple times -- hence the "overlap" naming for this part of + * the topology. + * + * In order to minimize this overlap, we only build enough groups to cover the + * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. + * + * Because: + * + * - the first group of each domain is its child domain; this + * gets us the first 0-1,3 + * - the only uncovered node is 2, who's child domain is 1-3. + * + * However, because of the overlap, computing a unique CPU for each group is + * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both + * groups include the CPUs of Node-0, while those CPUs would not in fact ever + * end up at those groups (they would end up in group: 0-1,3). + * + * To correct this we have to introduce the group iteration mask. This mask + * will contain those CPUs in the group that can reach this group given the + * (child) domain tree. + * + * With this we can once again compute balance_cpu and sched_group_capacity + * relations. + * + * XXX include words on how balance_cpu is unique and therefore can be + * used for sched_group_capacity links. + * + * + * Another 'interesting' topology is: + * + * node 0 1 2 3 + * 0: 10 20 20 30 + * 1: 20 10 20 20 + * 2: 20 20 10 20 + * 3: 30 20 20 10 + * + * Which looks a little like: + * + * 0 ----- 1 + * | / | + * | / | + * | / | + * 2 ----- 3 + * + * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 + * are not. + * + * This leads to a few particularly weird cases where the sched_domain's are + * not of the same number for each cpu. Consider: + * + * NUMA-2 0-3 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-1 0-2 0-3 0-3 1-3 + * + * NUMA-0 0 1 2 3 + * + */ + + /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. * * Only CPUs that can arrive at this group should be considered to continue * balancing. + * + * We do this during the group creation pass, therefore the group information + * isn't complete yet, however since each group represents a (child) domain we + * can fully construct this using the sched_domain bits (which are already + * complete). */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) @@ -534,14 +650,10 @@ build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask } /* - * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * XXX: This creates per-node group entries; since the load-balancer will + * immediately access remote memory to construct this group's load-balance + * statistics having the groups node local is of dubious benefit. */ -int group_balance_cpu(struct sched_group *sg) -{ - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); -} - static struct sched_group * build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) { @@ -577,6 +689,8 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) cpumask_copy(sched_group_mask(sg), mask); + else + WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask)); /* * Initialize sgc->capacity such that even if we mess up the @@ -647,6 +761,78 @@ fail: return -ENOMEM; } + +/* + * Package topology (also see the load-balance blurb in fair.c) + * + * The scheduler builds a tree structure to represent a number of important + * topology features. By default (default_topology[]) these include: + * + * - Simultaneous multithreading (SMT) + * - Multi-Core Cache (MC) + * - Package (DIE) + * + * Where the last one more or less denotes everything up to a NUMA node. + * + * The tree consists of 3 primary data structures: + * + * sched_domain -> sched_group -> sched_group_capacity + * ^ ^ ^ ^ + * `-' `-' + * + * The sched_domains are per-cpu and have a two way link (parent & child) and + * denote the ever growing mask of CPUs belonging to that level of topology. + * + * Each sched_domain has a circular (double) linked list of sched_group's, each + * denoting the domains of the level below (or individual CPUs in case of the + * first domain level). The sched_group linked by a sched_domain includes the + * CPU of that sched_domain [*]. + * + * Take for instance a 2 threaded, 2 core, 2 cache cluster part: + * + * CPU 0 1 2 3 4 5 6 7 + * + * DIE [ ] + * MC [ ] [ ] + * SMT [ ] [ ] [ ] [ ] + * + * - or - + * + * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 + * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 + * + * CPU 0 1 2 3 4 5 6 7 + * + * One way to think about it is: sched_domain moves you up and down among these + * topology levels, while sched_group moves you sideways through it, at child + * domain granularity. + * + * sched_group_capacity ensures each unique sched_group has shared storage. + * + * There are two related construction problems, both require a CPU that + * uniquely identify each group (for a given domain): + * + * - The first is the balance_cpu (see should_we_balance() and the + * load-balance blub in fair.c); for each group we only want 1 CPU to + * continue balancing at a higher domain. + * + * - The second is the sched_group_capacity; we want all identical groups + * to share a single sched_group_capacity. + * + * Since these topologies are exclusive by construction. That is, its + * impossible for an SMT thread to belong to multiple cores, and cores to + * be part of multiple caches. There is a very clear and unique location + * for each CPU in the hierarchy. + * + * Therefore computing a unique CPU for each group is trivial (the iteration + * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ + * group), we can simply pick the first CPU in each group. + * + * + * [*] in other words, the first group of each domain is its child domain. + */ + static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); -- cgit v1.3-14-g43fede From 0c0e776a9b0f21ac41d4c6982c57493928524dba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 May 2017 14:18:06 +0200 Subject: sched/topology: Rewrite get_group() We want to attain: sg_cpus() & sg_mask() == sg_mask() for this to be so we must initialize sg_mask() to sg_cpus() for the !overlap case (its currently cpumask_setall()). Since the code makes my head hurt bad, rewrite it into a simpler form, inspired by the now fixed overlap code. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/topology.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b2790830e184..dea1950b42a5 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -833,23 +833,34 @@ fail: * [*] in other words, the first group of each domain is its child domain. */ -static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +static struct sched_group *get_group(int cpu, struct sd_data *sdd) { struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; + struct sched_group *sg; if (child) cpu = cpumask_first(sched_domain_span(child)); - if (sg) { - *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + sg = *per_cpu_ptr(sdd->sg, cpu); + sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); + + /* For claim_allocations: */ + atomic_inc(&sg->ref); + atomic_inc(&sg->sgc->ref); - /* For claim_allocations: */ - atomic_set(&(*sg)->sgc->ref, 1); + if (child) { + cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); + cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg)); + } else { + cpumask_set_cpu(cpu, sched_group_cpus(sg)); + cpumask_set_cpu(cpu, sched_group_mask(sg)); } - return cpu; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + + return sg; } /* @@ -868,34 +879,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) struct cpumask *covered; int i; - get_group(cpu, sdd, &sd->groups); - atomic_inc(&sd->groups->ref); - - if (cpu != cpumask_first(span)) - return 0; - lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; cpumask_clear(covered); - for_each_cpu(i, span) { + for_each_cpu_wrap(i, span, cpu) { struct sched_group *sg; - int group, j; if (cpumask_test_cpu(i, covered)) continue; - group = get_group(i, sdd, &sg); - cpumask_setall(sched_group_mask(sg)); + sg = get_group(i, sdd); - for_each_cpu(j, span) { - if (get_group(j, sdd, NULL) != group) - continue; - - cpumask_set_cpu(j, covered); - cpumask_set_cpu(j, sched_group_cpus(sg)); - } + cpumask_or(covered, covered, sched_group_cpus(sg)); if (!first) first = sg; @@ -904,6 +901,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) last = sg; } last->next = first; + sd->groups = first; return 0; } -- cgit v1.3-14-g43fede From af218122b103900fa33d408aea0c2468791e698c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 08:51:05 +0200 Subject: sched/topology: Simplify sched_group_mask() usage While writing the comments, it occurred to me that: sg_cpus & sg_mask == sg_mask at least conceptually; the !overlap case sets the all 1s mask. If we correct that we can simplify things and directly use sg_mask. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++---- kernel/sched/topology.c | 5 +++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f80c825e2b43..1eb32d4513ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7996,7 +7996,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_cpus, *sg_mask; + struct cpumask *sg_mask; int cpu, balance_cpu = -1; /* @@ -8006,11 +8006,10 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_cpus = sched_group_cpus(sg); sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + for_each_cpu_and(cpu, sg_mask, env->cpus) { + if (!idle_cpu(cpu)) continue; balance_cpu = cpu; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dea1950b42a5..bf53a99eb511 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -85,7 +85,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_cpus(group))); - if ((sd->flags & SD_OVERLAP) && !cpumask_full(sched_group_mask(group))) { + if ((sd->flags & SD_OVERLAP) && + !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(sched_group_mask(group))); } @@ -505,7 +506,7 @@ enum s_alloc { */ int group_balance_cpu(struct sched_group *sg) { - return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); + return cpumask_first(sched_group_mask(sg)); } -- cgit v1.3-14-g43fede From e5c14b1fb89213ff718261e6fb1bb29c5ffbbe99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 10:47:02 +0200 Subject: sched/topology: Rename sched_group_mask() Since sched_group_mask() is now an independent cpumask (it no longer masks sched_group_cpus()), rename the thing. Suggested-by: Lauro Ramos Venancio Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- kernel/sched/sched.h | 7 +++---- kernel/sched/topology.c | 33 +++++++++++++++------------------ 3 files changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1eb32d4513ea..a7d84c8a7881 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7996,7 +7996,6 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - struct cpumask *sg_mask; int cpu, balance_cpu = -1; /* @@ -8006,9 +8005,8 @@ static int should_we_balance(struct lb_env *env) if (env->idle == CPU_NEWLY_IDLE) return 1; - sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ - for_each_cpu_and(cpu, sg_mask, env->cpus) { + for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { if (!idle_cpu(cpu)) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4312b2adfb02..f7c70575ae34 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1027,7 +1027,7 @@ struct sched_group_capacity { int id; #endif - unsigned long cpumask[0]; /* iteration mask */ + unsigned long cpumask[0]; /* balance mask */ }; struct sched_group { @@ -1054,10 +1054,9 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) } /* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. + * See build_balance_mask(). */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) +static inline struct cpumask *group_balance_mask(struct sched_group *sg) { return to_cpumask(sg->sgc->cpumask); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bf53a99eb511..070191f02035 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -86,9 +86,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_pr_args(sched_group_cpus(group))); if ((sd->flags & SD_OVERLAP) && - !cpumask_equal(sched_group_mask(group), sched_group_cpus(group))) { + !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) { printk(KERN_CONT " mask=%*pbl", - cpumask_pr_args(sched_group_mask(group))); + cpumask_pr_args(group_balance_mask(group))); } if (group->sgc->capacity != SCHED_CAPACITY_SCALE) @@ -497,16 +497,16 @@ enum s_alloc { /* * Return the canonical balance CPU for this group, this is the first CPU - * of this group that's also in the iteration mask. + * of this group that's also in the balance mask. * - * The iteration mask are all those CPUs that could actually end up at this - * group. See build_group_mask(). + * The balance mask are all those CPUs that could actually end up at this + * group. See build_balance_mask(). * * Also see should_we_balance(). */ int group_balance_cpu(struct sched_group *sg) { - return cpumask_first(sched_group_mask(sg)); + return cpumask_first(group_balance_mask(sg)); } @@ -563,7 +563,7 @@ int group_balance_cpu(struct sched_group *sg) * groups include the CPUs of Node-0, while those CPUs would not in fact ever * end up at those groups (they would end up in group: 0-1,3). * - * To correct this we have to introduce the group iteration mask. This mask + * To correct this we have to introduce the group balance mask. This mask * will contain those CPUs in the group that can reach this group given the * (child) domain tree. * @@ -607,11 +607,8 @@ int group_balance_cpu(struct sched_group *sg) /* - * Build an iteration mask that can exclude certain CPUs from the upwards - * domain traversal. - * - * Only CPUs that can arrive at this group should be considered to continue - * balancing. + * Build the balance mask; it contains only those CPUs that can arrive at this + * group and should be considered to continue balancing. * * We do this during the group creation pass, therefore the group information * isn't complete yet, however since each group represents a (child) domain we @@ -619,7 +616,7 @@ int group_balance_cpu(struct sched_group *sg) * complete). */ static void -build_group_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) +build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; @@ -684,14 +681,14 @@ static void init_overlap_sched_group(struct sched_domain *sd, struct cpumask *sg_span; int cpu; - build_group_mask(sd, sg, mask); + build_balance_mask(sd, sg, mask); cpu = cpumask_first_and(sched_group_cpus(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) - cpumask_copy(sched_group_mask(sg), mask); + cpumask_copy(group_balance_mask(sg), mask); else - WARN_ON_ONCE(!cpumask_equal(sched_group_mask(sg), mask)); + WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); /* * Initialize sgc->capacity such that even if we mess up the @@ -852,10 +849,10 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); - cpumask_copy(sched_group_mask(sg), sched_group_cpus(sg)); + cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg)); } else { cpumask_set_cpu(cpu, sched_group_cpus(sg)); - cpumask_set_cpu(cpu, sched_group_mask(sg)); + cpumask_set_cpu(cpu, group_balance_mask(sg)); } sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); -- cgit v1.3-14-g43fede From ae4df9d6c935105857d9d166b615e3f17531ce6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 May 2017 11:03:12 +0200 Subject: sched/topology: Rename sched_group_cpus() There's a discrepancy in naming between the sched_domain and sched_group cpumask accessor. Since we're doing changes, fix it. $ git grep sched_group_cpus | wc -l 28 $ git grep sched_domain_span | wc -l 38 Suggests changing sched_group_cpus() into sched_group_span(): for i in `git grep -l sched_group_cpus` do sed -ie 's/sched_group_cpus/sched_group_span/g' $i done Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++---------- kernel/sched/sched.h | 4 ++-- kernel/sched/topology.c | 38 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7d84c8a7881..eede181b4530 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5484,12 +5484,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int i; /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), + if (!cpumask_intersects(sched_group_span(group), &p->cpus_allowed)) continue; local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + sched_group_span(group)); /* * Tally up the load of all CPUs in the group and find @@ -5499,7 +5499,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, runnable_load = 0; max_spare_cap = 0; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu(i, sched_group_span(group)) { /* Bias balancing toward cpus of our domain */ if (local_group) load = source_load(i, load_idx); @@ -5602,10 +5602,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Check if we have any choice: */ if (group->group_weight == 1) - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -7192,7 +7192,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * span the current group. */ - for_each_cpu(cpu, sched_group_cpus(sdg)) { + for_each_cpu(cpu, sched_group_span(sdg)) { struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); @@ -7371,7 +7371,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); /* Bias balancing toward cpus of our domain */ @@ -7535,7 +7535,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sg_lb_stats *sgs = &tmp_sgs; int local_group; - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); if (local_group) { sds->local = sg; sgs = local; @@ -7890,7 +7890,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; - for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { + for_each_cpu_and(i, sched_group_span(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8043,7 +8043,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), + .dst_grpmask = sched_group_span(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, .cpus = cpus, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f7c70575ae34..f8cf1d87f065 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1048,7 +1048,7 @@ struct sched_group { unsigned long cpumask[0]; }; -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +static inline struct cpumask *sched_group_span(struct sched_group *sg) { return to_cpumask(sg->cpumask); } @@ -1067,7 +1067,7 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) */ static inline unsigned int group_first_cpu(struct sched_group *group) { - return cpumask_first(sched_group_cpus(group)); + return cpumask_first(sched_group_span(group)); } extern int group_balance_cpu(struct sched_group *sg); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 070191f02035..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -53,7 +53,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->span does not contain " "CPU%d\n", cpu); } - if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + if (!cpumask_test_cpu(cpu, sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not contain" " CPU%d\n", cpu); } @@ -66,27 +66,27 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!cpumask_weight(sched_group_cpus(group))) { + if (!cpumask_weight(sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); break; } if (!(sd->flags & SD_OVERLAP) && - cpumask_intersects(groupmask, sched_group_cpus(group))) { + cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + cpumask_or(groupmask, groupmask, sched_group_span(group)); printk(KERN_CONT " %d:{ span=%*pbl", group->sgc->id, - cpumask_pr_args(sched_group_cpus(group))); + cpumask_pr_args(sched_group_span(group))); if ((sd->flags & SD_OVERLAP) && - !cpumask_equal(group_balance_mask(group), sched_group_cpus(group))) { + !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); } @@ -96,7 +96,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (group == sd->groups && sd->child && !cpumask_equal(sched_domain_span(sd->child), - sched_group_cpus(group))) { + sched_group_span(group))) { printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); } @@ -618,7 +618,7 @@ int group_balance_cpu(struct sched_group *sg) static void build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) { - const struct cpumask *sg_span = sched_group_cpus(sg); + const struct cpumask *sg_span = sched_group_span(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; @@ -664,7 +664,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) if (!sg) return NULL; - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); if (sd->child) cpumask_copy(sg_span, sched_domain_span(sd->child)); else @@ -682,7 +682,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, int cpu; build_balance_mask(sd, sg, mask); - cpu = cpumask_first_and(sched_group_cpus(sg), mask); + cpu = cpumask_first_and(sched_group_span(sg), mask); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); if (atomic_inc_return(&sg->sgc->ref) == 1) @@ -695,7 +695,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; } @@ -737,7 +737,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (!sg) goto fail; - sg_span = sched_group_cpus(sg); + sg_span = sched_group_span(sg); cpumask_or(covered, covered, sg_span); init_overlap_sched_group(sd, sg); @@ -848,14 +848,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) atomic_inc(&sg->sgc->ref); if (child) { - cpumask_copy(sched_group_cpus(sg), sched_domain_span(child)); - cpumask_copy(group_balance_mask(sg), sched_group_cpus(sg)); + cpumask_copy(sched_group_span(sg), sched_domain_span(child)); + cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); } else { - cpumask_set_cpu(cpu, sched_group_cpus(sg)); + cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); } - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_cpus(sg)); + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; return sg; @@ -890,7 +890,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) sg = get_group(i, sdd); - cpumask_or(covered, covered, sched_group_cpus(sg)); + cpumask_or(covered, covered, sched_group_span(sg)); if (!first) first = sg; @@ -923,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) do { int cpu, max_cpu = -1; - sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg->group_weight = cpumask_weight(sched_group_span(sg)); if (!(sd->flags & SD_ASYM_PACKING)) goto next; - for_each_cpu(cpu, sched_group_cpus(sg)) { + for_each_cpu(cpu, sched_group_span(sg)) { if (max_cpu < 0) max_cpu = cpu; else if (sched_asym_prefer(cpu, max_cpu)) -- cgit v1.3-14-g43fede From 502ce005ab95d5d9481768649dbab808845b24d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 May 2017 15:31:22 +0200 Subject: sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs In order to allow leaf_cfs_rq_list to remove entries switch the bandwidth hotplug code over to the task_groups list. Suggested-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170504133122.a6qjlj3hlblbjxux@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eede181b4530..c8fa777de76c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4642,24 +4642,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } +/* + * Both these cpu hotplug callbacks race against unregister_fair_sched_group() + * + * The race is harmless, since modifying bandwidth settings of unhooked group + * bits doesn't do much. + */ + +/* cpu online calback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; raw_spin_unlock(&cfs_b->lock); } + rcu_read_unlock(); } +/* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { - struct cfs_rq *cfs_rq; + struct task_group *tg; + + lockdep_assert_held(&rq->lock); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; - for_each_leaf_cfs_rq(rq, cfs_rq) { if (!cfs_rq->runtime_enabled) continue; @@ -4677,6 +4696,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } + rcu_read_unlock(); } #else /* CONFIG_CFS_BANDWIDTH */ -- cgit v1.3-14-g43fede From a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Apr 2017 17:43:50 -0700 Subject: sched/fair: Fix O(nr_cgroups) in load balance path Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all live cfs_rqs which have ever been active on the CPU; unfortunately, this makes update_blocked_averages() O(# total cgroups) which isn't scalable at all. This shows up as a small CPU consumption and scheduling latency increase in the load balancing path in systems with CPU controller enabled across most cgroups. In an edge case where temporary cgroups were leaking, this caused the kernel to consume good several tens of percents of CPU cycles running update_blocked_averages(), each run taking multiple millisecs. This patch fixes the issue by taking empty and fully decayed cfs_rqs off the rq->leaf_cfs_rq_list. Signed-off-by: Tejun Heo [ Added cfs_rq_is_decayed() ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c8fa777de76c..219fe58e3023 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) } /* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -6953,10 +6954,28 @@ static void attach_tasks(struct lb_env *env) } #ifdef CONFIG_FAIR_GROUP_SCHED + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; struct rq_flags rf; rq_lock_irqsave(rq, &rf); @@ -6966,7 +6985,7 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; /* throttled entities do not contribute to load */ @@ -6980,6 +6999,13 @@ static void update_blocked_averages(int cpu) se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) update_load_avg(se, 0); + + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); } rq_unlock_irqrestore(rq, &rf); } @@ -9503,10 +9529,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } -- cgit v1.3-14-g43fede From 7b4ff1adb57ad96d8f12a05d8c661a3d8c4d2be1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 11 May 2017 10:17:45 -0300 Subject: mutex, futex: adjust kernel-doc markups to generate ReST There are a few issues on some kernel-doc markups that was causing troubles with kernel-doc output on ReST format: ./kernel/futex.c:492: WARNING: Inline emphasis start-string without end-string. ./kernel/futex.c:1264: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:1721: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2338: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2426: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2899: WARNING: Block quote ends without a blank line; unexpected unindent. ./kernel/futex.c:2972: WARNING: Block quote ends without a blank line; unexpected unindent. Fix them. No functional changes. Acked-by: Darren Hart (VMware) Signed-off-by: Mauro Carvalho Chehab --- include/linux/mutex.h | 6 +++--- kernel/futex.c | 40 ++++++++++++++++++++-------------------- kernel/locking/mutex.c | 6 +++--- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 1127fe31645d..ffcba1f337da 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -214,9 +214,9 @@ enum mutex_trylock_recursive_enum { * raisins, and once those are gone this will be removed. * * Returns: - * MUTEX_TRYLOCK_FAILED - trylock failed, - * MUTEX_TRYLOCK_SUCCESS - lock acquired, - * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. + * - MUTEX_TRYLOCK_FAILED - trylock failed, + * - MUTEX_TRYLOCK_SUCCESS - lock acquired, + * - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. */ static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum mutex_trylock_recursive(struct mutex *lock) diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..b8ae87d227da 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key) * * Return: a negative error code or 0 * - * The key words are stored in *key on success. + * The key words are stored in @key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), * offset_within_page). For private mappings, it's (uaddr, current->mm). @@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Return: - * 0 - ready to wait; - * 1 - acquired the lock; - * <0 - error + * - 0 - ready to wait; + * - 1 - acquired the lock; + * - <0 - error * * The hb->lock and futex_key refs shall be held by the caller. */ @@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * hb1 and hb2 must be held by the caller. * * Return: - * 0 - failed to acquire the lock atomically; - * >0 - acquired the lock, return value is vpid of the top_waiter - * <0 - error + * - 0 - failed to acquire the lock atomically; + * - >0 - acquired the lock, return value is vpid of the top_waiter + * - <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, @@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * uaddr2 atomically on behalf of the top waiter. * * Return: - * >=0 - on success, the number of tasks requeued or woken; - * <0 - on error + * - >=0 - on success, the number of tasks requeued or woken; + * - <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_requeue, @@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * be paired with exactly one earlier call to queue_me(). * * Return: - * 1 - if the futex_q was still queued (and we removed unqueued it); - * 0 - if the futex_q was already removed by the waking thread + * - 1 - if the futex_q was still queued (and we removed unqueued it); + * - 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) { @@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart); * acquire the lock. Must be called with the hb lock held. * * Return: - * 1 - success, lock taken; - * 0 - success, lock not taken; - * <0 - on error (-EFAULT) + * - 1 - success, lock taken; + * - 0 - success, lock not taken; + * - <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { @@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * with no q.key reference on failure. * * Return: - * 0 - uaddr contains val and hb has been locked; - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + * - 0 - uaddr contains val and hb has been locked; + * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2895,8 +2895,8 @@ pi_faulted: * called with the hb lock held. * * Return: - * 0 = no early wakeup detected; - * <0 = -ETIMEDOUT or -ERESTARTNOINTR + * - 0 = no early wakeup detected; + * - <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * If 4 or 7, we cleanup and return with -ETIMEDOUT. * * Return: - * 0 - On success; - * <0 - On error + * - 0 - On success; + * - <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 198527a62149..858a07590e39 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock); * (or statically defined) before it can be locked. memset()-ing * the mutex to 0 is not allowed. * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) + * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging) * * This function is similar to (but not equivalent to) down(). */ -- cgit v1.3-14-g43fede From c0c6e0850514c16814c37f64a9a1bcc6c52a19ab Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 14 May 2017 12:03:39 -0300 Subject: irq: update genericirq book location This book got converted from DocBook. Update its references to point to the current location. Signed-off-by: Mauro Carvalho Chehab --- kernel/irq/chip.c | 2 +- kernel/irq/handle.c | 2 +- kernel/irq/irqdesc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 686be4b73018..4188a0a7691f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -7,7 +7,7 @@ * This file contains the core interrupt handling code, for irq-chip * based architectures. * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst */ #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..bbf9a7174283 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -6,7 +6,7 @@ * * This file contains the core interrupt handling code. * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..22e443133987 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -4,7 +4,7 @@ * * This file contains the interrupt descriptor management code * - * Detailed information is available in Documentation/DocBook/genericirq + * Detailed information is available in Documentation/core-api/genericirq.rst * */ #include -- cgit v1.3-14-g43fede From ce6cf9a15d62fd7ee92f4f9bb754883bacf85a3e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 May 2017 16:36:19 +0200 Subject: nohz: Add hrtimer sanity check Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..d212bb62bc08 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -771,8 +771,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && (expires == dev->next_event)) - goto out; + if (ts->tick_stopped) { + if (hrtimer_active(&ts->sched_timer)) + WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event); + + if (expires == dev->next_event) + goto out; + } /* * nohz_stop_sched_tick can be called several times before -- cgit v1.3-14-g43fede From 411fe24e6b7c283c3a1911450cdba6dd3aaea56e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 21 Apr 2017 16:00:54 +0200 Subject: nohz: Fix collision between tick and other hrtimers, again This restores commit: 24b91e360ef5: ("nohz: Fix collision between tick and other hrtimers") ... which got reverted by commit: 558e8e27e73f: ('Revert "nohz: Fix collision between tick and other hrtimers"') ... due to a regression where CPUs spuriously stopped ticking. The bug happened when a tick fired too early past its expected expiration: on IRQ exit the tick was scheduled again to the same deadline but skipped reprogramming because ts->next_tick still kept in cache the deadline. This has been fixed now with resetting ts->next_tick from the tick itself. Extra care has also been taken to prevent from obsolete values throughout CPU hotplug operations. When the tick is stopped and an interrupt occurs afterward, we check on that interrupt exit if the next tick needs to be rescheduled. If it doesn't need any update, we don't want to do anything. In order to check if the tick needs an update, we compare it against the clockevent device deadline. Now that's a problem because the clockevent device is at a lower level than the tick itself if it is implemented on top of hrtimer. Every hrtimer share this clockevent device. So comparing the next tick deadline against the clockevent device deadline is wrong because the device may be programmed for another hrtimer whose deadline collides with the tick. As a result we may end up not reprogramming the tick accidentally. In a worst case scenario under full dynticks mode, the tick stops firing as it is supposed to every 1hz, leaving /proc/stat stalled: Task in a full dynticks CPU ---------------------------- * hrtimer A is queued 2 seconds ahead * the tick is stopped, scheduled 1 second ahead * tick fires 1 second later * on tick exit, nohz schedules the tick 1 second ahead but sees the clockevent device is already programmed to that deadline, fooled by hrtimer A, the tick isn't rescheduled. * hrtimer A is cancelled before its deadline * tick never fires again until an interrupt happens... In order to fix this, store the next tick deadline to the tick_sched local structure and reuse that value later to check whether we need to reprogram the clock after an interrupt. On the other hand, ts->sleep_length still wants to know about the next clock event and not just the tick, so we want to improve the related comment to avoid confusion. Reported-and-tested-by: Tim Wright Reported-and-tested-by: Pavel Machek Reported-by: James Hartsock Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1492783255-5051-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 37 +++++++++++++++++++++++++++++++------ kernel/time/tick-sched.h | 2 ++ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d212bb62bc08..764d2905e6a5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; + /* + * In case the current tick fired too early past its expected + * expiration, make sure we don't bypass the next clock reprogramming + * to the same deadline. + */ + ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + + /* + * Reset to make sure next tick stop doesn't get fooled by past + * cached clock deadline. + */ + ts->next_tick = 0; } static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, @@ -771,12 +783,15 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, tick = expires; /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped) { - if (hrtimer_active(&ts->sched_timer)) - WARN_ON_ONCE(hrtimer_get_expires(&ts->sched_timer) < dev->next_event); - - if (expires == dev->next_event) + if (ts->tick_stopped && (expires == ts->next_tick)) { + /* Sanity check: make sure clockevent is actually programmed */ + if (likely(dev->next_event <= ts->next_tick)) goto out; + + WARN_ON_ONCE(1); + printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", + basemono, ts->next_tick, dev->next_event, + hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* @@ -796,6 +811,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, trace_tick_stop(1, TICK_DEP_MASK_NONE); } + ts->next_tick = tick; + /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. @@ -811,7 +828,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, else tick_program_event(tick, 1); out: - /* Update the estimated sleep length */ + /* + * Update the estimated sleep length until the next timer + * (not only the tick). + */ ts->sleep_length = ktime_sub(dev->next_event, now); return tick; } @@ -869,6 +889,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; + /* + * Make sure the CPU doesn't get fooled by obsolete tick + * deadline if it comes back online later. + */ + ts->next_tick = 0; return false; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -27,6 +27,7 @@ enum tick_nohz_mode { * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. + * @next_tick: Next tick to be fired when in dynticks mode. * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls @@ -44,6 +45,7 @@ struct tick_sched { unsigned long check_clocks; enum tick_nohz_mode nohz_mode; ktime_t last_tick; + ktime_t next_tick; int inidle; int tick_stopped; unsigned long idle_jiffies; -- cgit v1.3-14-g43fede From af777cd1b83e95138e7285fde87c795ef0ae7c4d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 13 May 2017 04:51:40 -0700 Subject: doc: ReSTify credentials.txt This updates the credentials API documentation to ReST markup and moves it under the security subsection of kernel API documentation. Cc: David Howells Signed-off-by: Kees Cook Signed-off-by: Jonathan Corbet --- Documentation/security/00-INDEX | 2 - Documentation/security/credentials.rst | 554 +++++++++++++++++++++++++++++++ Documentation/security/credentials.txt | 581 --------------------------------- Documentation/security/index.rst | 1 + include/linux/cred.h | 2 +- kernel/cred.c | 2 +- 6 files changed, 557 insertions(+), 585 deletions(-) create mode 100644 Documentation/security/credentials.rst delete mode 100644 Documentation/security/credentials.txt (limited to 'kernel') diff --git a/Documentation/security/00-INDEX b/Documentation/security/00-INDEX index 414235c1fcfc..c4df62a9ae5b 100644 --- a/Documentation/security/00-INDEX +++ b/Documentation/security/00-INDEX @@ -10,8 +10,6 @@ Yama.txt - documentation on the Yama Linux Security Module. apparmor.txt - documentation on the AppArmor security extension. -credentials.txt - - documentation about credentials in Linux. keys-ecryptfs.txt - description of the encryption keys for the ecryptfs filesystem. keys-request-key.txt diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst new file mode 100644 index 000000000000..038a7e19eff9 --- /dev/null +++ b/Documentation/security/credentials.rst @@ -0,0 +1,554 @@ +==================== +Credentials in Linux +==================== + +By: David Howells + +.. contents:: :local: + +Overview +======== + +There are several parts to the security check performed by Linux when one +object acts upon another: + + 1. Objects. + + Objects are things in the system that may be acted upon directly by + userspace programs. Linux has a variety of actionable objects, including: + + - Tasks + - Files/inodes + - Sockets + - Message queues + - Shared memory segments + - Semaphores + - Keys + + As a part of the description of all these objects there is a set of + credentials. What's in the set depends on the type of object. + + 2. Object ownership. + + Amongst the credentials of most objects, there will be a subset that + indicates the ownership of that object. This is used for resource + accounting and limitation (disk quotas and task rlimits for example). + + In a standard UNIX filesystem, for instance, this will be defined by the + UID marked on the inode. + + 3. The objective context. + + Also amongst the credentials of those objects, there will be a subset that + indicates the 'objective context' of that object. This may or may not be + the same set as in (2) - in standard UNIX files, for instance, this is the + defined by the UID and the GID marked on the inode. + + The objective context is used as part of the security calculation that is + carried out when an object is acted upon. + + 4. Subjects. + + A subject is an object that is acting upon another object. + + Most of the objects in the system are inactive: they don't act on other + objects within the system. Processes/tasks are the obvious exception: + they do stuff; they access and manipulate things. + + Objects other than tasks may under some circumstances also be subjects. + For instance an open file may send SIGIO to a task using the UID and EUID + given to it by a task that called ``fcntl(F_SETOWN)`` upon it. In this case, + the file struct will have a subjective context too. + + 5. The subjective context. + + A subject has an additional interpretation of its credentials. A subset + of its credentials forms the 'subjective context'. The subjective context + is used as part of the security calculation that is carried out when a + subject acts. + + A Linux task, for example, has the FSUID, FSGID and the supplementary + group list for when it is acting upon a file - which are quite separate + from the real UID and GID that normally form the objective context of the + task. + + 6. Actions. + + Linux has a number of actions available that a subject may perform upon an + object. The set of actions available depends on the nature of the subject + and the object. + + Actions include reading, writing, creating and deleting files; forking or + signalling and tracing tasks. + + 7. Rules, access control lists and security calculations. + + When a subject acts upon an object, a security calculation is made. This + involves taking the subjective context, the objective context and the + action, and searching one or more sets of rules to see whether the subject + is granted or denied permission to act in the desired manner on the + object, given those contexts. + + There are two main sources of rules: + + a. Discretionary access control (DAC): + + Sometimes the object will include sets of rules as part of its + description. This is an 'Access Control List' or 'ACL'. A Linux + file may supply more than one ACL. + + A traditional UNIX file, for example, includes a permissions mask that + is an abbreviated ACL with three fixed classes of subject ('user', + 'group' and 'other'), each of which may be granted certain privileges + ('read', 'write' and 'execute' - whatever those map to for the object + in question). UNIX file permissions do not allow the arbitrary + specification of subjects, however, and so are of limited use. + + A Linux file might also sport a POSIX ACL. This is a list of rules + that grants various permissions to arbitrary subjects. + + b. Mandatory access control (MAC): + + The system as a whole may have one or more sets of rules that get + applied to all subjects and objects, regardless of their source. + SELinux and Smack are examples of this. + + In the case of SELinux and Smack, each object is given a label as part + of its credentials. When an action is requested, they take the + subject label, the object label and the action and look for a rule + that says that this action is either granted or denied. + + +Types of Credentials +==================== + +The Linux kernel supports the following types of credentials: + + 1. Traditional UNIX credentials. + + - Real User ID + - Real Group ID + + The UID and GID are carried by most, if not all, Linux objects, even if in + some cases it has to be invented (FAT or CIFS files for example, which are + derived from Windows). These (mostly) define the objective context of + that object, with tasks being slightly different in some cases. + + - Effective, Saved and FS User ID + - Effective, Saved and FS Group ID + - Supplementary groups + + These are additional credentials used by tasks only. Usually, an + EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID + will be used as the objective. For tasks, it should be noted that this is + not always true. + + 2. Capabilities. + + - Set of permitted capabilities + - Set of inheritable capabilities + - Set of effective capabilities + - Capability bounding set + + These are only carried by tasks. They indicate superior capabilities + granted piecemeal to a task that an ordinary task wouldn't otherwise have. + These are manipulated implicitly by changes to the traditional UNIX + credentials, but can also be manipulated directly by the ``capset()`` + system call. + + The permitted capabilities are those caps that the process might grant + itself to its effective or permitted sets through ``capset()``. This + inheritable set might also be so constrained. + + The effective capabilities are the ones that a task is actually allowed to + make use of itself. + + The inheritable capabilities are the ones that may get passed across + ``execve()``. + + The bounding set limits the capabilities that may be inherited across + ``execve()``, especially when a binary is executed that will execute as + UID 0. + + 3. Secure management flags (securebits). + + These are only carried by tasks. These govern the way the above + credentials are manipulated and inherited over certain operations such as + execve(). They aren't used directly as objective or subjective + credentials. + + 4. Keys and keyrings. + + These are only carried by tasks. They carry and cache security tokens + that don't fit into the other standard UNIX credentials. They are for + making such things as network filesystem keys available to the file + accesses performed by processes, without the necessity of ordinary + programs having to know about security details involved. + + Keyrings are a special type of key. They carry sets of other keys and can + be searched for the desired key. Each process may subscribe to a number + of keyrings: + + Per-thread keying + Per-process keyring + Per-session keyring + + When a process accesses a key, if not already present, it will normally be + cached on one of these keyrings for future accesses to find. + + For more information on using keys, see Documentation/security/keys.txt. + + 5. LSM + + The Linux Security Module allows extra controls to be placed over the + operations that a task may do. Currently Linux supports several LSM + options. + + Some work by labelling the objects in a system and then applying sets of + rules (policies) that say what operations a task with one label may do to + an object with another label. + + 6. AF_KEY + + This is a socket-based approach to credential management for networking + stacks [RFC 2367]. It isn't discussed by this document as it doesn't + interact directly with task and file credentials; rather it keeps system + level credentials. + + +When a file is opened, part of the opening task's subjective context is +recorded in the file struct created. This allows operations using that file +struct to use those credentials instead of the subjective context of the task +that issued the operation. An example of this would be a file opened on a +network filesystem where the credentials of the opened file should be presented +to the server, regardless of who is actually doing a read or a write upon it. + + +File Markings +============= + +Files on disk or obtained over the network may have annotations that form the +objective security context of that file. Depending on the type of filesystem, +this may include one or more of the following: + + * UNIX UID, GID, mode; + * Windows user ID; + * Access control list; + * LSM security label; + * UNIX exec privilege escalation bits (SUID/SGID); + * File capabilities exec privilege escalation bits. + +These are compared to the task's subjective security context, and certain +operations allowed or disallowed as a result. In the case of execve(), the +privilege escalation bits come into play, and may allow the resulting process +extra privileges, based on the annotations on the executable file. + + +Task Credentials +================ + +In Linux, all of a task's credentials are held in (uid, gid) or through +(groups, keys, LSM security) a refcounted structure of type 'struct cred'. +Each task points to its credentials by a pointer called 'cred' in its +task_struct. + +Once a set of credentials has been prepared and committed, it may not be +changed, barring the following exceptions: + + 1. its reference count may be changed; + + 2. the reference count on the group_info struct it points to may be changed; + + 3. the reference count on the security data it points to may be changed; + + 4. the reference count on any keyrings it points to may be changed; + + 5. any keyrings it points to may be revoked, expired or have their security + attributes changed; and + + 6. the contents of any keyrings to which it points may be changed (the whole + point of keyrings being a shared set of credentials, modifiable by anyone + with appropriate access). + +To alter anything in the cred struct, the copy-and-replace principle must be +adhered to. First take a copy, then alter the copy and then use RCU to change +the task pointer to make it point to the new copy. There are wrappers to aid +with this (see below). + +A task may only alter its _own_ credentials; it is no longer permitted for a +task to alter another's credentials. This means the ``capset()`` system call +is no longer permitted to take any PID other than the one of the current +process. Also ``keyctl_instantiate()`` and ``keyctl_negate()`` functions no +longer permit attachment to process-specific keyrings in the requesting +process as the instantiating process may need to create them. + + +Immutable Credentials +--------------------- + +Once a set of credentials has been made public (by calling ``commit_creds()`` +for example), it must be considered immutable, barring two exceptions: + + 1. The reference count may be altered. + + 2. Whilst the keyring subscriptions of a set of credentials may not be + changed, the keyrings subscribed to may have their contents altered. + +To catch accidental credential alteration at compile time, struct task_struct +has _const_ pointers to its credential sets, as does struct file. Furthermore, +certain functions such as ``get_cred()`` and ``put_cred()`` operate on const +pointers, thus rendering casts unnecessary, but require to temporarily ditch +the const qualification to be able to alter the reference count. + + +Accessing Task Credentials +-------------------------- + +A task being able to alter only its own credentials permits the current process +to read or replace its own credentials without the need for any form of locking +-- which simplifies things greatly. It can just call:: + + const struct cred *current_cred() + +to get a pointer to its credentials structure, and it doesn't have to release +it afterwards. + +There are convenience wrappers for retrieving specific aspects of a task's +credentials (the value is simply returned in each case):: + + uid_t current_uid(void) Current's real UID + gid_t current_gid(void) Current's real GID + uid_t current_euid(void) Current's effective UID + gid_t current_egid(void) Current's effective GID + uid_t current_fsuid(void) Current's file access UID + gid_t current_fsgid(void) Current's file access GID + kernel_cap_t current_cap(void) Current's effective capabilities + void *current_security(void) Current's LSM security pointer + struct user_struct *current_user(void) Current's user account + +There are also convenience wrappers for retrieving specific associated pairs of +a task's credentials:: + + void current_uid_gid(uid_t *, gid_t *); + void current_euid_egid(uid_t *, gid_t *); + void current_fsuid_fsgid(uid_t *, gid_t *); + +which return these pairs of values through their arguments after retrieving +them from the current task's credentials. + + +In addition, there is a function for obtaining a reference on the current +process's current set of credentials:: + + const struct cred *get_current_cred(void); + +and functions for getting references to one of the credentials that don't +actually live in struct cred:: + + struct user_struct *get_current_user(void); + struct group_info *get_current_groups(void); + +which get references to the current process's user accounting structure and +supplementary groups list respectively. + +Once a reference has been obtained, it must be released with ``put_cred()``, +``free_uid()`` or ``put_group_info()`` as appropriate. + + +Accessing Another Task's Credentials +------------------------------------ + +Whilst a task may access its own credentials without the need for locking, the +same is not true of a task wanting to access another task's credentials. It +must use the RCU read lock and ``rcu_dereference()``. + +The ``rcu_dereference()`` is wrapped by:: + + const struct cred *__task_cred(struct task_struct *task); + +This should be used inside the RCU read lock, as in the following example:: + + void foo(struct task_struct *t, struct foo_data *f) + { + const struct cred *tcred; + ... + rcu_read_lock(); + tcred = __task_cred(t); + f->uid = tcred->uid; + f->gid = tcred->gid; + f->groups = get_group_info(tcred->groups); + rcu_read_unlock(); + ... + } + +Should it be necessary to hold another task's credentials for a long period of +time, and possibly to sleep whilst doing so, then the caller should get a +reference on them using:: + + const struct cred *get_task_cred(struct task_struct *task); + +This does all the RCU magic inside of it. The caller must call put_cred() on +the credentials so obtained when they're finished with. + +.. note:: + The result of ``__task_cred()`` should not be passed directly to + ``get_cred()`` as this may race with ``commit_cred()``. + +There are a couple of convenience functions to access bits of another task's +credentials, hiding the RCU magic from the caller:: + + uid_t task_uid(task) Task's real UID + uid_t task_euid(task) Task's effective UID + +If the caller is holding the RCU read lock at the time anyway, then:: + + __task_cred(task)->uid + __task_cred(task)->euid + +should be used instead. Similarly, if multiple aspects of a task's credentials +need to be accessed, RCU read lock should be used, ``__task_cred()`` called, +the result stored in a temporary pointer and then the credential aspects called +from that before dropping the lock. This prevents the potentially expensive +RCU magic from being invoked multiple times. + +Should some other single aspect of another task's credentials need to be +accessed, then this can be used:: + + task_cred_xxx(task, member) + +where 'member' is a non-pointer member of the cred struct. For instance:: + + uid_t task_cred_xxx(task, suid); + +will retrieve 'struct cred::suid' from the task, doing the appropriate RCU +magic. This may not be used for pointer members as what they point to may +disappear the moment the RCU read lock is dropped. + + +Altering Credentials +-------------------- + +As previously mentioned, a task may only alter its own credentials, and may not +alter those of another task. This means that it doesn't need to use any +locking to alter its own credentials. + +To alter the current process's credentials, a function should first prepare a +new set of credentials by calling:: + + struct cred *prepare_creds(void); + +this locks current->cred_replace_mutex and then allocates and constructs a +duplicate of the current process's credentials, returning with the mutex still +held if successful. It returns NULL if not successful (out of memory). + +The mutex prevents ``ptrace()`` from altering the ptrace state of a process +whilst security checks on credentials construction and changing is taking place +as the ptrace state may alter the outcome, particularly in the case of +``execve()``. + +The new credentials set should be altered appropriately, and any security +checks and hooks done. Both the current and the proposed sets of credentials +are available for this purpose as current_cred() will return the current set +still at this point. + + +When the credential set is ready, it should be committed to the current process +by calling:: + + int commit_creds(struct cred *new); + +This will alter various aspects of the credentials and the process, giving the +LSM a chance to do likewise, then it will use ``rcu_assign_pointer()`` to +actually commit the new credentials to ``current->cred``, it will release +``current->cred_replace_mutex`` to allow ``ptrace()`` to take place, and it +will notify the scheduler and others of the changes. + +This function is guaranteed to return 0, so that it can be tail-called at the +end of such functions as ``sys_setresuid()``. + +Note that this function consumes the caller's reference to the new credentials. +The caller should _not_ call ``put_cred()`` on the new credentials afterwards. + +Furthermore, once this function has been called on a new set of credentials, +those credentials may _not_ be changed further. + + +Should the security checks fail or some other error occur after +``prepare_creds()`` has been called, then the following function should be +invoked:: + + void abort_creds(struct cred *new); + +This releases the lock on ``current->cred_replace_mutex`` that +``prepare_creds()`` got and then releases the new credentials. + + +A typical credentials alteration function would look something like this:: + + int alter_suid(uid_t suid) + { + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + new->suid = suid; + ret = security_alter_suid(new); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); + } + + +Managing Credentials +-------------------- + +There are some functions to help manage credentials: + + - ``void put_cred(const struct cred *cred);`` + + This releases a reference to the given set of credentials. If the + reference count reaches zero, the credentials will be scheduled for + destruction by the RCU system. + + - ``const struct cred *get_cred(const struct cred *cred);`` + + This gets a reference on a live set of credentials, returning a pointer to + that set of credentials. + + - ``struct cred *get_new_cred(struct cred *cred);`` + + This gets a reference on a set of credentials that is under construction + and is thus still mutable, returning a pointer to that set of credentials. + + +Open File Credentials +===================== + +When a new file is opened, a reference is obtained on the opening task's +credentials and this is attached to the file struct as ``f_cred`` in place of +``f_uid`` and ``f_gid``. Code that used to access ``file->f_uid`` and +``file->f_gid`` should now access ``file->f_cred->fsuid`` and +``file->f_cred->fsgid``. + +It is safe to access ``f_cred`` without the use of RCU or locking because the +pointer will not change over the lifetime of the file struct, and nor will the +contents of the cred struct pointed to, barring the exceptions listed above +(see the Task Credentials section). + + +Overriding the VFS's Use of Credentials +======================================= + +Under some circumstances it is desirable to override the credentials used by +the VFS, and that can be done by calling into such as ``vfs_mkdir()`` with a +different set of credentials. This is done in the following places: + + * ``sys_faccessat()``. + * ``do_coredump()``. + * nfs4recover.c. diff --git a/Documentation/security/credentials.txt b/Documentation/security/credentials.txt deleted file mode 100644 index 86257052e31a..000000000000 --- a/Documentation/security/credentials.txt +++ /dev/null @@ -1,581 +0,0 @@ - ==================== - CREDENTIALS IN LINUX - ==================== - -By: David Howells - -Contents: - - (*) Overview. - - (*) Types of credentials. - - (*) File markings. - - (*) Task credentials. - - - Immutable credentials. - - Accessing task credentials. - - Accessing another task's credentials. - - Altering credentials. - - Managing credentials. - - (*) Open file credentials. - - (*) Overriding the VFS's use of credentials. - - -======== -OVERVIEW -======== - -There are several parts to the security check performed by Linux when one -object acts upon another: - - (1) Objects. - - Objects are things in the system that may be acted upon directly by - userspace programs. Linux has a variety of actionable objects, including: - - - Tasks - - Files/inodes - - Sockets - - Message queues - - Shared memory segments - - Semaphores - - Keys - - As a part of the description of all these objects there is a set of - credentials. What's in the set depends on the type of object. - - (2) Object ownership. - - Amongst the credentials of most objects, there will be a subset that - indicates the ownership of that object. This is used for resource - accounting and limitation (disk quotas and task rlimits for example). - - In a standard UNIX filesystem, for instance, this will be defined by the - UID marked on the inode. - - (3) The objective context. - - Also amongst the credentials of those objects, there will be a subset that - indicates the 'objective context' of that object. This may or may not be - the same set as in (2) - in standard UNIX files, for instance, this is the - defined by the UID and the GID marked on the inode. - - The objective context is used as part of the security calculation that is - carried out when an object is acted upon. - - (4) Subjects. - - A subject is an object that is acting upon another object. - - Most of the objects in the system are inactive: they don't act on other - objects within the system. Processes/tasks are the obvious exception: - they do stuff; they access and manipulate things. - - Objects other than tasks may under some circumstances also be subjects. - For instance an open file may send SIGIO to a task using the UID and EUID - given to it by a task that called fcntl(F_SETOWN) upon it. In this case, - the file struct will have a subjective context too. - - (5) The subjective context. - - A subject has an additional interpretation of its credentials. A subset - of its credentials forms the 'subjective context'. The subjective context - is used as part of the security calculation that is carried out when a - subject acts. - - A Linux task, for example, has the FSUID, FSGID and the supplementary - group list for when it is acting upon a file - which are quite separate - from the real UID and GID that normally form the objective context of the - task. - - (6) Actions. - - Linux has a number of actions available that a subject may perform upon an - object. The set of actions available depends on the nature of the subject - and the object. - - Actions include reading, writing, creating and deleting files; forking or - signalling and tracing tasks. - - (7) Rules, access control lists and security calculations. - - When a subject acts upon an object, a security calculation is made. This - involves taking the subjective context, the objective context and the - action, and searching one or more sets of rules to see whether the subject - is granted or denied permission to act in the desired manner on the - object, given those contexts. - - There are two main sources of rules: - - (a) Discretionary access control (DAC): - - Sometimes the object will include sets of rules as part of its - description. This is an 'Access Control List' or 'ACL'. A Linux - file may supply more than one ACL. - - A traditional UNIX file, for example, includes a permissions mask that - is an abbreviated ACL with three fixed classes of subject ('user', - 'group' and 'other'), each of which may be granted certain privileges - ('read', 'write' and 'execute' - whatever those map to for the object - in question). UNIX file permissions do not allow the arbitrary - specification of subjects, however, and so are of limited use. - - A Linux file might also sport a POSIX ACL. This is a list of rules - that grants various permissions to arbitrary subjects. - - (b) Mandatory access control (MAC): - - The system as a whole may have one or more sets of rules that get - applied to all subjects and objects, regardless of their source. - SELinux and Smack are examples of this. - - In the case of SELinux and Smack, each object is given a label as part - of its credentials. When an action is requested, they take the - subject label, the object label and the action and look for a rule - that says that this action is either granted or denied. - - -==================== -TYPES OF CREDENTIALS -==================== - -The Linux kernel supports the following types of credentials: - - (1) Traditional UNIX credentials. - - Real User ID - Real Group ID - - The UID and GID are carried by most, if not all, Linux objects, even if in - some cases it has to be invented (FAT or CIFS files for example, which are - derived from Windows). These (mostly) define the objective context of - that object, with tasks being slightly different in some cases. - - Effective, Saved and FS User ID - Effective, Saved and FS Group ID - Supplementary groups - - These are additional credentials used by tasks only. Usually, an - EUID/EGID/GROUPS will be used as the subjective context, and real UID/GID - will be used as the objective. For tasks, it should be noted that this is - not always true. - - (2) Capabilities. - - Set of permitted capabilities - Set of inheritable capabilities - Set of effective capabilities - Capability bounding set - - These are only carried by tasks. They indicate superior capabilities - granted piecemeal to a task that an ordinary task wouldn't otherwise have. - These are manipulated implicitly by changes to the traditional UNIX - credentials, but can also be manipulated directly by the capset() system - call. - - The permitted capabilities are those caps that the process might grant - itself to its effective or permitted sets through capset(). This - inheritable set might also be so constrained. - - The effective capabilities are the ones that a task is actually allowed to - make use of itself. - - The inheritable capabilities are the ones that may get passed across - execve(). - - The bounding set limits the capabilities that may be inherited across - execve(), especially when a binary is executed that will execute as UID 0. - - (3) Secure management flags (securebits). - - These are only carried by tasks. These govern the way the above - credentials are manipulated and inherited over certain operations such as - execve(). They aren't used directly as objective or subjective - credentials. - - (4) Keys and keyrings. - - These are only carried by tasks. They carry and cache security tokens - that don't fit into the other standard UNIX credentials. They are for - making such things as network filesystem keys available to the file - accesses performed by processes, without the necessity of ordinary - programs having to know about security details involved. - - Keyrings are a special type of key. They carry sets of other keys and can - be searched for the desired key. Each process may subscribe to a number - of keyrings: - - Per-thread keying - Per-process keyring - Per-session keyring - - When a process accesses a key, if not already present, it will normally be - cached on one of these keyrings for future accesses to find. - - For more information on using keys, see Documentation/security/keys.txt. - - (5) LSM - - The Linux Security Module allows extra controls to be placed over the - operations that a task may do. Currently Linux supports several LSM - options. - - Some work by labelling the objects in a system and then applying sets of - rules (policies) that say what operations a task with one label may do to - an object with another label. - - (6) AF_KEY - - This is a socket-based approach to credential management for networking - stacks [RFC 2367]. It isn't discussed by this document as it doesn't - interact directly with task and file credentials; rather it keeps system - level credentials. - - -When a file is opened, part of the opening task's subjective context is -recorded in the file struct created. This allows operations using that file -struct to use those credentials instead of the subjective context of the task -that issued the operation. An example of this would be a file opened on a -network filesystem where the credentials of the opened file should be presented -to the server, regardless of who is actually doing a read or a write upon it. - - -============= -FILE MARKINGS -============= - -Files on disk or obtained over the network may have annotations that form the -objective security context of that file. Depending on the type of filesystem, -this may include one or more of the following: - - (*) UNIX UID, GID, mode; - - (*) Windows user ID; - - (*) Access control list; - - (*) LSM security label; - - (*) UNIX exec privilege escalation bits (SUID/SGID); - - (*) File capabilities exec privilege escalation bits. - -These are compared to the task's subjective security context, and certain -operations allowed or disallowed as a result. In the case of execve(), the -privilege escalation bits come into play, and may allow the resulting process -extra privileges, based on the annotations on the executable file. - - -================ -TASK CREDENTIALS -================ - -In Linux, all of a task's credentials are held in (uid, gid) or through -(groups, keys, LSM security) a refcounted structure of type 'struct cred'. -Each task points to its credentials by a pointer called 'cred' in its -task_struct. - -Once a set of credentials has been prepared and committed, it may not be -changed, barring the following exceptions: - - (1) its reference count may be changed; - - (2) the reference count on the group_info struct it points to may be changed; - - (3) the reference count on the security data it points to may be changed; - - (4) the reference count on any keyrings it points to may be changed; - - (5) any keyrings it points to may be revoked, expired or have their security - attributes changed; and - - (6) the contents of any keyrings to which it points may be changed (the whole - point of keyrings being a shared set of credentials, modifiable by anyone - with appropriate access). - -To alter anything in the cred struct, the copy-and-replace principle must be -adhered to. First take a copy, then alter the copy and then use RCU to change -the task pointer to make it point to the new copy. There are wrappers to aid -with this (see below). - -A task may only alter its _own_ credentials; it is no longer permitted for a -task to alter another's credentials. This means the capset() system call is no -longer permitted to take any PID other than the one of the current process. -Also keyctl_instantiate() and keyctl_negate() functions no longer permit -attachment to process-specific keyrings in the requesting process as the -instantiating process may need to create them. - - -IMMUTABLE CREDENTIALS ---------------------- - -Once a set of credentials has been made public (by calling commit_creds() for -example), it must be considered immutable, barring two exceptions: - - (1) The reference count may be altered. - - (2) Whilst the keyring subscriptions of a set of credentials may not be - changed, the keyrings subscribed to may have their contents altered. - -To catch accidental credential alteration at compile time, struct task_struct -has _const_ pointers to its credential sets, as does struct file. Furthermore, -certain functions such as get_cred() and put_cred() operate on const pointers, -thus rendering casts unnecessary, but require to temporarily ditch the const -qualification to be able to alter the reference count. - - -ACCESSING TASK CREDENTIALS --------------------------- - -A task being able to alter only its own credentials permits the current process -to read or replace its own credentials without the need for any form of locking -- which simplifies things greatly. It can just call: - - const struct cred *current_cred() - -to get a pointer to its credentials structure, and it doesn't have to release -it afterwards. - -There are convenience wrappers for retrieving specific aspects of a task's -credentials (the value is simply returned in each case): - - uid_t current_uid(void) Current's real UID - gid_t current_gid(void) Current's real GID - uid_t current_euid(void) Current's effective UID - gid_t current_egid(void) Current's effective GID - uid_t current_fsuid(void) Current's file access UID - gid_t current_fsgid(void) Current's file access GID - kernel_cap_t current_cap(void) Current's effective capabilities - void *current_security(void) Current's LSM security pointer - struct user_struct *current_user(void) Current's user account - -There are also convenience wrappers for retrieving specific associated pairs of -a task's credentials: - - void current_uid_gid(uid_t *, gid_t *); - void current_euid_egid(uid_t *, gid_t *); - void current_fsuid_fsgid(uid_t *, gid_t *); - -which return these pairs of values through their arguments after retrieving -them from the current task's credentials. - - -In addition, there is a function for obtaining a reference on the current -process's current set of credentials: - - const struct cred *get_current_cred(void); - -and functions for getting references to one of the credentials that don't -actually live in struct cred: - - struct user_struct *get_current_user(void); - struct group_info *get_current_groups(void); - -which get references to the current process's user accounting structure and -supplementary groups list respectively. - -Once a reference has been obtained, it must be released with put_cred(), -free_uid() or put_group_info() as appropriate. - - -ACCESSING ANOTHER TASK'S CREDENTIALS ------------------------------------- - -Whilst a task may access its own credentials without the need for locking, the -same is not true of a task wanting to access another task's credentials. It -must use the RCU read lock and rcu_dereference(). - -The rcu_dereference() is wrapped by: - - const struct cred *__task_cred(struct task_struct *task); - -This should be used inside the RCU read lock, as in the following example: - - void foo(struct task_struct *t, struct foo_data *f) - { - const struct cred *tcred; - ... - rcu_read_lock(); - tcred = __task_cred(t); - f->uid = tcred->uid; - f->gid = tcred->gid; - f->groups = get_group_info(tcred->groups); - rcu_read_unlock(); - ... - } - -Should it be necessary to hold another task's credentials for a long period of -time, and possibly to sleep whilst doing so, then the caller should get a -reference on them using: - - const struct cred *get_task_cred(struct task_struct *task); - -This does all the RCU magic inside of it. The caller must call put_cred() on -the credentials so obtained when they're finished with. - - [*] Note: The result of __task_cred() should not be passed directly to - get_cred() as this may race with commit_cred(). - -There are a couple of convenience functions to access bits of another task's -credentials, hiding the RCU magic from the caller: - - uid_t task_uid(task) Task's real UID - uid_t task_euid(task) Task's effective UID - -If the caller is holding the RCU read lock at the time anyway, then: - - __task_cred(task)->uid - __task_cred(task)->euid - -should be used instead. Similarly, if multiple aspects of a task's credentials -need to be accessed, RCU read lock should be used, __task_cred() called, the -result stored in a temporary pointer and then the credential aspects called -from that before dropping the lock. This prevents the potentially expensive -RCU magic from being invoked multiple times. - -Should some other single aspect of another task's credentials need to be -accessed, then this can be used: - - task_cred_xxx(task, member) - -where 'member' is a non-pointer member of the cred struct. For instance: - - uid_t task_cred_xxx(task, suid); - -will retrieve 'struct cred::suid' from the task, doing the appropriate RCU -magic. This may not be used for pointer members as what they point to may -disappear the moment the RCU read lock is dropped. - - -ALTERING CREDENTIALS --------------------- - -As previously mentioned, a task may only alter its own credentials, and may not -alter those of another task. This means that it doesn't need to use any -locking to alter its own credentials. - -To alter the current process's credentials, a function should first prepare a -new set of credentials by calling: - - struct cred *prepare_creds(void); - -this locks current->cred_replace_mutex and then allocates and constructs a -duplicate of the current process's credentials, returning with the mutex still -held if successful. It returns NULL if not successful (out of memory). - -The mutex prevents ptrace() from altering the ptrace state of a process whilst -security checks on credentials construction and changing is taking place as -the ptrace state may alter the outcome, particularly in the case of execve(). - -The new credentials set should be altered appropriately, and any security -checks and hooks done. Both the current and the proposed sets of credentials -are available for this purpose as current_cred() will return the current set -still at this point. - - -When the credential set is ready, it should be committed to the current process -by calling: - - int commit_creds(struct cred *new); - -This will alter various aspects of the credentials and the process, giving the -LSM a chance to do likewise, then it will use rcu_assign_pointer() to actually -commit the new credentials to current->cred, it will release -current->cred_replace_mutex to allow ptrace() to take place, and it will notify -the scheduler and others of the changes. - -This function is guaranteed to return 0, so that it can be tail-called at the -end of such functions as sys_setresuid(). - -Note that this function consumes the caller's reference to the new credentials. -The caller should _not_ call put_cred() on the new credentials afterwards. - -Furthermore, once this function has been called on a new set of credentials, -those credentials may _not_ be changed further. - - -Should the security checks fail or some other error occur after prepare_creds() -has been called, then the following function should be invoked: - - void abort_creds(struct cred *new); - -This releases the lock on current->cred_replace_mutex that prepare_creds() got -and then releases the new credentials. - - -A typical credentials alteration function would look something like this: - - int alter_suid(uid_t suid) - { - struct cred *new; - int ret; - - new = prepare_creds(); - if (!new) - return -ENOMEM; - - new->suid = suid; - ret = security_alter_suid(new); - if (ret < 0) { - abort_creds(new); - return ret; - } - - return commit_creds(new); - } - - -MANAGING CREDENTIALS --------------------- - -There are some functions to help manage credentials: - - (*) void put_cred(const struct cred *cred); - - This releases a reference to the given set of credentials. If the - reference count reaches zero, the credentials will be scheduled for - destruction by the RCU system. - - (*) const struct cred *get_cred(const struct cred *cred); - - This gets a reference on a live set of credentials, returning a pointer to - that set of credentials. - - (*) struct cred *get_new_cred(struct cred *cred); - - This gets a reference on a set of credentials that is under construction - and is thus still mutable, returning a pointer to that set of credentials. - - -===================== -OPEN FILE CREDENTIALS -===================== - -When a new file is opened, a reference is obtained on the opening task's -credentials and this is attached to the file struct as 'f_cred' in place of -'f_uid' and 'f_gid'. Code that used to access file->f_uid and file->f_gid -should now access file->f_cred->fsuid and file->f_cred->fsgid. - -It is safe to access f_cred without the use of RCU or locking because the -pointer will not change over the lifetime of the file struct, and nor will the -contents of the cred struct pointed to, barring the exceptions listed above -(see the Task Credentials section). - - -======================================= -OVERRIDING THE VFS'S USE OF CREDENTIALS -======================================= - -Under some circumstances it is desirable to override the credentials used by -the VFS, and that can be done by calling into such as vfs_mkdir() with a -different set of credentials. This is done in the following places: - - (*) sys_faccessat(). - - (*) do_coredump(). - - (*) nfs4recover.c. diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst index 07335659ce8d..415be8e0b013 100644 --- a/Documentation/security/index.rst +++ b/Documentation/security/index.rst @@ -5,5 +5,6 @@ Security Documentation .. toctree:: :maxdepth: 1 + credentials IMA-templates tpm/index diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..c728d515e5e2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -1,4 +1,4 @@ -/* Credentials management - see Documentation/security/credentials.txt +/* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) diff --git a/kernel/cred.c b/kernel/cred.c index 2bc66075740f..ecf03657e71c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/security/credentials.txt +/* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.3-14-g43fede From 719f6a7040f1bdaf96fcc709d272548facb88e90 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 20 Apr 2017 10:52:31 +0200 Subject: printk: Use the main logbuf in NMI when logbuf_lock is available The commit 42a0bb3f71383b457a7d ("printk/nmi: generic solution for safe printk in NMI") caused that printk stores messages into a temporary buffer in NMI context. The buffer is per-CPU and therefore the size is rather limited. It works quite well for NMI backtraces. But there are longer logs that might get printed in NMI context, for example, lockdep warnings, ftrace_dump_on_oops. The temporary buffer is used to avoid deadlocks caused by logbuf_lock. Also it is needed to avoid races with the other temporary buffer that is used when PRINTK_SAFE_CONTEXT is entered. But the main buffer can be used in NMI if the lock is available and we did not interrupt PRINTK_SAFE_CONTEXT. The lock is checked using raw_spin_is_locked(). It might cause false negatives when the lock is taken on another CPU and this CPU is in the safe context from other reasons. Note that the safe context is used also to get console semaphore or when calling console drivers. For this reason, we do the check in printk_nmi_enter(). It makes the handling consistent for the entire NMI handler and avoids reshuffling of the messages. The patch also defines special printk context that allows to use printk_deferred() in NMI. Note that we could not flush the messages to the consoles because console drivers might use many other internal locks. The newly created vprintk_deferred() disables the preemption only around the irq work handling. It is needed there to keep the consistency between the two per-CPU variables. But there is no reason to disable preemption around vprintk_emit(). Finally, the patch puts back explicit serialization of the NMI backtraces from different CPUs. It was removed by the commit a9edc88093287183ac934b ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). It was not needed because the flushing of the temporary per-CPU buffers was serialized. Link: http://lkml.kernel.org/r/1493912763-24873-1-git-send-email-pmladek@suse.com Cc: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Russell King Cc: Daniel Thompson Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Chris Metcalf Cc: x86@kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Suggested-by: Sergey Senozhatsky Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 6 ++++-- kernel/printk/printk.c | 19 ++++++++++++++----- kernel/printk/printk_safe.c | 26 ++++++++++++++++++++++++-- lib/nmi_backtrace.c | 3 +++ 4 files changed, 45 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1db044f808b7..2a7d04049af4 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -18,12 +18,14 @@ #ifdef CONFIG_PRINTK -#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff -#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff +#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 +#define PRINTK_NMI_CONTEXT_MASK 0x80000000 extern raw_spinlock_t logbuf_lock; __printf(1, 0) int vprintk_default(const char *fmt, va_list args); +__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..e5278e7d1922 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2722,16 +2722,13 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_deferred(const char *fmt, ...) +int vprintk_deferred(const char *fmt, va_list args) { - va_list args; int r; - preempt_disable(); - va_start(args, fmt); r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); - va_end(args); + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); @@ -2739,6 +2736,18 @@ int printk_deferred(const char *fmt, ...) return r; } +int printk_deferred(const char *fmt, ...) +{ + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_deferred(fmt, args); + va_end(args); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 033e50a7d706..03a42a539b20 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -308,12 +308,24 @@ static int vprintk_nmi(const char *fmt, va_list args) void printk_nmi_enter(void) { - this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + /* + * The size of the extra per-CPU buffer is limited. Use it only when + * the main one is locked. If this CPU is not in the safe context, + * the lock must be taken on another CPU and we could wait for it. + */ + if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && + raw_spin_is_locked(&logbuf_lock)) { + this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); + } else { + this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); + } } void printk_nmi_exit(void) { - this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); + this_cpu_and(printk_context, + ~(PRINTK_NMI_CONTEXT_MASK | + PRINTK_NMI_DEFERRED_CONTEXT_MASK)); } #else @@ -351,12 +363,22 @@ void __printk_safe_exit(void) __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { + /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) return vprintk_nmi(fmt, args); + /* Use extra buffer to prevent a recursion deadlock in safe mode. */ if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) return vprintk_safe(fmt, args); + /* + * Use the main logbuf when logbuf_lock is available in NMI. + * But avoid calling console drivers that might have their own locks. + */ + if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) + return vprintk_deferred(fmt, args); + + /* No obstacles. */ return vprintk_default(fmt, args); } diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 4e8a30d1c22f..0bc0a3535a8a 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -86,9 +86,11 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, bool nmi_cpu_backtrace(struct pt_regs *regs) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + arch_spin_lock(&lock); if (regs && cpu_in_idle(instruction_pointer(regs))) { pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", cpu, instruction_pointer(regs)); @@ -99,6 +101,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) else dump_stack(); } + arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.3-14-g43fede From 7e95a225901a5d2fd140f14b4302805cecc22da7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:52:01 -0400 Subject: move compat wait4 and waitid next to native variants Signed-off-by: Al Viro --- kernel/compat.c | 66 ------------------------------------------------------ kernel/exit.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..b4cdba6bbd02 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -543,72 +543,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) return 0; } -COMPAT_SYSCALL_DEFINE4(wait4, - compat_pid_t, pid, - compat_uint_t __user *, stat_addr, - int, options, - struct compat_rusage __user *, ru) -{ - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; - } -} - -COMPAT_SYSCALL_DEFINE5(waitid, - int, which, compat_pid_t, pid, - struct compat_siginfo __user *, uinfo, int, options, - struct compat_rusage __user *, uru) -{ - siginfo_t info; - struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); - - set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ - if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); - else - ret = put_compat_rusage(&ru, uru); - if (ret) - return -EFAULT; - } - - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(uinfo, &info); -} - static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, unsigned len, struct cpumask *new_mask) { diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..f98782bd27b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1735,3 +1736,71 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) } #endif + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) +{ + if (!ru) { + return sys_wait4(pid, stat_addr, options, NULL); + } else { + struct rusage r; + int ret; + unsigned int status; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_wait4(pid, + (stat_addr ? + (unsigned int __user *) &status : NULL), + options, (struct rusage __user *) &r); + set_fs (old_fs); + + if (ret > 0) { + if (put_compat_rusage(&r, ru)) + return -EFAULT; + if (stat_addr && put_user(status, stat_addr)) + return -EFAULT; + } + return ret; + } +} + +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, infop, int, options, + struct compat_rusage __user *, uru) +{ + siginfo_t info; + struct rusage ru; + long ret; + mm_segment_t old_fs = get_fs(); + + memset(&info, 0, sizeof(info)); + + set_fs(KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? (struct rusage __user *)&ru : NULL); + set_fs(old_fs); + + if ((ret < 0) || (info.si_signo == 0)) + return ret; + + if (uru) { + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); + if (ret) + return -EFAULT; + } + + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return copy_siginfo_to_user32(infop, &info); +} +#endif -- cgit v1.3-14-g43fede From ce72a16fa705f960ca2352e95a7c5f4801475e75 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:25:02 -0400 Subject: wait4(2)/waitid(2): separate copying rusage to userland New helpers: kernel_waitid() and kernel_wait4(). sys_waitid(), sys_wait4() and their compat variants switched to those. Copying struct rusage to userland is left to syscall itself. For compat_sys_wait4() that eliminates the use of set_fs() completely. For compat_sys_waitid() it's still needed (for siginfo handling); that will change shortly. Signed-off-by: Al Viro --- include/linux/resource.h | 2 +- kernel/exit.c | 89 ++++++++++++++++++++++++++++-------------------- kernel/sys.c | 16 ++++----- 3 files changed, 59 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/resource.h b/include/linux/resource.h index 5bc3116e649c..277afdad6589 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -6,7 +6,7 @@ struct task_struct; -int getrusage(struct task_struct *p, int who, struct rusage __user *ru); +void getrusage(struct task_struct *p, int who, struct rusage *ru); int do_prlimit(struct task_struct *tsk, unsigned int resource, struct rlimit *new_rlim, struct rlimit *old_rlim); diff --git a/kernel/exit.c b/kernel/exit.c index f98782bd27b6..d44f12948c5f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1003,7 +1003,7 @@ struct wait_opts { struct siginfo __user *wo_info; int __user *wo_stat; - struct rusage __user *wo_rusage; + struct rusage *wo_rusage; wait_queue_t child_wait; int notask_error; @@ -1054,8 +1054,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { struct siginfo __user *infop; - int retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + int retval = 0; + + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; @@ -1182,8 +1184,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_unlock_irq(¤t->sighand->siglock); } - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; if (!retval && wo->wo_stat) @@ -1316,8 +1319,9 @@ unlock_sig: if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; if (!retval && wo->wo_stat) retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); @@ -1377,8 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (!wo->wo_info) { - retval = wo->wo_rusage - ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + retval = 0; put_task_struct(p); if (!retval && wo->wo_stat) retval = put_user(0xffff, wo->wo_stat); @@ -1618,8 +1623,8 @@ end: return retval; } -SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, - infop, int, options, struct rusage __user *, ru) +static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1687,8 +1692,21 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return ret; } -SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, - int, options, struct rusage __user *, ru) +SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, + infop, int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL); + + if (!err) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + +static long kernel_wait4(pid_t upid, int __user *stat_addr, + int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; @@ -1724,6 +1742,19 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, return ret; } +SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, + int, options, struct rusage __user *, ru) +{ + struct rusage r; + long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); + + if (err > 0) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } + return err; +} + #ifdef __ARCH_WANT_SYS_WAITPID /* @@ -1744,29 +1775,13 @@ COMPAT_SYSCALL_DEFINE4(wait4, int, options, struct compat_rusage __user *, ru) { - if (!ru) { - return sys_wait4(pid, stat_addr, options, NULL); - } else { - struct rusage r; - int ret; - unsigned int status; - mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, - (stat_addr ? - (unsigned int __user *) &status : NULL), - options, (struct rusage __user *) &r); - set_fs (old_fs); - - if (ret > 0) { - if (put_compat_rusage(&r, ru)) - return -EFAULT; - if (stat_addr && put_user(status, stat_addr)) - return -EFAULT; - } - return ret; + struct rusage r; + long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); + if (err > 0) { + if (ru && put_compat_rusage(&r, ru)) + return -EFAULT; } + return err; } COMPAT_SYSCALL_DEFINE5(waitid, @@ -1782,8 +1797,8 @@ COMPAT_SYSCALL_DEFINE5(waitid, memset(&info, 0, sizeof(info)); set_fs(KERNEL_DS); - ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? (struct rusage __user *)&ru : NULL); + ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? &ru : NULL); set_fs(old_fs); if ((ret < 0) || (info.si_signo == 0)) diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..dab1a0658a92 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1552,7 +1552,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) r->ru_oublock += task_io_get_oublock(t); } -static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +void getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; @@ -1626,20 +1626,16 @@ out: r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } -int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) { struct rusage r; - k_getrusage(p, who, &r); - return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; -} - -SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) -{ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && who != RUSAGE_THREAD) return -EINVAL; - return getrusage(current, who, ru); + + getrusage(current, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } #ifdef CONFIG_COMPAT @@ -1651,7 +1647,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) who != RUSAGE_THREAD) return -EINVAL; - k_getrusage(current, who, &r); + getrusage(current, who, &r); return put_compat_rusage(&r, ru); } #endif -- cgit v1.3-14-g43fede From 359566faefa850504d146839d74496f0cf12d3b9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:39:39 -0400 Subject: kernel_wait4()/kernel_waitid(): delay copying status to userland Signed-off-by: Al Viro --- kernel/exit.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d44f12948c5f..94cdccf8e7e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1002,7 +1002,7 @@ struct wait_opts { struct pid *wo_pid; struct siginfo __user *wo_info; - int __user *wo_stat; + int wo_stat; struct rusage *wo_rusage; wait_queue_t child_wait; @@ -1189,8 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; - if (!retval && wo->wo_stat) - retval = put_user(status, wo->wo_stat); + wo->wo_stat = status; infop = wo->wo_info; if (!retval && infop) @@ -1322,8 +1321,7 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); retval = 0; - if (!retval && wo->wo_stat) - retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (!retval && infop) @@ -1383,12 +1381,9 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) if (!wo->wo_info) { if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; put_task_struct(p); - if (!retval && wo->wo_stat) - retval = put_user(0xffff, wo->wo_stat); - if (!retval) - retval = pid; + wo->wo_stat = 0xffff; + retval = pid; } else { retval = wait_noreap_copyout(wo, p, pid, uid, CLD_CONTINUED, SIGCONT); @@ -1662,7 +1657,6 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_pid = pid; wo.wo_flags = options; wo.wo_info = infop; - wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); @@ -1734,10 +1728,12 @@ static long kernel_wait4(pid_t upid, int __user *stat_addr, wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; - wo.wo_stat = stat_addr; + wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); + if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) + ret = -EFAULT; return ret; } -- cgit v1.3-14-g43fede From 67d7ddded322db99f451a7959d56ed6c70a6c4aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 20:53:13 -0400 Subject: waitid(2): leave copyout of siginfo to syscall itself have kernel_waitid() collect the information needed for siginfo into a small structure (waitid_info) passed to it; deal with copyout in sys_waitid()/compat_sys_waitid(). Signed-off-by: Al Viro --- kernel/exit.c | 168 ++++++++++++++++++++++------------------------------------ 1 file changed, 64 insertions(+), 104 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 94cdccf8e7e7..42f26480b3cc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -996,12 +996,19 @@ SYSCALL_DEFINE1(exit_group, int, error_code) return 0; } +struct waitid_info { + pid_t pid; + uid_t uid; + int status; + int cause; +}; + struct wait_opts { enum pid_type wo_type; int wo_flags; struct pid *wo_pid; - struct siginfo __user *wo_info; + struct waitid_info *wo_info; int wo_stat; struct rusage *wo_rusage; @@ -1053,8 +1060,7 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, pid_t pid, uid_t uid, int why, int status) { - struct siginfo __user *infop; - int retval = 0; + struct waitid_info *infop; if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); @@ -1062,22 +1068,12 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; if (infop) { - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->cause = why; + infop->pid = pid; + infop->uid = uid; + infop->status = status; } - if (!retval) - retval = pid; - return retval; + return pid; } /* @@ -1088,10 +1084,10 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - int state, retval, status; + int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); - struct siginfo __user *infop; + struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; @@ -1186,36 +1182,22 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) { - int why; - + if (infop) { if ((status & 0x7f) == 0) { - why = CLD_EXITED; - status >>= 8; + infop->cause = CLD_EXITED; + infop->status = status >> 8; } else { - why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - status &= 0x7f; + infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; } - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(status, &infop->si_status); + infop->pid = pid; + infop->uid = uid; } - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); @@ -1232,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); - return retval; + return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) @@ -1268,8 +1250,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { - struct siginfo __user *infop; - int retval, exit_code, *p_code, why; + struct waitid_info *infop; + int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; @@ -1320,28 +1302,19 @@ unlock_sig: if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - retval = 0; wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; - if (!retval && infop) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval && infop) - retval = put_user(0, &infop->si_errno); - if (!retval && infop) - retval = put_user((short)why, &infop->si_code); - if (!retval && infop) - retval = put_user(exit_code, &infop->si_status); - if (!retval && infop) - retval = put_user(pid, &infop->si_pid); - if (!retval && infop) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = pid; + if (infop) { + infop->cause = why; + infop->status = exit_code; + infop->pid = pid; + infop->uid = uid; + } put_task_struct(p); - BUG_ON(!retval); - return retval; + BUG_ON(!pid); + return pid; } /* @@ -1618,7 +1591,7 @@ end: return retval; } -static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, +static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; @@ -1660,27 +1633,8 @@ static long kernel_waitid(int which, pid_t upid, struct siginfo __user *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) { + if (ret > 0) ret = 0; - } else if (infop) { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!ret) - ret = put_user(0, &infop->si_signo); - if (!ret) - ret = put_user(0, &infop->si_errno); - if (!ret) - ret = put_user(0, &infop->si_code); - if (!ret) - ret = put_user(0, &infop->si_pid); - if (!ret) - ret = put_user(0, &infop->si_uid); - if (!ret) - ret = put_user(0, &infop->si_status); - } put_pid(pid); return ret; @@ -1690,12 +1644,24 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; - long err = kernel_waitid(which, upid, infop, options, ru ? &r : NULL); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } + if (!infop) + return err; + + if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } @@ -1785,33 +1751,27 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { - siginfo_t info; struct rusage ru; - long ret; - mm_segment_t old_fs = get_fs(); - - memset(&info, 0, sizeof(info)); + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); - set_fs(KERNEL_DS); - ret = kernel_waitid(which, pid, (siginfo_t __user *)&info, options, - uru ? &ru : NULL); - set_fs(old_fs); - - if ((ret < 0) || (info.si_signo == 0)) - return ret; - - if (uru) { - /* sys_waitid() overwrites everything in ru */ + if (!err && uru) { + /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) - ret = copy_to_user(uru, &ru, sizeof(ru)); + err = copy_to_user(uru, &ru, sizeof(ru)); else - ret = put_compat_rusage(&ru, uru); - if (ret) + err = put_compat_rusage(&ru, uru); + if (err) return -EFAULT; } - BUG_ON(info.si_code & __SI_MASK); - info.si_code |= __SI_CHLD; - return copy_siginfo_to_user32(infop, &info); + if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || + put_user(0, &infop->si_errno) || + put_user((short)info.cause, &infop->si_code) || + put_user(info.pid, &infop->si_pid) || + put_user(info.uid, &infop->si_uid) || + put_user(info.status, &infop->si_status)) + err = -EFAULT; + return err; } #endif -- cgit v1.3-14-g43fede From e61a250229fbf0f003e93676bf4d8a555a8c9eec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:25:03 -0400 Subject: lift getrusage() from wait_noreap_copyout() Signed-off-by: Al Viro --- kernel/exit.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 42f26480b3cc..d4f5097da85a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1062,9 +1062,6 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, { struct waitid_info *infop; - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); - put_task_struct(p); infop = wo->wo_info; if (infop) { @@ -1099,6 +1096,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; @@ -1296,12 +1295,12 @@ unlock_sig: why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; @@ -1350,10 +1349,10 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); + if (wo->wo_rusage) + getrusage(p, RUSAGE_BOTH, wo->wo_rusage); if (!wo->wo_info) { - if (wo->wo_rusage) - getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); wo->wo_stat = 0xffff; retval = pid; -- cgit v1.3-14-g43fede From bb380ec33a7d8ee048e722889627869d21a5d527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:33:21 -0400 Subject: kill wait_noreap_copyout() folds into callers Signed-off-by: Al Viro --- kernel/exit.c | 65 +++++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d4f5097da85a..f01ebaab978a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1057,22 +1057,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) return 1; } -static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, - pid_t pid, uid_t uid, int why, int status) -{ - struct waitid_info *infop; - - put_task_struct(p); - infop = wo->wo_info; - if (infop) { - infop->cause = why; - infop->pid = pid; - infop->uid = uid; - infop->status = status; - } - return pid; -} - /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold @@ -1091,22 +1075,27 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if ((exit_code & 0x7f) == 0) { - why = CLD_EXITED; - status = exit_code >> 8; - } else { - why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - status = exit_code & 0x7f; + infop = wo->wo_info; + if (infop) { + if ((exit_code & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = exit_code >> 8; + } else { + infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = exit_code & 0x7f; + } + infop->pid = pid; + infop->uid = uid; } - return wait_noreap_copyout(wo, p, pid, uid, why, status); + return pid; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. @@ -1297,11 +1286,10 @@ unlock_sig: sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (unlikely(wo->wo_flags & WNOWAIT)) - return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); - - wo->wo_stat = (exit_code << 8) | 0x7f; + if (likely(!(wo->wo_flags & WNOWAIT))) + wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { @@ -1310,9 +1298,6 @@ unlock_sig: infop->pid = pid; infop->uid = uid; } - put_task_struct(p); - - BUG_ON(!pid); return pid; } @@ -1324,7 +1309,7 @@ unlock_sig: */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { - int retval; + struct waitid_info *infop; pid_t pid; uid_t uid; @@ -1351,18 +1336,18 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); + put_task_struct(p); - if (!wo->wo_info) { - put_task_struct(p); + infop = wo->wo_info; + if (!infop) { wo->wo_stat = 0xffff; - retval = pid; } else { - retval = wait_noreap_copyout(wo, p, pid, uid, - CLD_CONTINUED, SIGCONT); - BUG_ON(retval == 0); + infop->cause = CLD_CONTINUED; + infop->pid = pid; + infop->uid = uid; + infop->status = SIGCONT; } - - return retval; + return pid; } /* -- cgit v1.3-14-g43fede From 76d9871e1122aabc086e7aade5251b1e5124cbb9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 21:38:26 -0400 Subject: wait_task_zombie: consolidate info logics Signed-off-by: Al Viro --- kernel/exit.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f01ebaab978a..97db9ee03f90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1074,28 +1074,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - int exit_code = p->exit_code; - + status = p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); - - infop = wo->wo_info; - if (infop) { - if ((exit_code & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = exit_code >> 8; - } else { - infop->cause = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = exit_code & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - return pid; + goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. @@ -1174,19 +1160,6 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; - infop = wo->wo_info; - if (infop) { - if ((status & 0x7f) == 0) { - infop->cause = CLD_EXITED; - infop->status = status >> 8; - } else { - infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; - infop->status = status & 0x7f; - } - infop->pid = pid; - infop->uid = uid; - } - if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ @@ -1202,6 +1175,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (state == EXIT_DEAD) release_task(p); +out_info: + infop = wo->wo_info; + if (infop) { + if ((status & 0x7f) == 0) { + infop->cause = CLD_EXITED; + infop->status = status >> 8; + } else { + infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; + infop->status = status & 0x7f; + } + infop->pid = pid; + infop->uid = uid; + } + return pid; } -- cgit v1.3-14-g43fede From 4c48abe91be03d191d0c20cc755877da2cb35622 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 19:27:32 -0400 Subject: waitid(): switch copyout of siginfo to unsafe_put_user() Signed-off-by: Al Viro --- kernel/exit.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 97db9ee03f90..f3b8c3a87bc1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1625,15 +1625,18 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; - + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } static long kernel_wait4(pid_t upid, int __user *stat_addr, @@ -1736,13 +1739,20 @@ COMPAT_SYSCALL_DEFINE5(waitid, return -EFAULT; } - if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) || - put_user(0, &infop->si_errno) || - put_user((short)info.cause, &infop->si_code) || - put_user(info.pid, &infop->si_pid) || - put_user(info.uid, &infop->si_uid) || - put_user(info.status, &infop->si_status)) - err = -EFAULT; + if (!infop) + return err; + + user_access_begin(); + unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(0, &infop->si_errno, Efault); + unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.pid, &infop->si_pid, Efault); + unsafe_put_user(info.uid, &infop->si_uid, Efault); + unsafe_put_user(info.status, &infop->si_status, Efault); + user_access_end(); return err; +Efault: + user_access_end(); + return -EFAULT; } #endif -- cgit v1.3-14-g43fede From 92ebce5ac55dba258c608248dddf59eca3f7f514 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 May 2017 23:54:33 -0400 Subject: osf_wait4: switch to kernel_wait4() ... and sanitize copying rusage to userland Signed-off-by: Al Viro --- arch/alpha/kernel/osf_sys.c | 53 ++++++++++++--------------------------------- include/linux/sched/task.h | 2 ++ kernel/exit.c | 4 ++-- 3 files changed, 18 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ce93124a850b..b23d6fbbb225 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1183,48 +1183,23 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) SYSCALL_DEFINE4(osf_wait4, pid_t, pid, int __user *, ustatus, int, options, struct rusage32 __user *, ur) { - struct rusage r; - long ret, err; unsigned int status = 0; - mm_segment_t old_fs; - + struct rusage r; + long err = kernel_wait4(pid, &status, options, &r); + if (err <= 0) + return err; + if (put_user(status, ustatus)) + return -EFAULT; if (!ur) - return sys_wait4(pid, ustatus, options, NULL); - - old_fs = get_fs(); - - set_fs (KERNEL_DS); - ret = sys_wait4(pid, (unsigned int __user *) &status, options, - (struct rusage __user *) &r); - set_fs (old_fs); - - if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur))) + return err; + if (put_tv32(&ur->ru_utime, &r.ru_utime)) return -EFAULT; - - err = put_user(status, ustatus); - if (ret < 0) - return err ? err : ret; - - err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec); - err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec); - err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec); - err |= __put_user(r.ru_stime.tv_usec, &ur->ru_stime.tv_usec); - err |= __put_user(r.ru_maxrss, &ur->ru_maxrss); - err |= __put_user(r.ru_ixrss, &ur->ru_ixrss); - err |= __put_user(r.ru_idrss, &ur->ru_idrss); - err |= __put_user(r.ru_isrss, &ur->ru_isrss); - err |= __put_user(r.ru_minflt, &ur->ru_minflt); - err |= __put_user(r.ru_majflt, &ur->ru_majflt); - err |= __put_user(r.ru_nswap, &ur->ru_nswap); - err |= __put_user(r.ru_inblock, &ur->ru_inblock); - err |= __put_user(r.ru_oublock, &ur->ru_oublock); - err |= __put_user(r.ru_msgsnd, &ur->ru_msgsnd); - err |= __put_user(r.ru_msgrcv, &ur->ru_msgrcv); - err |= __put_user(r.ru_nsignals, &ur->ru_nsignals); - err |= __put_user(r.ru_nvcsw, &ur->ru_nvcsw); - err |= __put_user(r.ru_nivcsw, &ur->ru_nivcsw); - - return err ? err : ret; + if (put_tv32(&ur->ru_stime, &r.ru_stime)) + return -EFAULT; + if (copy_to_user(&ur->ru_maxrss, &r.ru_maxrss, + sizeof(struct rusage32) - offsetof(struct rusage32, ru_maxrss))) + return -EFAULT; + return err; } /* diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a978d7189cfd..6b830fd9d809 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -9,6 +9,7 @@ #include struct task_struct; +struct rusage; union thread_union; /* @@ -74,6 +75,7 @@ extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); +extern long kernel_wait4(pid_t, int *, int, struct rusage *); extern void free_task(struct task_struct *tsk); diff --git a/kernel/exit.c b/kernel/exit.c index f3b8c3a87bc1..462fc25eec6e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,8 +1639,8 @@ Efault: return -EFAULT; } -static long kernel_wait4(pid_t upid, int __user *stat_addr, - int options, struct rusage *ru) +long kernel_wait4(pid_t upid, int __user *stat_addr, int options, + struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; -- cgit v1.3-14-g43fede From 6f9a22bc5775d231ab8fbe2c2f3c88e45e3e7c28 Mon Sep 17 00:00:00 2001 From: Michael Hernandez Date: Thu, 18 May 2017 10:47:47 -0700 Subject: PCI/MSI: Ignore affinity if pre/post vector count is more than min_vecs min_vecs is the minimum amount of vectors needed to operate in MSI-X mode which may just include the vectors that don't need affinity. Disabling affinity settings causes the qla2xxx driver scsi_add_host() to fail when blk_mq is enabled as the blk_mq_pci_map_queues() expects affinity masks on each vector. Fixes: dfef358bd1be ("PCI/MSI: Don't apply affinity if there aren't enough vectors left") Signed-off-by: Michael Hernandez Signed-off-by: Himanshu Madhani Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org # v4.10+ --- drivers/pci/msi.c | 14 ++------------ include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 13 ++++++++++++- 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index ba44fdfda66b..9e1569107cd6 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1058,7 +1058,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, for (;;) { if (affd) { - nvec = irq_calc_affinity_vectors(nvec, affd); + nvec = irq_calc_affinity_vectors(minvec, nvec, affd); if (nvec < minvec) return -ENOSPC; } @@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, for (;;) { if (affd) { - nvec = irq_calc_affinity_vectors(nvec, affd); + nvec = irq_calc_affinity_vectors(minvec, nvec, affd); if (nvec < minvec) return -ENOSPC; } @@ -1165,16 +1165,6 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, if (flags & PCI_IRQ_AFFINITY) { if (!affd) affd = &msi_default_affd; - - if (affd->pre_vectors + affd->post_vectors > min_vecs) - return -EINVAL; - - /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. - */ - if (affd->pre_vectors + affd->post_vectors == min_vecs) - affd = NULL; } else { if (WARN_ON(affd)) affd = NULL; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a6fba4804672..0991f973f8ca 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -291,7 +291,7 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ @@ -331,7 +331,7 @@ irq_create_affinity_masks(int nvec, const struct irq_affinity *affd) } static inline int -irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { return maxvec; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..9b71406d2eec 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -66,6 +66,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) struct cpumask *masks; cpumask_var_t nmsk; + /* + * If there aren't any vectors left after applying the pre/post + * vectors don't bother with assigning affinity. + */ + if (!affv) + return NULL; + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -140,15 +147,19 @@ out: /** * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @minvec: The minimum number of vectors available * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) +int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; int cpus; + if (resv > minvec) + return 0; + /* Stabilize the cpumasks */ get_online_cpus(); cpus = cpumask_weight(cpu_online_mask); -- cgit v1.3-14-g43fede From fe17a42e704a64477b15bb2cf8366fe3e5119aff Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:35 +0100 Subject: irqdomain: Let irq_domain_mapping display hierarchical domains Hierarchical domains seem to be hard to grasp, and a number of aspiring kernel hackers find them utterly discombobulating. In order to ease their pain, let's make them appear in /sys/kernel/debug/irq_domain_mapping, such as the following: 96 0x81808 MSI 0x (null) RADIX MSI 96+ 0x00063 GICv2m 0xffff8003ee116980 RADIX GICv2m 96+ 0x00063 GICv2 0xffff00000916bfd8 LINEAR GICv2 [output compressed to fit in a commit log] This shows that IRQ96 is implemented by a stack of three domains, the + sign indicating the stacking. Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-2-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 68 +++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31805f237396..1f6cd2cacf74 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -746,13 +746,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_find_mapping); #ifdef CONFIG_IRQ_DOMAIN_DEBUG +static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_domain *domain; + struct irq_data *data; + + domain = desc->irq_data.domain; + data = &desc->irq_data; + + while (domain) { + unsigned int irq = data->irq; + unsigned long hwirq = data->hwirq; + struct irq_chip *chip; + bool direct; + + if (data == &desc->irq_data) + seq_printf(m, "%5d ", irq); + else + seq_printf(m, "%5d+ ", irq); + seq_printf(m, "0x%05lx ", hwirq); + + chip = irq_data_get_irq_chip(data); + seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); + + seq_printf(m, data ? "0x%p " : " %p ", + irq_data_get_irq_chip_data(data)); + + seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); + direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); + seq_printf(m, "%6s%-8s ", + (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", + direct ? "(DIRECT)" : ""); + seq_printf(m, "%s\n", domain->name); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + domain = domain->parent; + data = data->parent_data; +#else + domain = NULL; +#endif + } +} + static int virq_debug_show(struct seq_file *m, void *private) { unsigned long flags; struct irq_desc *desc; struct irq_domain *domain; struct radix_tree_iter iter; - void *data, **slot; + void **slot; int i; seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", @@ -782,30 +823,7 @@ static int virq_debug_show(struct seq_file *m, void *private) continue; raw_spin_lock_irqsave(&desc->lock, flags); - domain = desc->irq_data.domain; - - if (domain) { - struct irq_chip *chip; - int hwirq = desc->irq_data.hwirq; - bool direct; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05x ", hwirq); - - chip = irq_desc_get_chip(desc); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, data ? "0x%p " : " %p ", data); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", desc->irq_data.domain->name); - } - + virq_debug_show_one(m, desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v1.3-14-g43fede From 2370c00dc7232d0c4af224e7730b4de031f3b1a0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:36 +0100 Subject: irqdomain: Let irq_domain_mapping display ACPI fwnode attributes If the system is using ACPI, there is no of_node to display. But ACPI can use a struct irqchip_fwid as a domain identifier, and it can be used to display the name contained in that structure. The output on such a system will look like this: pMSI 0 0 0 irqchip@00000000e1180000 MSI 37 0 0 irqchip@00000000e1180000 GICv2m 37 0 0 irqchip@00000000e1180000 GICv2 448 448 0 irqchip@ffff000008003000 Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-3-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f6cd2cacf74..70b9da72018b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -801,15 +801,26 @@ static int virq_debug_show(struct seq_file *m, void *private) mutex_lock(&irq_domain_mutex); list_for_each_entry(domain, &irq_domain_list, link) { struct device_node *of_node; + const char *name; + int count = 0; + of_node = irq_domain_get_of_node(domain); + if (of_node) + name = of_node_full_name(of_node); + else if (is_fwnode_irqchip(domain->fwnode)) + name = container_of(domain->fwnode, struct irqchip_fwid, + fwnode)->name; + else + name = ""; + radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) count++; seq_printf(m, "%c%-16s %6u %10u %10u %s\n", domain == irq_default_domain ? '*' : ' ', domain->name, domain->revmap_size + count, domain->revmap_size, domain->revmap_direct_max_irq, - of_node ? of_node_full_name(of_node) : ""); + name); } mutex_unlock(&irq_domain_mutex); -- cgit v1.3-14-g43fede From a97b852b4d4c2f8c50cab13c71566639f9a1a990 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 12 May 2017 12:55:37 +0100 Subject: genirq/msi: Populate the domain name if provided by the irqchip In order to ease debug, let's populate the domain name upfront, before any MSI gets requested. This allows the domain to appear in the irq_domain_mapping, and the user to easily find the expected data. Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/20170512115538.10767-4-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ddc2f5427f75..fe4d48ec5bc4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -265,13 +265,19 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + struct irq_domain *domain; + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) msi_domain_update_dom_ops(info); if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) msi_domain_update_chip_ops(info); - return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + if (domain && info->chip && info->chip->name) + domain->name = info->chip->name; + + return domain; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.3-14-g43fede From 85c617abc786d7da9e95c0b4174159864dd3f85c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 22 May 2017 12:03:49 +0300 Subject: perf/core: Remove some dead code perf_init_event() can't return NULL. If it did, the error handling is incomplete and we would crash. I have removed this confusing dead code. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/20170522090348.5g7yyld5en3yeky4@mwanda Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..0028efa0abc3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9172,7 +9172,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { - struct pmu *pmu = NULL; + struct pmu *pmu; int idx; int ret; @@ -9456,9 +9456,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } pmu = perf_init_event(event); - if (!pmu) - goto err_ns; - else if (IS_ERR(pmu)) { + if (IS_ERR(pmu)) { err = PTR_ERR(pmu); goto err_ns; } -- cgit v1.3-14-g43fede From 36cc2b9222b5106de34085c4dd8635ac67ef5cba Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 22 May 2017 12:04:18 +0300 Subject: perf/core: Fix error handling in perf_event_alloc() We don't set an error code here which means that perf_event_alloc() returns ERR_PTR(0) (in other words NULL). The callers are not expecting that and would Oops. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/20170522090418.hvs6icgpdo53wkn5@mwanda Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0028efa0abc3..f8c27d3ef3a1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9469,8 +9469,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, sizeof(unsigned long), GFP_KERNEL); - if (!event->addr_filters_offs) + if (!event->addr_filters_offs) { + err = -ENOMEM; goto err_per_task; + } /* force hw sync on the address filters */ event->addr_filters_gen = 1; -- cgit v1.3-14-g43fede From 3fc5b3b6a80b2e08a0fec0056208c5dff757e547 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 19 May 2017 15:53:31 +0800 Subject: smp: Avoid sending needless IPI in smp_call_function_many() Inter-Processor-Interrupt(IPI) is needed when a page is unmapped and the process' mm_cpumask() shows the process has ever run on other CPUs. page migration, page reclaim all need IPIs. The number of IPI needed to send to different CPUs is especially large for multi-threaded workload since mm_cpumask() is per process. For smp_call_function_many(), whenever a CPU queues a CSD to a target CPU, it will send an IPI to let the target CPU to handle the work. This isn't necessary - we need only send IPI when queueing a CSD to an empty call_single_queue. The reason: flush_smp_call_function_queue() that is called upon a CPU receiving an IPI will empty the queue and then handle all of the CSDs there. So if the target CPU's call_single_queue is not empty, we know that: i. An IPI for the target CPU has already been sent by 'previous queuers'; ii. flush_smp_call_function_queue() hasn't emptied that CPU's queue yet. Thus, it's safe for us to just queue our CSD there without sending an addtional IPI. And for the 'previous queuers', we can limit it to the first queuer. To demonstrate the effect of this patch, a multi-thread workload that spawns 80 threads to equally consume 100G memory is used. This is tested on a 2 node broadwell-EP which has 44cores/88threads and 32G memory. So after 32G memory is used up, page reclaiming starts to happen a lot. With this patch, IPI number dropped 88% and throughput increased about 15% for the above workload. Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Cc: Dave Hansen Cc: Huang Ying Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/20170519075331.GE2084@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..76d16fe3c427 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -30,6 +30,7 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; + cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; + if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); + return -ENOMEM; + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); + free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } @@ -434,6 +442,7 @@ void smp_call_function_many(const struct cpumask *mask, if (unlikely(!cpumask_weight(cfd->cpumask))) return; + cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, csd->flags |= CSD_FLAG_SYNCHRONOUS; csd->func = func; csd->info = info; - llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask); + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); if (wait) { for_each_cpu(cpu, cfd->cpumask) { -- cgit v1.3-14-g43fede From 6c8557bdb28df3ae97476c5e2aed6373cd235aab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 May 2017 12:58:25 +0200 Subject: smp, cpumask: Use non-atomic cpumask_{set,clear}_cpu() The cpumasks in smp_call_function_many() are private and not subject to concurrency, atomic bitops are pointless and expensive. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 11 +++++++++++ kernel/smp.c | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index a21b1fb9a968..4bf4479a3a80 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -293,6 +293,12 @@ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +{ + __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + + /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) @@ -303,6 +309,11 @@ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) +{ + __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); +} + /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) diff --git a/kernel/smp.c b/kernel/smp.c index 76d16fe3c427..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -436,7 +436,7 @@ void smp_call_function_many(const struct cpumask *mask, cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, cfd->cpumask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); /* Some callers race with other cpus changing the passed mask */ if (unlikely(!cpumask_weight(cfd->cpumask))) @@ -452,7 +452,7 @@ void smp_call_function_many(const struct cpumask *mask, csd->func = func; csd->info = info; if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) - cpumask_set_cpu(cpu, cfd->cpumask_ipi); + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } /* Send a message to all CPUs in the map */ -- cgit v1.3-14-g43fede From 73215849dfbf63421c1cafea5a1b6da9bb17831e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 09:39:44 +0900 Subject: sched/core: Use the new llist_for_each_entry_safe() primitive Now that we've added llist_for_each_entry_safe(), use it to simplify an open coded version of it in sched_ttwu_pending(). Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494549584-11730-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dde5d1e860f0..4a31239956da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1731,7 +1731,7 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct task_struct *p; + struct task_struct *p, *t; struct rq_flags rf; if (!llist) @@ -1740,17 +1740,8 @@ void sched_ttwu_pending(void) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - while (llist) { - int wake_flags = 0; - - p = llist_entry(llist, struct task_struct, wake_entry); - llist = llist_next(llist); - - if (p->sched_remote_wakeup) - wake_flags = WF_MIGRATED; - - ttwu_do_activate(rq, p, wake_flags, &rf); - } + llist_for_each_entry_safe(p, t, llist, wake_entry) + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); rq_unlock_irqrestore(rq, &rf); } -- cgit v1.3-14-g43fede From de16b91effdbf5aeff8346b99bcd0991a5362db9 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 10:05:43 +0900 Subject: sched/rt: Remove unnecessary condition in push_rt_task() pick_next_pushable_task(rq) has BUG_ON(rq_cpu != task_cpu(task)) when it returns a task other than NULL, which means that task_cpu(task) must be rq->cpu. So if task == next_task, then task_cpu(next_task) must be rq->cpu as well. Remove the redundant condition and make the code simpler. This way one unnecessary branch and two LOAD operations can be avoided. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Juri Lelli Reviewed-by: Daniel Bristot de Oliveira Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494551143-22219-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..c18b50094fab 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1819,7 +1819,7 @@ retry: * pushing. */ task = pick_next_pushable_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task hasn't migrated, and is still the next * eligible task, but we failed to find a run-queue -- cgit v1.3-14-g43fede From a776b968e52895a350d636e6e7fdcb3b10846fa4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 12 May 2017 10:05:59 +0900 Subject: sched/deadline: Remove unnecessary condition in push_dl_task() pick_next_pushable_dl_task(rq) has BUG_ON(rq->cpu != task_cpu(task)) when it returns a task other than NULL, which means that task_cpu(task) must be rq->cpu. So if task == next_task, then task_cpu(next_task) must be rq->cpu as well. Remove the redundant condition and make the code simpler. This way one unnecessary branch and two LOAD operations can be avoided. Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Juri Lelli Reviewed-by: Daniel Bristot de Oliveira Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1494551159-22367-1-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..df6c2912bd60 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1533,7 +1533,7 @@ retry: * then possible that next_task has migrated. */ task = pick_next_pushable_dl_task(rq); - if (task_cpu(next_task) == rq->cpu && task == next_task) { + if (task == next_task) { /* * The task is still there. We don't try * again, some other cpu will pull it when ready. -- cgit v1.3-14-g43fede From 896bbb2522587e3b8eb2a0d204d43ccc1042a00d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 9 Mar 2017 10:18:42 -0500 Subject: sched/core: Allow __sched_setscheduler() in interrupts when PI is not used When priority inheritance was added back in 2.6.18 to sched_setscheduler(), it added a path to taking an rt-mutex wait_lock, which is not IRQ safe. As PI is not a common occurrence, lockdep will likely never trigger if sched_setscheduler was called from interrupt context. A BUG_ON() was added to trigger if __sched_setscheduler() was ever called from interrupt context because there was a possibility to take the wait_lock. Today the wait_lock is irq safe, but the path to taking it in sched_setscheduler() is the same as the path to taking it from normal context. The wait_lock is taken with raw_spin_lock_irq() and released with raw_spin_unlock_irq() which will indiscriminately enable interrupts, which would be bad in interrupt context. The problem is that normalize_rt_tasks, which is called by triggering the sysrq nice-all-RT-tasks was changed to call __sched_setscheduler(), and this is done from interrupt context! Now __sched_setscheduler() takes a "pi" parameter that is used to know if the priority inheritance should be called or not. As the BUG_ON() only cares about calling the PI code, it should only bug if called from interrupt context with the "pi" parameter set to true. Reported-by: Laurent Dufour Tested-by: Laurent Dufour Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: dbc7f069b93a ("sched: Use replace normalize_task() with __sched_setscheduler()") Link: http://lkml.kernel.org/r/20170308124654.10e598f2@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a31239956da..877241e9f2b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4188,8 +4188,8 @@ static int __sched_setscheduler(struct task_struct *p, int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; - /* May grab non-irq protected spin_locks: */ - BUG_ON(in_interrupt()); + /* The pi code expects interrupts enabled */ + BUG_ON(pi && in_interrupt()); recheck: /* Double check policy once rq lock held: */ if (policy < 0) { -- cgit v1.3-14-g43fede From c249f255aab86b9b187ba319b9d2684841ac7c8d Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Mon, 15 May 2017 14:14:13 -0500 Subject: sched/rt: Minimize rq->lock contention in do_sched_rt_period_timer() With CONFIG_RT_GROUP_SCHED=y, do_sched_rt_period_timer() sequentially takes each CPU's rq->lock. On a large, busy system, the cumulative time it takes to acquire each lock can be excessive, even triggering a watchdog timeout. If rt_rq->rt_time and rt_rq->rt_nr_running are both zero, this function does nothing while holding the lock, so don't bother taking it at all. Signed-off-by: Dave Kleikamp Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a767637b-df85-912f-ba69-c90ee00a3fb6@oracle.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c18b50094fab..581d5c7a5264 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + int skip; + + /* + * When span == cpu_online_mask, taking each rq->lock + * can be time-consuming. Try to avoid it when possible. + */ + raw_spin_lock(&rt_rq->rt_runtime_lock); + skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (skip) + continue; raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { -- cgit v1.3-14-g43fede From 8655d5497735b288f8a9b458bd22e7d1bf95bb61 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 May 2017 15:13:16 +0200 Subject: sched/numa: Use down_read_trylock() for the mmap_sem A customer has reported a soft-lockup when running an intensive memory stress test, where the trace on multiple CPU's looks like this: RIP: 0010:[] [] native_queued_spin_lock_slowpath+0x10e/0x190 ... Call Trace: [] queued_spin_lock_slowpath+0x7/0xa [] change_protection_range+0x3b1/0x930 [] change_prot_numa+0x18/0x30 [] task_numa_work+0x1fe/0x310 [] task_work_run+0x72/0x90 Further investigation showed that the lock contention here is pmd_lock(). The task_numa_work() function makes sure that only one thread is let to perform the work in a single scan period (via cmpxchg), but if there's a thread with mmap_sem locked for writing for several periods, multiple threads in task_numa_work() can build up a convoy waiting for mmap_sem for read and then all get unblocked at once. This patch changes the down_read() to the trylock version, which prevents the build up. For a workload experiencing mmap_sem contention, it's probably better to postpone the NUMA balancing work anyway. This seems to have fixed the soft lockups involving pmd_lock(), which is in line with the convoy theory. Signed-off-by: Vlastimil Babka Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170515131316.21909-1-vbabka@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 219fe58e3023..47a0c552c77b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2470,7 +2470,8 @@ void task_numa_work(struct callback_head *work) return; - down_read(&mm->mmap_sem); + if (!down_read_trylock(&mm->mmap_sem)) + return; vma = find_vma(mm, start); if (!vma) { reset_ptenuma_scan(p); -- cgit v1.3-14-g43fede From b4def42724594cd399cfee365221f5b38639711d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:43 +0200 Subject: async: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in async_run_entry_fn() and async_synchronize_cookie_domain() to handle the extra states. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170516184735.865155020@linutronix.de Signed-off-by: Ingo Molnar --- kernel/async.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) ktime_t uninitialized_var(calltime), delta, rettime; /* 1) run (and print duration) */ - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); } entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain { ktime_t uninitialized_var(starttime), delta, endtime; - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state == SYSTEM_BOOTING) { + if (initcall_debug && system_state < SYSTEM_RUNNING) { endtime = ktime_get(); delta = ktime_sub(endtime, starttime); -- cgit v1.3-14-g43fede From 0594729c24d846889408a07057b5cc9e8d931419 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:44 +0200 Subject: extable: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in core_kernel_text() to handle the extra states, i.e. to cover init text up to the point where the system switches to state RUNNING. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170516184735.949992741@linutronix.de Signed-off-by: Ingo Molnar --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..0fbdd8582f08 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -75,7 +75,7 @@ int core_kernel_text(unsigned long addr) addr < (unsigned long)_etext) return 1; - if (system_state == SYSTEM_BOOTING && + if (system_state < SYSTEM_RUNNING && init_kernel_text(addr)) return 1; return 0; -- cgit v1.3-14-g43fede From ff48cd26fc4889b9deb5f9333d3c61746e450b7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:45 +0200 Subject: printk: Adjust system_state checks To enable smp_processor_id() and might_sleep() debug checks earlier, it's required to add system states between SYSTEM_BOOTING and SYSTEM_RUNNING. Adjust the system_state check in boot_delay_msec() to handle the extra states. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170516184736.027534895@linutronix.de Signed-off-by: Ingo Molnar --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1aecf44ab07..32fac391ac2a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1176,7 +1176,7 @@ static void boot_delay_msec(int level) unsigned long long k; unsigned long timeout; - if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) + if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress_message_printing(level)) { return; } -- cgit v1.3-14-g43fede From 1c3c5eab171590f86edd8d31389d61dd1efe3037 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 May 2017 20:42:48 +0200 Subject: sched/core: Enable might_sleep() and smp_processor_id() checks early might_sleep() and smp_processor_id() checks are enabled after the boot process is done. That hides bugs in the SMP bringup and driver initialization code. Enable it right when the scheduler starts working, i.e. when init task and kthreadd have been created and right before the idle task enables preemption. Tested-by: Mark Rutland Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170516184736.272225698@linutronix.de Signed-off-by: Ingo Molnar --- init/main.c | 10 ++++++++++ kernel/sched/core.c | 4 +++- lib/smp_processor_id.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/main.c b/init/main.c index badae3bf08f1..df58a416dd1d 100644 --- a/init/main.c +++ b/init/main.c @@ -414,6 +414,16 @@ static noinline void __ref rest_init(void) rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PRREMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + complete(&kthreadd_done); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 877241e9f2b0..c3e50cada84d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6238,8 +6238,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 690d75b132fa..2fb007be0212 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -28,7 +28,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, /* * It is valid to assume CPU-locality during early bootup: */ - if (system_state != SYSTEM_RUNNING) + if (system_state < SYSTEM_SCHEDULING) goto out; /* -- cgit v1.3-14-g43fede From 4b3e4ed6b0d958d7fb2f160bb8ebfb4f0db19382 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 20 Apr 2017 13:07:30 -0400 Subject: audit: unswing cap_* fields in PATH records The cap_* fields swing in and out of PATH records. If no capabilities are set, the cap_* fields are completely missing and when one of the cap_fi or cap_fp values is empty, that field is omitted. Original: type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL type=PATH msg=audit(04/20/2017 12:17:11.222:193) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fe=1 cap_fver=2 Normalize the PATH record by always printing all 4 cap_* fields. Fixed: type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=1 name=/lib64/ld-linux-x86-64.so.2 inode=787694 dev=08:03 mode=file,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:ld_so_t:s0 nametype=NORMAL cap_fp=none cap_fi=none cap_fe=0 cap_fver=0 type=PATH msg=audit(04/20/2017 13:01:31.679:201) : item=0 name=/home/sleep inode=1319469 dev=08:03 mode=file,suid,755 ouid=root ogid=root rdev=00:00 obj=system_u:object_r:bin_t:s0 nametype=NORMAL cap_fp=sys_admin cap_fi=none cap_fe=1 cap_fver=2 See: https://github.com/linux-audit/audit-kernel/issues/42 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a7c6a50477aa..b2e877100242 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1999,22 +1999,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { - kernel_cap_t *perm = &name->fcap.permitted; - kernel_cap_t *inh = &name->fcap.inheritable; - int log = 0; - - if (!cap_isclear(*perm)) { - audit_log_cap(ab, "cap_fp", perm); - log = 1; - } - if (!cap_isclear(*inh)) { - audit_log_cap(ab, "cap_fi", inh); - log = 1; - } - - if (log) - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x", + name->fcap.fE, name->fcap_ver); } static inline int audit_copy_fcaps(struct audit_names *name, -- cgit v1.3-14-g43fede From 490194269665d6d4915a4a5774f002885c5a2d8f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:26 -0700 Subject: module: Pass struct load_info into symbol checks Since we're already using values from struct load_info, just pass this pointer in directly and use what's needed as we need it. This allows us to access future fields in struct load_info too. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..ca4509b13400 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1278,12 +1278,13 @@ static u32 resolve_rel_crc(const s32 *crc) return *(u32 *)((void *)crc + *crc); } -static int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) { + Elf_Shdr *sechdrs = info->sechdrs; + unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; @@ -1326,8 +1327,7 @@ bad_version: return 0; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { const s32 *crc; @@ -1343,8 +1343,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, BUG(); } preempt_enable(); - return check_version(sechdrs, versindex, - VMLINUX_SYMBOL_STR(module_layout), mod, crc); + return check_version(info, VMLINUX_SYMBOL_STR(module_layout), + mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1358,8 +1358,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } #else -static inline int check_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, const s32 *crc) @@ -1367,8 +1366,7 @@ static inline int check_version(Elf_Shdr *sechdrs, return 1; } -static inline int check_modstruct_version(Elf_Shdr *sechdrs, - unsigned int versindex, +static inline int check_modstruct_version(const struct load_info *info, struct module *mod) { return 1; @@ -1404,7 +1402,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, if (!sym) goto unlock; - if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { + if (!check_version(info, name, mod, crc)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -2971,7 +2969,7 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + if (!check_modstruct_version(info, mod)) return ERR_PTR(-ENOEXEC); return mod; -- cgit v1.3-14-g43fede From 3e2e857f9c3a19d55ee0ba7b428b8be5008960bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 21 Apr 2017 15:35:27 -0700 Subject: module: Add module name to modinfo Accessing the mod structure (e.g. for mod->name) prior to having completed check_modstruct_version() can result in writing garbage to the error logs if the layout of the mod structure loaded from disk doesn't match the running kernel's mod structure layout. This kind of mismatch will become much more likely if a kernel is built with different randomization seed for the struct layout randomization plugin. Instead, add and use a new modinfo string for logging the module name. Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- kernel/module.c | 29 ++++++++++++++++++++++------- scripts/mod/modpost.c | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ca4509b13400..3803449ca219 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,6 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { + char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -1318,12 +1319,12 @@ static int check_version(const struct load_info *info, } /* Broken toolchain. Warn once, then let it go.. */ - pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + pr_warn_once("%s: no symbol version for %s\n", info->name, symname); return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", - mod->name, symname); + info->name, symname); return 0; } @@ -2913,9 +2914,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) info->index.vers = 0; /* Pretend no __versions section! */ else info->index.vers = find_sec(info, "__versions"); + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.info = find_sec(info, ".modinfo"); + if (!info->index.info) + info->name = "(missing .modinfo section)"; + else + info->name = get_modinfo(info, "name"); info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; - info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; } @@ -2955,14 +2962,22 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - pr_warn("No module found in object\n"); + pr_warn("%s: No module found in object\n", + info->name ?: "(missing .modinfo name field)"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; + /* + * If we didn't load the .modinfo 'name' field, fall back to + * on-disk struct mod 'name' field. + */ + if (!info->name) + info->name = mod->name; + if (info->index.sym == 0) { - pr_warn("%s: module has no symbols (stripped?)\n", mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", info->name); return ERR_PTR(-ENOEXEC); } @@ -2990,7 +3005,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { pr_err("%s: version magic '%s' should be '%s'\n", - mod->name, modmagic, vermagic); + info->name, modmagic, vermagic); return -ENOEXEC; } @@ -3270,7 +3285,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (IS_ERR(mod)) return mod; - if (blacklisted(mod->name)) + if (blacklisted(info->name)) return ERR_PTR(-EPERM); err = check_modinfo(mod, info, flags); diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 30d752a4a6a6..48397feb08fb 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2126,6 +2126,7 @@ static void add_header(struct buffer *b, struct module *mod) buf_printf(b, "#include \n"); buf_printf(b, "\n"); buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n"); + buf_printf(b, "MODULE_INFO(name, KBUILD_MODNAME);\n"); buf_printf(b, "\n"); buf_printf(b, "__visible struct module __this_module\n"); buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n"); -- cgit v1.3-14-g43fede From 45aea321678856687927c53972321ebfab77759a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 May 2017 08:52:02 +0200 Subject: sched/clock: Fix early boot preempt assumption in __set_sched_clock_stable() The more strict early boot preemption warnings found that __set_sched_clock_stable() was incorrectly assuming we'd still be running on a single CPU: BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 caller is debug_smp_processor_id+0x1c/0x1e CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2-00108-g1c3c5ea #1 Call Trace: dump_stack+0x110/0x192 check_preemption_disabled+0x10c/0x128 ? set_debug_rodata+0x25/0x25 debug_smp_processor_id+0x1c/0x1e sched_clock_init_late+0x27/0x87 [...] Fix it by disabling IRQs. Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Steven Rostedt Cc: lkp@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170524065202.v25vyu7pvba5mhpd@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 1a0d389d2f2b..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -133,12 +133,19 @@ static void __scd_stamp(struct sched_clock_data *scd) static void __set_sched_clock_stable(void) { - struct sched_clock_data *scd = this_scd(); + struct sched_clock_data *scd; + /* + * Since we're still unstable and the tick is already running, we have + * to disable IRQs in order to get a consistent scd->tick* reading. + */ + local_irq_disable(); + scd = this_scd(); /* * Attempt to make the (initial) unstable->stable transition continuous. */ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); + local_irq_enable(); printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", scd->tick_gtod, __gtod_offset, -- cgit v1.3-14-g43fede From f36776fafbaa0094390dd4e7e3e29805e0b82730 Mon Sep 17 00:00:00 2001 From: Peter Rajnoha Date: Tue, 9 May 2017 15:22:30 +0200 Subject: kobject: support passing in variables for synthetic uevents This patch makes it possible to pass additional arguments in addition to uevent action name when writing /sys/.../uevent attribute. These additional arguments are then inserted into generated synthetic uevent as additional environment variables. Before, we were not able to pass any additional uevent environment variables for synthetic uevents. This made it hard to identify such uevents properly in userspace to make proper distinction between genuine uevents originating from kernel and synthetic uevents triggered from userspace. Also, it was not possible to pass any additional information which would make it possible to optimize and change the way the synthetic uevents are processed back in userspace based on the originating environment of the triggering action in userspace. With the extra additional variables, we are able to pass through this extra information needed and also it makes it possible to synchronize with such synthetic uevents as they can be clearly identified back in userspace. The format for writing the uevent attribute is following: ACTION [UUID [KEY=VALUE ...] There's no change in how "ACTION" is recognized - it stays the same ("add", "change", "remove"). The "ACTION" is the only argument required to generate synthetic uevent, the rest of arguments, that this patch adds support for, are optional. The "UUID" is considered as transaction identifier so it's possible to use the same UUID value for one or more synthetic uevents in which case we logically group these uevents together for any userspace listeners. The "UUID" is expected to be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" format where "x" is a hex digit. The value appears in uevent as "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment variable. The "KEY=VALUE" pairs can contain alphanumeric characters only. It's possible to define zero or more more pairs - each pair is then delimited by a space character " ". Each pair appears in synthetic uevents as "SYNTH_ARG_KEY=VALUE" environment variable. That means the KEY name gains "SYNTH_ARG_" prefix to avoid possible collisions with existing variables. To pass the "KEY=VALUE" pairs, it's also required to pass in the "UUID" part for the synthetic uevent first. If "UUID" is not passed in, the generated synthetic uevent gains "SYNTH_UUID=0" environment variable automatically so it's possible to identify this situation in userspace when reading generated uevent and so we can still make a difference between genuine and synthetic uevents. Signed-off-by: Peter Rajnoha Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-uevent | 47 ++++++++++ drivers/base/bus.c | 10 +- drivers/base/core.c | 7 +- include/linux/kobject.h | 4 +- kernel/module.c | 5 +- lib/kobject_uevent.c | 167 ++++++++++++++++++++++++++++++--- 6 files changed, 207 insertions(+), 33 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-uevent (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-uevent b/Documentation/ABI/testing/sysfs-uevent new file mode 100644 index 000000000000..d7ac99072a16 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-uevent @@ -0,0 +1,47 @@ +What: /sys/.../uevent +Date: May 2017 +KernelVersion: 4.12 +Contact: Linux kernel mailing list +Description: + Enable passing additional variables for synthetic uevents that + are generated by writing /sys/.../uevent file. + + Recognized extended format is ACTION [UUID [KEY=VALUE ...]. + + The ACTION is compulsory - it is the name of the uevent action + ("add", "change", "remove"). There is no change compared to + previous functionality here. The rest of the extended format + is optional. + + You need to pass UUID first before any KEY=VALUE pairs. + The UUID must be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + format where 'x' is a hex digit. The UUID is considered to be + a transaction identifier so it's possible to use the same UUID + value for one or more synthetic uevents in which case we + logically group these uevents together for any userspace + listeners. The UUID value appears in uevent as + "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment + variable. + + If UUID is not passed in, the generated synthetic uevent gains + "SYNTH_UUID=0" environment variable automatically. + + The KEY=VALUE pairs can contain alphanumeric characters only. + It's possible to define zero or more pairs - each pair is then + delimited by a space character ' '. Each pair appears in + synthetic uevent as "SYNTH_ARG_KEY=VALUE". That means the KEY + name gains "SYNTH_ARG_" prefix to avoid possible collisions + with existing variables. + + Example of valid sequence written to the uevent file: + + add fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed A=1 B=abc + + This generates synthetic uevent including these variables: + + ACTION=add + SYNTH_ARG_A=1 + SYNTH_ARG_B=abc + SYNTH_UUID=fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed +Users: + udev, userspace tools generating synthetic uevents diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 6470eb8088f4..f945f2f0ee06 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -648,10 +648,7 @@ static void remove_probe_files(struct bus_type *bus) static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&drv->p->kobj, action); + kobject_synth_uevent(&drv->p->kobj, buf, count); return count; } static DRIVER_ATTR_WO(uevent); @@ -868,10 +865,7 @@ static void klist_devices_put(struct klist_node *n) static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&bus->p->subsys.kobj, action); + kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); return count; } static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); diff --git a/drivers/base/core.c b/drivers/base/core.c index bbecaf9293be..6564339d7f59 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -981,12 +981,9 @@ out: static ssize_t uevent_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - enum kobject_action action; + if (kobject_synth_uevent(&dev->kobj, buf, count)) + dev_err(dev, "uevent: failed to send synthetic uevent\n"); - if (kobject_action_type(buf, count, &action) == 0) - kobject_uevent(&dev->kobj, action); - else - dev_err(dev, "uevent: unknown action-string\n"); return count; } static DEVICE_ATTR_RW(uevent); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..eeab34b0f589 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -217,11 +217,9 @@ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); +int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); -int kobject_action_type(const char *buf, size_t count, - enum kobject_action *type); - #endif /* _KOBJECT_H_ */ diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..d7eb41d772c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1202,10 +1202,7 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - enum kobject_action action; - - if (kobject_action_type(buffer, count, &action) == 0) - kobject_uevent(&mk->kobj, action); + kobject_synth_uevent(&mk->kobj, buffer, count); return count; } diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9a2b811966eb..719c155fce20 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include @@ -52,19 +54,13 @@ static const char *kobject_actions[] = { [KOBJ_OFFLINE] = "offline", }; -/** - * kobject_action_type - translate action string to numeric type - * - * @buf: buffer containing the action string, newline is ignored - * @count: length of buffer - * @type: pointer to the location to store the action type - * - * Returns 0 if the action string was recognized. - */ -int kobject_action_type(const char *buf, size_t count, - enum kobject_action *type) +static int kobject_action_type(const char *buf, size_t count, + enum kobject_action *type, + const char **args) { enum kobject_action action; + size_t count_first; + const char *args_start; int ret = -EINVAL; if (count && (buf[count-1] == '\n' || buf[count-1] == '\0')) @@ -73,11 +69,20 @@ int kobject_action_type(const char *buf, size_t count, if (!count) goto out; + args_start = strnchr(buf, count, ' '); + if (args_start) { + count_first = args_start - buf; + args_start = args_start + 1; + } else + count_first = count; + for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) { - if (strncmp(kobject_actions[action], buf, count) != 0) + if (strncmp(kobject_actions[action], buf, count_first) != 0) continue; - if (kobject_actions[action][count] != '\0') + if (kobject_actions[action][count_first] != '\0') continue; + if (args) + *args = args_start; *type = action; ret = 0; break; @@ -86,6 +91,142 @@ out: return ret; } +static const char *action_arg_word_end(const char *buf, const char *buf_end, + char delim) +{ + const char *next = buf; + + while (next <= buf_end && *next != delim) + if (!isalnum(*next++)) + return NULL; + + if (next == buf) + return NULL; + + return next; +} + +static int kobject_action_args(const char *buf, size_t count, + struct kobj_uevent_env **ret_env) +{ + struct kobj_uevent_env *env = NULL; + const char *next, *buf_end, *key; + int key_len; + int r = -EINVAL; + + if (count && (buf[count - 1] == '\n' || buf[count - 1] == '\0')) + count--; + + if (!count) + return -EINVAL; + + env = kzalloc(sizeof(*env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* first arg is UUID */ + if (count < UUID_STRING_LEN || !uuid_is_valid(buf) || + add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf)) + goto out; + + /* + * the rest are custom environment variables in KEY=VALUE + * format with ' ' delimiter between each KEY=VALUE pair + */ + next = buf + UUID_STRING_LEN; + buf_end = buf + count - 1; + + while (next <= buf_end) { + if (*next != ' ') + goto out; + + /* skip the ' ', key must follow */ + key = ++next; + if (key > buf_end) + goto out; + + buf = next; + next = action_arg_word_end(buf, buf_end, '='); + if (!next || next > buf_end || *next != '=') + goto out; + key_len = next - buf; + + /* skip the '=', value must follow */ + if (++next > buf_end) + goto out; + + buf = next; + next = action_arg_word_end(buf, buf_end, ' '); + if (!next) + goto out; + + if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s", + key_len, key, (int) (next - buf), buf)) + goto out; + } + + r = 0; +out: + if (r) + kfree(env); + else + *ret_env = env; + return r; +} + +/** + * kobject_synth_uevent - send synthetic uevent with arguments + * + * @kobj: struct kobject for which synthetic uevent is to be generated + * @buf: buffer containing action type and action args, newline is ignored + * @count: length of buffer + * + * Returns 0 if kobject_synthetic_uevent() is completed with success or the + * corresponding error when it fails. + */ +int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count) +{ + char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL }; + enum kobject_action action; + const char *action_args; + struct kobj_uevent_env *env; + const char *msg = NULL, *devpath; + int r; + + r = kobject_action_type(buf, count, &action, &action_args); + if (r) { + msg = "unknown uevent action string\n"; + goto out; + } + + if (!action_args) { + r = kobject_uevent_env(kobj, action, no_uuid_envp); + goto out; + } + + r = kobject_action_args(action_args, + count - (action_args - buf), &env); + if (r == -EINVAL) { + msg = "incorrect uevent action arguments\n"; + goto out; + } + + if (r) + goto out; + + r = kobject_uevent_env(kobj, action, env->envp); + kfree(env); +out: + if (r) { + devpath = kobject_get_path(kobj, GFP_KERNEL); + printk(KERN_WARNING "synth uevent: %s: %s", + devpath ?: "unknown device", + msg ?: "failed to send uevent"); + kfree(devpath); + } + return r; +} + #ifdef CONFIG_NET static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data) { -- cgit v1.3-14-g43fede From 9ab6055f959032258c0f83a070cd0d26ed7a8fc5 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 24 May 2017 17:55:10 -0600 Subject: kernel/locking: Fix compile error with qrwlock.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Saw these compile errors on SPARC when queued rwlock feature is enabled. CC kernel/locking/qrwlock.o kernel/locking/qrwlock.c: In function ‘queued_read_lock_slowpath’: kernel/locking/qrwlock.c:89: error: implicit declaration of function ‘arch_spin_lock’ kernel/locking/qrwlock.c:102: error: implicit declaration of function ‘arch_spin_unlock’ make[4]: *** [kernel/locking/qrwlock.o] Error 1 Include spinlock.h in qrwlock.c to fix it. Signed-off-by: Babu Moger Reviewed-by: Håkon Bugge Reviewed-by: Jane Chu Reviewed-by: Shannon Nelson Reviewed-by: Vijay Kumar Signed-off-by: David S. Miller --- kernel/locking/qrwlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* -- cgit v1.3-14-g43fede From 8f553c498e1772cccb39a114da4a498d22992758 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:12 +0200 Subject: cpu/hotplug: Provide cpus_read|write_[un]lock() The counting 'rwsem' hackery of get|put_online_cpus() is going to be replaced by percpu rwsem. Rename the functions to make it clear that it's locking and not some refcount style interface. These new functions will be used for the preparatory patches which make the code ready for the percpu rwsem conversion. Rename all instances in the cpu hotplug code while at it. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.080397752@linutronix.de --- include/linux/cpu.h | 34 +++++++++++++++++++--------------- kernel/cpu.c | 36 ++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f92081234afd..055876003914 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -99,26 +99,30 @@ static inline void cpu_maps_update_done(void) extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU -/* Stop CPUs going up and down. */ - -extern void cpu_hotplug_begin(void); -extern void cpu_hotplug_done(void); -extern void get_online_cpus(void); -extern void put_online_cpus(void); +extern void cpus_write_lock(void); +extern void cpus_write_unlock(void); +extern void cpus_read_lock(void); +extern void cpus_read_unlock(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -#else /* CONFIG_HOTPLUG_CPU */ - -static inline void cpu_hotplug_begin(void) {} -static inline void cpu_hotplug_done(void) {} -#define get_online_cpus() do { } while (0) -#define put_online_cpus() do { } while (0) -#define cpu_hotplug_disable() do { } while (0) -#define cpu_hotplug_enable() do { } while (0) -#endif /* CONFIG_HOTPLUG_CPU */ +#else /* CONFIG_HOTPLUG_CPU */ + +static inline void cpus_write_lock(void) { } +static inline void cpus_write_unlock(void) { } +static inline void cpus_read_lock(void) { } +static inline void cpus_read_unlock(void) { } +static inline void cpu_hotplug_disable(void) { } +static inline void cpu_hotplug_enable(void) { } +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* Wrappers which go away once all code is converted */ +static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } +static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } +static inline void get_online_cpus(void) { cpus_read_lock(); } +static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..d3221ae5b474 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,7 +235,7 @@ static struct { #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) -void get_online_cpus(void) +void cpus_read_lock(void) { might_sleep(); if (cpu_hotplug.active_writer == current) @@ -245,9 +245,9 @@ void get_online_cpus(void) atomic_inc(&cpu_hotplug.refcount); mutex_unlock(&cpu_hotplug.lock); } -EXPORT_SYMBOL_GPL(get_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_lock); -void put_online_cpus(void) +void cpus_read_unlock(void) { int refcount; @@ -264,7 +264,7 @@ void put_online_cpus(void) cpuhp_lock_release(); } -EXPORT_SYMBOL_GPL(put_online_cpus); +EXPORT_SYMBOL_GPL(cpus_read_unlock); /* * This ensures that the hotplug operation can begin only when the @@ -288,7 +288,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); * get_online_cpus() not an api which is called all that often. * */ -void cpu_hotplug_begin(void) +void cpus_write_lock(void) { DEFINE_WAIT(wait); @@ -306,7 +306,7 @@ void cpu_hotplug_begin(void) finish_wait(&cpu_hotplug.wq, &wait); } -void cpu_hotplug_done(void) +void cpus_write_unlock(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); @@ -773,7 +773,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (!cpu_present(cpu)) return -EINVAL; - cpu_hotplug_begin(); + cpus_write_lock(); cpuhp_tasks_frozen = tasks_frozen; @@ -811,7 +811,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, } out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -893,7 +893,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) struct task_struct *idle; int ret = 0; - cpu_hotplug_begin(); + cpus_write_lock(); if (!cpu_present(cpu)) { ret = -EINVAL; @@ -941,7 +941,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, target); out: - cpu_hotplug_done(); + cpus_write_unlock(); return ret; } @@ -1424,7 +1424,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, if (sp->multi_instance == false) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,7 +1453,7 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); @@ -1486,7 +1486,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, if (cpuhp_cb_check(state) || !name) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1522,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1544,7 +1544,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, if (!sp->multi_instance) return -EINVAL; - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !cpuhp_get_teardown_cb(state)) @@ -1565,7 +1565,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, remove: hlist_del(node); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); return 0; } @@ -1587,7 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) BUG_ON(cpuhp_cb_check(state)); - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,7 +1615,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.3-14-g43fede From 71def423fe3da0d40ad3427a4cd5f9edc53bff67 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:14 +0200 Subject: cpu/hotplug: Provide cpuhp_setup/remove_state[_nocalls]_cpuslocked() Some call sites of cpuhp_setup/remove_state[_nocalls]() are within a cpus_read locked region. cpuhp_setup/remove_state[_nocalls]() call cpus_read_lock() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide locked versions of the interfaces to avoid nested calls to cpus_read_lock(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.239600868@linutronix.de --- include/linux/cpuhotplug.h | 29 ++++++++++++++++++++++++++++ kernel/cpu.c | 47 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 65 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f2a80377520..4fac564dde70 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -153,6 +153,11 @@ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, + bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed @@ -171,6 +176,15 @@ static inline int cpuhp_setup_state(enum cpuhp_state state, return __cpuhp_setup_state(state, name, true, startup, teardown, false); } +static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, true, startup, + teardown, false); +} + /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks @@ -191,6 +205,15 @@ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, false); } +static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state_cpuslocked(state, name, false, startup, + teardown, false); +} + /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed @@ -250,6 +273,7 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, } void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown @@ -273,6 +297,11 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) +{ + __cpuhp_remove_state_cpuslocked(state, false); +} + /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed diff --git a/kernel/cpu.c b/kernel/cpu.c index d3221ae5b474..dc27c5a28153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1459,7 +1459,7 @@ unlock: EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); /** - * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state * @state: The state to setup * @invoke: If true, the startup function is invoked for cpus where * cpu state >= @state @@ -1468,25 +1468,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @multi_instance: State is set up for multiple instances which get * added afterwards. * + * The caller needs to hold cpus read locked while calling this function. * Returns: * On success: * Positive state number if @state is CPUHP_AP_ONLINE_DYN * 0 for all other states * On failure: proper (negative) error code */ -int __cpuhp_setup_state(enum cpuhp_state state, - const char *name, bool invoke, - int (*startup)(unsigned int cpu), - int (*teardown)(unsigned int cpu), - bool multi_instance) +int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) { int cpu, ret = 0; bool dynstate; + lockdep_assert_cpus_held(); + if (cpuhp_cb_check(state) || !name) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); ret = cpuhp_store_callbacks(state, name, startup, teardown, @@ -1522,7 +1524,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, } out: mutex_unlock(&cpuhp_state_mutex); - cpus_read_unlock(); /* * If the requested state is CPUHP_AP_ONLINE_DYN, return the * dynamically allocated state in case of success. @@ -1531,6 +1532,22 @@ out: return state; return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); + +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu), + bool multi_instance) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, + teardown, multi_instance); + cpus_read_unlock(); + return ret; +} EXPORT_SYMBOL(__cpuhp_setup_state); int __cpuhp_state_remove_instance(enum cpuhp_state state, @@ -1572,22 +1589,23 @@ remove: EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); /** - * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state * @state: The state to remove * @invoke: If true, the teardown function is invoked for cpus where * cpu state >= @state * + * The caller needs to hold cpus read locked while calling this function. * The teardown callback is currently not allowed to fail. Think * about module removal! */ -void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) { struct cpuhp_step *sp = cpuhp_get_step(state); int cpu; BUG_ON(cpuhp_cb_check(state)); - cpus_read_lock(); + lockdep_assert_cpus_held(); mutex_lock(&cpuhp_state_mutex); if (sp->multi_instance) { @@ -1615,6 +1633,13 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) remove: cpuhp_store_callbacks(state, NULL, NULL, NULL, false); mutex_unlock(&cpuhp_state_mutex); +} +EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + cpus_read_lock(); + __cpuhp_remove_state_cpuslocked(state, invoke); cpus_read_unlock(); } EXPORT_SYMBOL(__cpuhp_remove_state); -- cgit v1.3-14-g43fede From 9805c6733349ea3ccd22cf75b8ebaabb5290e310 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:15 +0200 Subject: cpu/hotplug: Add __cpuhp_state_add_instance_cpuslocked() Add cpuslocked() variants for the multi instance registration so this can be called from a cpus_read_lock() protected region. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.321782217@linutronix.de --- include/linux/cpuhotplug.h | 9 +++++++++ kernel/cpu.c | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4fac564dde70..df3d2719a796 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -240,6 +240,8 @@ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup @@ -272,6 +274,13 @@ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, return __cpuhp_state_add_instance(state, node, false); } +static inline int +cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, + struct hlist_node *node) +{ + return __cpuhp_state_add_instance_cpuslocked(state, node, false); +} + void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); diff --git a/kernel/cpu.c b/kernel/cpu.c index dc27c5a28153..e4389ac55b65 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1413,18 +1413,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, } } -int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, - bool invoke) +int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, + struct hlist_node *node, + bool invoke) { struct cpuhp_step *sp; int cpu; int ret; + lockdep_assert_cpus_held(); + sp = cpuhp_get_step(state); if (sp->multi_instance == false) return -EINVAL; - cpus_read_lock(); mutex_lock(&cpuhp_state_mutex); if (!invoke || !sp->startup.multi) @@ -1453,6 +1455,16 @@ add_node: hlist_add_head(node, &sp->list); unlock: mutex_unlock(&cpuhp_state_mutex); + return ret; +} + +int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, + bool invoke) +{ + int ret; + + cpus_read_lock(); + ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); cpus_read_unlock(); return ret; } -- cgit v1.3-14-g43fede From fe5595c074005bd94f0c7d1644175941149f6768 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:16 +0200 Subject: stop_machine: Provide stop_machine_cpuslocked() Some call sites of stop_machine() are within a get_online_cpus() protected region. stop_machine() calls get_online_cpus() as well, which is possible in the current implementation but prevents converting the hotplug locking to a percpu rwsem. Provide stop_machine_cpuslocked() to avoid nested calls to get_online_cpus(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081547.400700852@linutronix.de --- include/linux/stop_machine.h | 26 +++++++++++++++++++++++--- kernel/stop_machine.c | 11 +++++++---- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3cc9632dcc2a..3d60275e3ba9 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -116,15 +116,29 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * @fn() runs. * * This can be thought of as a very heavy write lock, equivalent to - * grabbing every spinlock in the kernel. */ + * grabbing every spinlock in the kernel. + * + * Protects against CPU hotplug. + */ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); +/** + * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function + * @fn: the function to run + * @data: the data ptr for the @fn() + * @cpus: the cpus to run the @fn() on (NULL = any online cpu) + * + * Same as above. Must be called from with in a cpus_read_lock() protected + * region. Avoids nested calls to cpus_read_lock(). + */ +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); + int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); #else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -static inline int stop_machine(cpu_stop_fn_t fn, void *data, - const struct cpumask *cpus) +static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { unsigned long flags; int ret; @@ -134,6 +148,12 @@ static inline int stop_machine(cpu_stop_fn_t fn, void *data, return ret; } +static inline int stop_machine(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) +{ + return stop_machine_cpuslocked(fn, data, cpus); +} + static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) +int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, + const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp .active_cpus = cpus, }; + lockdep_assert_cpus_held(); + if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) int ret; /* No CPUs can come up or down during this. */ - get_online_cpus(); - ret = __stop_machine(fn, data, cpus); - put_online_cpus(); + cpus_read_lock(); + ret = stop_machine_cpuslocked(fn, data, cpus); + cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); -- cgit v1.3-14-g43fede From 9596695ee1e7eedd743c43811fe68299eb005b5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:17 +0200 Subject: padata: Make padata_alloc() static No users outside of padata.c Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.491457256@linutronix.de --- include/linux/padata.h | 3 --- kernel/padata.c | 32 ++++++++++++++++---------------- 2 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 0f9e567d5e15..2f9c1f93b1ce 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -166,9 +166,6 @@ struct padata_instance { extern struct padata_instance *padata_alloc_possible( struct workqueue_struct *wq); -extern struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask); extern void padata_free(struct padata_instance *pinst); extern int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu); diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..0c708f648853 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -933,19 +933,6 @@ static struct kobj_type padata_attr_type = { .release = padata_sysfs_release, }; -/** - * padata_alloc_possible - Allocate and initialize padata instance. - * Use the cpu_possible_mask for serial and - * parallel workers. - * - * @wq: workqueue to use for the allocated padata instance - */ -struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) -{ - return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); -} -EXPORT_SYMBOL(padata_alloc_possible); - /** * padata_alloc - allocate and initialize a padata instance and specify * cpumasks for serial and parallel workers. @@ -954,9 +941,9 @@ EXPORT_SYMBOL(padata_alloc_possible); * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization */ -struct padata_instance *padata_alloc(struct workqueue_struct *wq, - const struct cpumask *pcpumask, - const struct cpumask *cbcpumask) +static struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; struct parallel_data *pd = NULL; @@ -1010,6 +997,19 @@ err: return NULL; } +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + /** * padata_free - free a padata instance * -- cgit v1.3-14-g43fede From c5a81c8ff816d89941fe86961b286765d6ca2f5f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:18 +0200 Subject: padata: Avoid nested calls to cpus_read_lock() in pcrypt_init_padata() pcrypt_init_padata() cpus_read_lock() padata_alloc_possible() padata_alloc() cpus_read_lock() The nested call to cpus_read_lock() works with the current implementation, but prevents the conversion to a percpu rwsem. The other caller of padata_alloc_possible() is pcrypt_init_padata() which calls from a cpus_read_lock() protected region as well. Remove the cpus_read_lock() call in padata_alloc() and document the calling convention. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Steffen Klassert Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-crypto@vger.kernel.org Link: http://lkml.kernel.org/r/20170524081547.571278910@linutronix.de --- kernel/padata.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 0c708f648853..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -940,6 +940,8 @@ static struct kobj_type padata_attr_type = { * @wq: workqueue to use for the allocated padata instance * @pcpumask: cpumask that will be used for padata parallelization * @cbcpumask: cpumask that will be used for padata serialization + * + * Must be called from a cpus_read_lock() protected region */ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, const struct cpumask *pcpumask, @@ -952,7 +954,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, if (!pinst) goto err; - get_online_cpus(); if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { @@ -976,14 +977,12 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; - put_online_cpus(); - BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); + cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); #endif return pinst; @@ -992,7 +991,6 @@ err_free_masks: free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); - put_online_cpus(); err: return NULL; } @@ -1003,9 +1001,12 @@ err: * parallel workers. * * @wq: workqueue to use for the allocated padata instance + * + * Must be called from a cpus_read_lock() protected region */ struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) { + lockdep_assert_cpus_held(); return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); } EXPORT_SYMBOL(padata_alloc_possible); -- cgit v1.3-14-g43fede From 210e21331fc3a396af640cec652be769d146e49f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 24 May 2017 10:15:28 +0200 Subject: cpu/hotplug: Use stop_machine_cpuslocked() in takedown_cpu() takedown_cpu() is a cpu hotplug function invoking stop_machine(). The cpu hotplug machinery holds the hotplug lock for write. stop_machine() invokes get_online_cpus() as well. This is correct, but prevents the conversion of the hotplug locking to a percpu rwsem. Use stop_machine_cpuslocked() to avoid the nested call. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081548.423292433@linutronix.de --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index e4389ac55b65..142d889d9f69 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -701,7 +701,7 @@ static int takedown_cpu(unsigned int cpu) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); + err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ irq_unlock_sparse(); -- cgit v1.3-14-g43fede From a63fbed776c7124ce9f606234267c3c095b2680e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:34 +0200 Subject: perf/tracing/cpuhotplug: Fix locking order perf, tracing, kprobes and jump_labels have a gazillion of ways to create dependency lock chains. Some of those involve nested invocations of get_online_cpus(). The conversion of the hotplug locking to a percpu rwsem requires to avoid such nested calls. sys_perf_event_open() protects most of the syscall logic against cpu hotplug. This causes nested calls and lock inversions versus ftrace and kprobes in various interesting ways. It's impossible to move the hotplug locking to the outer end of all call chains in the involved facilities, so the hotplug protection in sys_perf_event_open() needs to be solved differently. Introduce 'pmus_mutex' which protects a perf private online cpumask. This mutex is taken when the mask is updated in the cpu hotplug callbacks and can be taken in sys_perf_event_open() to protect the swhash setup/teardown code and when the final judgement about a valid event has to be made. [ tglx: Produced changelog and fixed the swhash interaction ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Paul E. McKenney Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Masami Hiramatsu Link: http://lkml.kernel.org/r/20170524081548.930941109@linutronix.de --- include/linux/perf_event.h | 2 + kernel/events/core.c | 106 ++++++++++++++++++++++++++++++++------------- 2 files changed, 78 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a635887f28..7d6aa29094b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,8 @@ struct perf_cpu_context { struct list_head sched_cb_entry; int sched_cb_usage; + + int online; }; struct perf_output_handle { diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..b97cda4d1777 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static cpumask_var_t perf_online_mask; /* * perf event paranoia level: @@ -3812,14 +3813,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -7703,7 +7696,8 @@ static int swevent_hlist_get_cpu(int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && + cpumask_test_cpu(cpu, perf_online_mask)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -7724,7 +7718,7 @@ static int swevent_hlist_get(void) { int err, cpu, failed_cpu; - get_online_cpus(); + mutex_lock(&pmus_lock); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(cpu); if (err) { @@ -7732,8 +7726,7 @@ static int swevent_hlist_get(void) goto fail; } } - put_online_cpus(); - + mutex_unlock(&pmus_lock); return 0; fail: for_each_possible_cpu(cpu) { @@ -7741,8 +7734,7 @@ fail: break; swevent_hlist_put_cpu(cpu); } - - put_online_cpus(); + mutex_unlock(&pmus_lock); return err; } @@ -8920,7 +8912,7 @@ perf_event_mux_interval_ms_store(struct device *dev, pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); @@ -8929,7 +8921,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpu_function_call(cpu, (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } - put_online_cpus(); + cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); return count; @@ -9059,6 +9051,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); __perf_mux_hrtimer_init(cpuctx, cpu); } @@ -9882,12 +9875,10 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - get_online_cpus(); - if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cpus; + goto err_cred; /* * Reuse ptrace permission checks for now. @@ -10073,6 +10064,23 @@ SYSCALL_DEFINE5(perf_event_open, goto err_locked; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } + } + + /* * Must be under the same ctx::mutex as perf_install_in_context(), * because we need to serialize with concurrent event creation. @@ -10162,8 +10170,6 @@ SYSCALL_DEFINE5(perf_event_open, put_task_struct(task); } - put_online_cpus(); - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -10197,8 +10203,6 @@ err_alloc: err_cred: if (task) mutex_unlock(&task->signal->cred_guard_mutex); -err_cpus: - put_online_cpus(); err_task: if (task) put_task_struct(task); @@ -10253,6 +10257,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = + container_of(ctx, struct perf_cpu_context, ctx); + if (!cpuctx->online) { + err = -ENODEV; + goto err_unlock; + } + } + if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; goto err_unlock; @@ -10920,6 +10939,8 @@ static void __init perf_event_init_all_cpus(void) struct swevent_htable *swhash; int cpu; + zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); @@ -10935,7 +10956,7 @@ static void __init perf_event_init_all_cpus(void) } } -int perf_event_init_cpu(unsigned int cpu) +void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10948,7 +10969,6 @@ int perf_event_init_cpu(unsigned int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); - return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10966,19 +10986,22 @@ static void __perf_event_exit_context(void *__info) static void perf_event_exit_cpu_context(int cpu) { + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; struct pmu *pmu; - int idx; - idx = srcu_read_lock(&pmus_srcu); - list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + mutex_lock(&pmus_lock); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; mutex_unlock(&ctx->mutex); } - srcu_read_unlock(&pmus_srcu, idx); + cpumask_clear_cpu(cpu, perf_online_mask); + mutex_unlock(&pmus_lock); } #else @@ -10986,6 +11009,29 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +int perf_event_init_cpu(unsigned int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + + perf_swevent_init_cpu(cpu); + + mutex_lock(&pmus_lock); + cpumask_set_cpu(cpu, perf_online_mask); + list_for_each_entry(pmu, &pmus, entry) { + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); + } + mutex_unlock(&pmus_lock); + + return 0; +} + int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); -- cgit v1.3-14-g43fede From f2545b2d4ce13e068897ef60ae64dffe215f4152 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:35 +0200 Subject: jump_label: Reorder hotplug lock and jump_label_lock The conversion of the hotplug locking to a percpu rwsem unearthed lock ordering issues all over the place. The jump_label code has two issues: 1) Nested get_online_cpus() invocations 2) Ordering problems vs. the cpus rwsem and the jump_label_mutex To cure these, the following lock order has been established; cpus_rwsem -> jump_label_lock -> text_mutex Even if not all architectures need protection against CPU hotplug, taking cpus_rwsem before jump_label_lock is now mandatory in code pathes which actually modify code and therefor need text_mutex protection. Move the get_online_cpus() invocations into the core jump label code and establish the proper lock order where required. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: "David S. Miller" Cc: Paul E. McKenney Cc: Chris Metcalf Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Jason Baron Cc: Ralf Baechle Link: http://lkml.kernel.org/r/20170524081549.025830817@linutronix.de --- arch/mips/kernel/jump_label.c | 2 -- arch/sparc/kernel/jump_label.c | 2 -- arch/tile/kernel/jump_label.c | 2 -- arch/x86/kernel/jump_label.c | 2 -- kernel/jump_label.c | 20 ++++++++++++++------ 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c index 3e586daa3a32..32e3168316cd 100644 --- a/arch/mips/kernel/jump_label.c +++ b/arch/mips/kernel/jump_label.c @@ -58,7 +58,6 @@ void arch_jump_label_transform(struct jump_entry *e, insn.word = 0; /* nop */ } - get_online_cpus(); mutex_lock(&text_mutex); if (IS_ENABLED(CONFIG_CPU_MICROMIPS)) { insn_p->halfword[0] = insn.word >> 16; @@ -70,7 +69,6 @@ void arch_jump_label_transform(struct jump_entry *e, (unsigned long)insn_p + sizeof(*insn_p)); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif /* HAVE_JUMP_LABEL */ diff --git a/arch/sparc/kernel/jump_label.c b/arch/sparc/kernel/jump_label.c index 07933b9e9ce0..93adde1ac166 100644 --- a/arch/sparc/kernel/jump_label.c +++ b/arch/sparc/kernel/jump_label.c @@ -41,12 +41,10 @@ void arch_jump_label_transform(struct jump_entry *entry, val = 0x01000000; } - get_online_cpus(); mutex_lock(&text_mutex); *insn = val; flushi(insn); mutex_unlock(&text_mutex); - put_online_cpus(); } #endif diff --git a/arch/tile/kernel/jump_label.c b/arch/tile/kernel/jump_label.c index 07802d586988..93931a46625b 100644 --- a/arch/tile/kernel/jump_label.c +++ b/arch/tile/kernel/jump_label.c @@ -45,14 +45,12 @@ static void __jump_label_transform(struct jump_entry *e, void arch_jump_label_transform(struct jump_entry *e, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(e, type); flush_icache_range(e->code, e->code + sizeof(tilegx_bundle_bits)); mutex_unlock(&text_mutex); - put_online_cpus(); } __init_or_module void arch_jump_label_transform_static(struct jump_entry *e, diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index c37bd0f39c70..ab4f491da2a9 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -105,11 +105,9 @@ static void __jump_label_transform(struct jump_entry *entry, void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { - get_online_cpus(); mutex_lock(&text_mutex); __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); - put_online_cpus(); } static enum { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) return; } + cpus_read_lock(); jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) atomic_inc(&key->enabled); } jump_label_unlock(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + cpus_read_lock(); /* * The negative count check is valid even when a negative * key->enabled is in use by static_key_slow_inc(); a @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); + cpus_read_unlock(); return; } @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, jump_label_update(key); } jump_label_unlock(); + cpus_read_unlock(); } static void jump_label_update_timeout(struct work_struct *work) @@ -334,6 +340,7 @@ void __init jump_label_init(void) if (static_key_initialized) return; + cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); @@ -353,6 +360,7 @@ void __init jump_label_init(void) } static_key_initialized = true; jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_MODULES @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, struct module *mod = data; int ret = 0; + cpus_read_lock(); + jump_label_lock(); + switch (val) { case MODULE_STATE_COMING: - jump_label_lock(); ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } - jump_label_unlock(); break; case MODULE_STATE_GOING: - jump_label_lock(); jump_label_del_module(mod); - jump_label_unlock(); break; case MODULE_STATE_LIVE: - jump_label_lock(); jump_label_invalidate_module_init(mod); - jump_label_unlock(); break; } + jump_label_unlock(); + cpus_read_unlock(); + return notifier_from_errno(ret); } -- cgit v1.3-14-g43fede From 2d1e38f56622b9bb5af85be63c1052c056f5c677 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:36 +0200 Subject: kprobes: Cure hotplug lock ordering issues Converting the cpu hotplug locking to a percpu rwsem unearthed hidden lock ordering problems. There is a wide range of locks involved in this: kprobe_mutex, jump_label_mutex, ftrace_lock, text_mutex, event_mutex, module_mutex, func_hash->regex_lock and a gazillion of lock order permutations with nested get_online_cpus() calls. Some of those permutations are potential deadlocks even with the current nesting hotplug locking scheme, but they can't be discovered by lockdep. The conversion of the hotplug locking to a percpu rwsem requires to prevent nested locking, so it's required to take the hotplug rwsem early in the call chain and establish a proper lock order. After quite some analysis and going down the wrong road severa times the following lock order has been chosen: kprobe_mutex -> cpus_rwsem -> jump_label_mutex -> text_mutex For kprobes which hook on an ftrace function trace point, it's required to drop cpus_rwsem before calling into the ftrace code to avoid a deadlock on the func_hash->regex_lock. [ Steven: Ftrace interaction fixes ] Signed-off-by: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Masami Hiramatsu Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/20170524081549.104864779@linutronix.de --- kernel/kprobes.c | 59 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..9f6056749a28 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); */ static void do_optimize_kprobes(void) { - /* Optimization never be done when disarmed */ - if (kprobes_all_disarmed || !kprobes_allow_optimization || - list_empty(&optimizing_list)) - return; - /* * The optimization/unoptimization refers online_cpus via * stop_machine() and cpu-hotplug modifies online_cpus. @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) * This combination can cause a deadlock (cpu-hotplug try to lock * text_mutex but stop_machine can not be done because online_cpus * has been changed) - * To avoid this deadlock, we need to call get_online_cpus() + * To avoid this deadlock, caller must have locked cpu hotplug * for preventing cpu-hotplug outside of text_mutex locking. */ - get_online_cpus(); + lockdep_assert_cpus_held(); + + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + mutex_lock(&text_mutex); arch_optimize_kprobes(&optimizing_list); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; + /* See comment in do_optimize_kprobes() */ + lockdep_assert_cpus_held(); + /* Unoptimization must be done anytime */ if (list_empty(&unoptimizing_list)) return; - /* Ditto to do_optimize_kprobes */ - get_online_cpus(); mutex_lock(&text_mutex); arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) list_del_init(&op->list); } mutex_unlock(&text_mutex); - put_online_cpus(); } /* Reclaim all kprobes on the free_list */ @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) static void kprobe_optimizer(struct work_struct *work) { mutex_lock(&kprobe_mutex); + cpus_read_lock(); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) /* Short cut to direct unoptimizing */ static void force_unoptimize_kprobe(struct optimized_kprobe *op) { - get_online_cpus(); + lockdep_assert_cpus_held(); arch_unoptimize_kprobe(op); - put_online_cpus(); if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); } @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called */ + cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); jump_label_unlock(); + cpus_read_unlock(); } #ifdef CONFIG_SYSCTL @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) if (kprobes_allow_optimization) goto out; + cpus_read_lock(); kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -833,6 +837,7 @@ static void optimize_all_kprobes(void) if (!kprobe_disabled(p)) optimize_kprobe(p); } + cpus_read_unlock(); printk(KERN_INFO "Kprobes globally optimized\n"); out: mutex_unlock(&kprobe_mutex); @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) return; } + cpus_read_lock(); kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + cpus_read_unlock(); mutex_unlock(&kprobe_mutex); /* Wait for unoptimizing completion */ @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) arm_kprobe_ftrace(kp); return; } - /* - * Here, since __arm_kprobe() doesn't use stop_machine(), - * this doesn't cause deadlock on text_mutex. So, we don't - * need get_online_cpus(). - */ + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* Disarm a kprobe with text_mutex */ @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) disarm_kprobe_ftrace(kp); return; } - /* Ditto */ + + cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); + cpus_read_unlock(); } /* @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; + cpus_read_lock(); + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); - /* - * Get online CPUs to avoid text_mutex deadlock.with stop machine, - * which is invoked by unoptimize_kprobe() in add_new_kprobe() - */ - get_online_cpus(); mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) out: mutex_unlock(&text_mutex); - put_online_cpus(); jump_label_unlock(); + cpus_read_unlock(); if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) goto out; } - mutex_lock(&text_mutex); /* Avoiding text modification */ + cpus_read_lock(); + /* Prevent text modification */ + mutex_lock(&text_mutex); ret = prepare_kprobe(p); mutex_unlock(&text_mutex); + cpus_read_unlock(); if (ret) goto out; @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) /* Try to optimize kprobe */ try_to_optimize_kprobe(p); - out: mutex_unlock(&kprobe_mutex); -- cgit v1.3-14-g43fede From fc8dffd379ca5620664336eb895a426b42847558 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:40 +0200 Subject: cpu/hotplug: Convert hotplug locking to percpu rwsem There are no more (known) nested calls to get_online_cpus() and all observed lock ordering problems have been addressed. Replace the magic nested 'rwsem' hackery with a percpu-rwsem. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.447014063@linutronix.de --- include/linux/cpu.h | 2 +- kernel/cpu.c | 107 +++++++--------------------------------------------- 2 files changed, 14 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index af4d660798e5..ca73bc1563f4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -103,7 +103,7 @@ extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); -static inline void lockdep_assert_cpus_held(void) { } +extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); diff --git a/kernel/cpu.c b/kernel/cpu.c index 142d889d9f69..66836216ebae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -196,121 +197,41 @@ void cpu_maps_update_done(void) mutex_unlock(&cpu_add_remove_lock); } -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. +/* + * If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; #ifdef CONFIG_HOTPLUG_CPU -static struct { - struct task_struct *active_writer; - /* wait queue to wake up the active_writer */ - wait_queue_head_t wq; - /* verifies that no writer will get active while readers are active */ - struct mutex lock; - /* - * Also blocks the new readers during - * an ongoing cpu hotplug operation. - */ - atomic_t refcount; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} cpu_hotplug = { - .active_writer = NULL, - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), - .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), -#endif -}; - -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire_tryread() \ - lock_map_acquire_tryread(&cpu_hotplug.dep_map) -#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) -#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) - +DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); void cpus_read_lock(void) { - might_sleep(); - if (cpu_hotplug.active_writer == current) - return; - cpuhp_lock_acquire_read(); - mutex_lock(&cpu_hotplug.lock); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); + percpu_down_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_lock); void cpus_read_unlock(void) { - int refcount; - - if (cpu_hotplug.active_writer == current) - return; - - refcount = atomic_dec_return(&cpu_hotplug.refcount); - if (WARN_ON(refcount < 0)) /* try to fix things up */ - atomic_inc(&cpu_hotplug.refcount); - - if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) - wake_up(&cpu_hotplug.wq); - - cpuhp_lock_release(); - + percpu_up_read(&cpu_hotplug_lock); } EXPORT_SYMBOL_GPL(cpus_read_unlock); -/* - * This ensures that the hotplug operation can begin only when the - * refcount goes to zero. - * - * Note that during a cpu-hotplug operation, the new readers, if any, - * will be blocked by the cpu_hotplug.lock - * - * Since cpu_hotplug_begin() is always called after invoking - * cpu_maps_update_begin(), we can be sure that only one writer is active. - * - * Note that theoretically, there is a possibility of a livelock: - * - Refcount goes to zero, last reader wakes up the sleeping - * writer. - * - Last reader unlocks the cpu_hotplug.lock. - * - A new reader arrives at this moment, bumps up the refcount. - * - The writer acquires the cpu_hotplug.lock finds the refcount - * non zero and goes to sleep again. - * - * However, this is very difficult to achieve in practice since - * get_online_cpus() not an api which is called all that often. - * - */ void cpus_write_lock(void) { - DEFINE_WAIT(wait); - - cpu_hotplug.active_writer = current; - cpuhp_lock_acquire(); - - for (;;) { - mutex_lock(&cpu_hotplug.lock); - prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); - if (likely(!atomic_read(&cpu_hotplug.refcount))) - break; - mutex_unlock(&cpu_hotplug.lock); - schedule(); - } - finish_wait(&cpu_hotplug.wq, &wait); + percpu_down_write(&cpu_hotplug_lock); } void cpus_write_unlock(void) { - cpu_hotplug.active_writer = NULL; - mutex_unlock(&cpu_hotplug.lock); - cpuhp_lock_release(); + percpu_up_write(&cpu_hotplug_lock); +} + +void lockdep_assert_cpus_held(void) +{ + percpu_rwsem_assert_held(&cpu_hotplug_lock); } /* @@ -344,8 +265,6 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ -/* Notifier wrappers for transitioning to state machine */ - static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); -- cgit v1.3-14-g43fede From 49dfe2a6779717d9c18395684ee31bdc98b22e53 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 24 May 2017 10:15:43 +0200 Subject: cpuhotplug: Link lock stacks for hotplug callbacks The CPU hotplug callbacks are not covered by lockdep versus the cpu hotplug rwsem. CPU0 CPU1 cpuhp_setup_state(STATE, startup, teardown); cpus_read_lock(); invoke_callback_on_ap(); kick_hotplug_thread(ap); wait_for_completion(); hotplug_thread_fn() lock(m); do_stuff(); unlock(m); Lockdep does not know about this dependency and will not trigger on the following code sequence: lock(m); cpus_read_lock(); Add a lockdep map and connect the initiators lock chain with the hotplug thread lock chain, so potential deadlocks can be detected. Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Acked-by: Ingo Molnar Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20170524081549.709375845@linutronix.de --- kernel/cpu.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 66836216ebae..7435ffc6163b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -66,6 +66,12 @@ struct cpuhp_cpu_state { static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); +#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) +static struct lock_class_key cpuhp_state_key; +static struct lockdep_map cpuhp_state_lock_map = + STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); +#endif + /** * cpuhp_step - Hotplug state machine step * @name: Name of the step @@ -403,6 +409,7 @@ static void cpuhp_thread_fun(unsigned int cpu) st->should_run = false; + lock_map_acquire(&cpuhp_state_lock_map); /* Single callback invocation for [un]install ? */ if (st->single) { if (st->cb_state < CPUHP_AP_ONLINE) { @@ -429,6 +436,7 @@ static void cpuhp_thread_fun(unsigned int cpu) else if (st->state > st->target) ret = cpuhp_ap_offline(cpu, st); } + lock_map_release(&cpuhp_state_lock_map); st->result = ret; complete(&st->done); } @@ -443,6 +451,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, if (!cpu_online(cpu)) return 0; + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); + /* * If we are up and running, use the hotplug thread. For early calls * we invoke the thread function directly. @@ -486,6 +497,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) enum cpuhp_state state = st->state; trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + lock_map_acquire(&cpuhp_state_lock_map); + lock_map_release(&cpuhp_state_lock_map); __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); -- cgit v1.3-14-g43fede From 5a29ef22098874db79af7bf92a247a0f503bfa6e Mon Sep 17 00:00:00 2001 From: Vincent Legoll Date: Tue, 9 May 2017 10:34:09 +0200 Subject: genirq: Make early_irq_init() print out more informative The printk in early_irq_init() is cryptic and badly formatted: NR_IRQS:33024 nr_irqs:968 16 The last number is the number of preallocated interrupts, so add a prefix to it: NR_IRQS: 33024, nr_irqs: 968, preallocated irqs: 16 Cleanup the formatting for better readability as well. Signed-off-by: Vincent Legoll Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1494318849-6733-1-git-send-email-vincent.legoll@gmail.com --- kernel/irq/irqdesc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..09abce2ea8f0 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -480,7 +480,8 @@ int __init early_irq_init(void) /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", + NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) nr_irqs = IRQ_BITMAP_BITS; @@ -516,7 +517,7 @@ int __init early_irq_init(void) init_irq_default_affinity(); - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); desc = irq_desc; count = ARRAY_SIZE(irq_desc); -- cgit v1.3-14-g43fede From d3ba5a9a345b1243276f8a982e1bce557c2504fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 May 2017 12:03:11 +0300 Subject: posix-timers: Make posix_clocks immutable There are no more modular users providing a posix clock. The register function is now pointless so the posix clock array can be initialized statically at compile time and the array including the various k_clock structs can be marked 'const'. Inspired by changes in the Grsecurity patch set, but done proper. [ tglx: Massaged changelog and fixed the POSIX_TIMER=n case ] Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Mike Travis Cc: Dimitri Sivanich Link: http://lkml.kernel.org/r/20170526090311.3377-3-hch@lst.de --- include/linux/posix-timers.h | 9 +- kernel/time/alarmtimer.c | 89 ++++++++++--------- kernel/time/posix-clock.c | 2 +- kernel/time/posix-cpu-timers.c | 34 +++----- kernel/time/posix-timers.c | 191 +++++++++++++++++++---------------------- 5 files changed, 151 insertions(+), 174 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8c1e43ab14a9..b313ef2e7385 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -105,10 +105,11 @@ struct k_clock { struct itimerspec64 *cur_setting); }; -extern struct k_clock clock_posix_cpu; -extern struct k_clock clock_posix_dynamic; - -void posix_timers_register_clock(const clockid_t clock_id, struct k_clock *new_clock); +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; /* function to call to trigger timer event */ int posix_timer_event(struct k_itimer *timr, int si_private); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..4f4cc3509b30 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -307,38 +307,6 @@ static int alarmtimer_resume(struct device *dev) } #endif -static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) -{ - struct alarm_base *base; - unsigned long flags; - ktime_t delta; - - switch(type) { - case ALARM_REALTIME: - base = &alarm_bases[ALARM_REALTIME]; - type = ALARM_REALTIME_FREEZER; - break; - case ALARM_BOOTTIME: - base = &alarm_bases[ALARM_BOOTTIME]; - type = ALARM_BOOTTIME_FREEZER; - break; - default: - WARN_ONCE(1, "Invalid alarm type: %d\n", type); - return; - } - - delta = ktime_sub(absexp, base->gettime()); - - spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta || (delta < freezer_delta)) { - freezer_delta = delta; - freezer_expires = absexp; - freezer_alarmtype = type; - } - spin_unlock_irqrestore(&freezer_delta_lock, flags); -} - - /** * alarm_init - Initialize an alarm structure * @alarm: ptr to alarm to be initialized @@ -488,6 +456,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) } EXPORT_SYMBOL_GPL(alarm_forward_now); +#ifdef CONFIG_POSIX_TIMERS + +static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) +{ + struct alarm_base *base; + unsigned long flags; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } + + delta = ktime_sub(absexp, base->gettime()); + + spin_lock_irqsave(&freezer_delta_lock, flags); + if (!freezer_delta || (delta < freezer_delta)) { + freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } + spin_unlock_irqrestore(&freezer_delta_lock, flags); +} /** * clock2alarm - helper that converts from clockid to alarmtypes @@ -846,6 +846,17 @@ out: return ret; } +const struct k_clock alarm_clock = { + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .nsleep = alarm_timer_nsleep, +}; +#endif /* CONFIG_POSIX_TIMERS */ + /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { @@ -871,23 +882,9 @@ static int __init alarmtimer_init(void) struct platform_device *pdev; int error = 0; int i; - struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .nsleep = alarm_timer_nsleep, - }; alarmtimer_rtc_timer_init(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); - } - /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 31d588d37a17..7e453005e078 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -434,7 +434,7 @@ static int pc_timer_settime(struct k_itimer *kit, int flags, return err; } -struct k_clock clock_posix_dynamic = { +const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..1a522b39f19d 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1413,7 +1413,7 @@ static int thread_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } -struct k_clock clock_posix_cpu = { +const struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, .clock_set = posix_cpu_clock_set, .clock_get = posix_cpu_clock_get, @@ -1425,24 +1425,16 @@ struct k_clock clock_posix_cpu = { .timer_get = posix_cpu_timer_get, }; -static __init int init_posix_cpu_timers(void) -{ - struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, - }; - struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, - }; - - posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); +const struct k_clock clock_process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, +}; - return 0; -} -__initcall(init_posix_cpu_timers); +const struct k_clock clock_thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, +}; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4d7b2ce09c27..0c0cccfa3586 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -125,8 +125,6 @@ static DEFINE_SPINLOCK(hash_lock); * which we beg off on and pass to do_sys_settimeofday(). */ -static struct k_clock posix_clocks[MAX_CLOCKS]; - /* * These ones are defined below. */ @@ -280,74 +278,87 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { - struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, - }; - struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - }; - struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - }; - struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - }; - - posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); - posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); - posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); - posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); - posix_timers_register_clock(CLOCK_TAI, &clock_tai); - posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, NULL); @@ -521,30 +532,6 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void posix_timers_register_clock(const clockid_t clock_id, - struct k_clock *new_clock) -{ - if ((unsigned) clock_id >= MAX_CLOCKS) { - printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", - clock_id); - return; - } - - if (!new_clock->clock_get) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", - clock_id); - return; - } - if (!new_clock->clock_getres) { - printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", - clock_id); - return; - } - - posix_clocks[clock_id] = *new_clock; -} -EXPORT_SYMBOL_GPL(posix_timers_register_clock); - static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; @@ -581,15 +568,15 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static struct k_clock *clockid_to_kclock(const clockid_t id) +static const struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; - if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) return NULL; - return &posix_clocks[id]; + return posix_clocks[id]; } static int common_timer_create(struct k_itimer *new_timer) @@ -604,7 +591,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; @@ -781,7 +768,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; - struct k_clock *kc; + const struct k_clock *kc; unsigned long flags; int ret = 0; @@ -890,7 +877,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct itimerspec new_spec, old_spec; struct k_itimer *timr; unsigned long flag; - struct k_clock *kc; + const struct k_clock *kc; int error = 0; if (!new_setting) @@ -939,7 +926,7 @@ static int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = clockid_to_kclock(timer->it_clock); if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; @@ -1018,7 +1005,7 @@ void exit_itimers(struct signal_struct *sig) SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp64; struct timespec new_tp; @@ -1035,7 +1022,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; @@ -1055,7 +1042,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct timex __user *, utx) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timex ktx; int err; @@ -1078,7 +1065,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; @@ -1110,7 +1097,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t64; struct timespec t; @@ -1136,7 +1123,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct k_clock *kc = clockid_to_kclock(which_clock); + const struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) return -EINVAL; -- cgit v1.3-14-g43fede From b6b3b80fceb175c825ad6c72659e0a72e201fc5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 27 May 2017 12:23:47 +0200 Subject: alarmtimer: Fix posix-timer constification fallout Some freezer related variables are only used when either CONFIG_POSIX_TIMER or CONFIG_RTC_CLASS are enabled. Hide them when both are off. Fixes: d3ba5a9a345b ("posix-timers: Make posix_clocks immutable") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Christoph Helwig --- kernel/time/alarmtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4f4cc3509b30..4cfebfff848d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -45,11 +45,13 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; +#if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) /* freezer information to handle clock_nanosleep triggered wakeups */ static enum alarmtimer_type freezer_alarmtype; static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +#endif static struct wakeup_source *ws; -- cgit v1.3-14-g43fede From 613763a1f056211522bac77ff39f25706e678fdd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 May 2017 22:04:29 -0400 Subject: take compat_sys_old_getrlimit() to native syscall ... and sanitize the ifdefs in there Signed-off-by: Al Viro --- arch/powerpc/include/asm/compat.h | 1 - arch/s390/include/asm/compat.h | 1 - arch/x86/include/asm/compat.h | 1 - include/linux/syscalls.h | 2 +- kernel/compat.c | 29 ----------------------------- kernel/sys.c | 24 ++++++++++++++++++++++++ 6 files changed, 25 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 4f2df589ec1d..f256e1d14a14 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -109,7 +109,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 0ddd37e6c29d..b9300f8aee10 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -178,7 +178,6 @@ struct compat_statfs64 { u32 f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 24118c0b4640..5343c19814b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -116,7 +116,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 980c3c9b06f8..3cb15ea48aee 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -650,7 +650,7 @@ asmlinkage long sys_olduname(struct oldold_utsname __user *); asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim); -#if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64)) +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim); #endif asmlinkage long sys_setrlimit(unsigned int resource, diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..860f674fa556 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -468,35 +468,6 @@ COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, return do_prlimit(current, resource, &r, NULL); } -#ifdef COMPAT_RLIM_OLD_INFINITY - -COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); - set_fs(old_fs); - - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - -#endif - COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct compat_rlimit __user *, rlim) { diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..3778a8a417b6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1328,6 +1328,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + r = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (r.rlim_cur > 0x7FFFFFFF) + r.rlim_cur = 0x7FFFFFFF; + if (r.rlim_max > 0x7FFFFFFF) + r.rlim_max = 0x7FFFFFFF; + + if (put_user(r.rlim_cur, &rlim->rlim_cur) || + put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + return 0; +} +#endif + #endif static inline bool rlim64_is_infinity(__u64 rlim64) -- cgit v1.3-14-g43fede From bcfe8ad8ef55a1cf3c935c2667e8e5ae598b3b7e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 27 May 2017 00:29:34 -0400 Subject: do_sigaltstack(): lift copying to/from userland into callers Signed-off-by: Al Viro --- kernel/signal.c | 107 ++++++++++++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..d1eed0d7ca64 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3113,78 +3113,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) } static int -do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) +do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) { - stack_t oss; - int error; + struct task_struct *t = current; - oss.ss_sp = (void __user *) current->sas_ss_sp; - oss.ss_size = current->sas_ss_size; - oss.ss_flags = sas_ss_flags(sp) | - (current->sas_ss_flags & SS_FLAG_BITS); + if (oss) { + memset(oss, 0, sizeof(stack_t)); + oss->ss_sp = (void __user *) t->sas_ss_sp; + oss->ss_size = t->sas_ss_size; + oss->ss_flags = sas_ss_flags(sp) | + (current->sas_ss_flags & SS_FLAG_BITS); + } - if (uss) { - void __user *ss_sp; - size_t ss_size; - unsigned ss_flags; + if (ss) { + void __user *ss_sp = ss->ss_sp; + size_t ss_size = ss->ss_size; + unsigned ss_flags = ss->ss_flags; int ss_mode; - error = -EFAULT; - if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) - goto out; - error = __get_user(ss_sp, &uss->ss_sp) | - __get_user(ss_flags, &uss->ss_flags) | - __get_user(ss_size, &uss->ss_size); - if (error) - goto out; - - error = -EPERM; - if (on_sig_stack(sp)) - goto out; + if (unlikely(on_sig_stack(sp))) + return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; - error = -EINVAL; - if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && - ss_mode != 0) - goto out; + if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && + ss_mode != 0)) + return -EINVAL; if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { - error = -ENOMEM; - if (ss_size < MINSIGSTKSZ) - goto out; + if (unlikely(ss_size < MINSIGSTKSZ)) + return -ENOMEM; } - current->sas_ss_sp = (unsigned long) ss_sp; - current->sas_ss_size = ss_size; - current->sas_ss_flags = ss_flags; - } - - error = 0; - if (uoss) { - error = -EFAULT; - if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) - goto out; - error = __put_user(oss.ss_sp, &uoss->ss_sp) | - __put_user(oss.ss_size, &uoss->ss_size) | - __put_user(oss.ss_flags, &uoss->ss_flags); + t->sas_ss_sp = (unsigned long) ss_sp; + t->sas_ss_size = ss_size; + t->sas_ss_flags = ss_flags; } - -out: - return error; + return 0; } + SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { - return do_sigaltstack(uss, uoss, current_user_stack_pointer()); + stack_t new, old; + int err; + if (uss && copy_from_user(&new, uss, sizeof(stack_t))) + return -EFAULT; + err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, + current_user_stack_pointer()); + if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) + err = -EFAULT; + return err; } int restore_altstack(const stack_t __user *uss) { - int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); + stack_t new; + if (copy_from_user(&new, uss, sizeof(stack_t))) + return -EFAULT; + (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); /* squash all but EFAULT for now */ - return err == -EFAULT ? err : 0; + return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) @@ -3207,29 +3197,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, { stack_t uss, uoss; int ret; - mm_segment_t seg; if (uss_ptr) { compat_stack_t uss32; - - memset(&uss, 0, sizeof(stack_t)); if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), - (stack_t __force __user *) &uoss, + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer()); - set_fs(seg); if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || - __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || - __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || - __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + compat_stack_t old; + memset(&old, 0, sizeof(old)); + old.ss_sp = ptr_to_compat(uoss.ss_sp); + old.ss_flags = uoss.ss_flags; + old.ss_size = uoss.ss_size; + if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; -- cgit v1.3-14-g43fede From 7c25904508afef134d7a9d2ad1690ee6554a1faa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 May 2017 14:56:50 +0200 Subject: nohz: Reset next_tick cache even when the timer has no regs Handle tick interrupts whose regs are NULL, out of general paranoia. It happens when hrtimer_interrupt() is called from non-interrupt contexts, such as hotplug CPU down events. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 764d2905e6a5..e3043873fcdc 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1202,6 +1202,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) */ if (regs) tick_sched_handle(ts, regs); + else + ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) -- cgit v1.3-14-g43fede From 7786f6b6dfc12d17eea2df04116de6ebac50c884 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 7 Apr 2017 10:17:27 -0400 Subject: audit: add ambient capabilities to CAPSET and BPRM_FCAPS records Capabilities were augmented to include ambient capabilities in v4.3 commit 58319057b784 ("capabilities: ambient capabilities"). Add ambient capabilities to the audit BPRM_FCAPS and CAPSET records. The record contains fields "old_pp", "old_pi", "old_pe", "new_pp", "new_pi", "new_pe" so in keeping with the previous record normalizations, change the "new_*" variants to simply drop the "new_" prefix. A sample of the replaced BPRM_FCAPS record: RAW: type=BPRM_FCAPS msg=audit(1491468034.252:237): fver=2 fp=0000000000200000 fi=0000000000000000 fe=1 old_pp=0000000000000000 old_pi=0000000000000000 old_pe=0000000000000000 old_pa=0000000000000000 pp=0000000000200000 pi=0000000000000000 pe=0000000000200000 pa=0000000000000000 INTERPRET: type=BPRM_FCAPS msg=audit(04/06/2017 04:40:34.252:237): fver=2 fp=sys_admin fi=none fe=chown old_pp=none old_pi=none old_pe=none old_pa=none pp=sys_admin pi=none pe=sys_admin pa=none A sample of the replaced CAPSET record: RAW: type=CAPSET msg=audit(1491469502.371:242): pid=833 cap_pi=0000003fffffffff cap_pp=0000003fffffffff cap_pe=0000003fffffffff cap_pa=0000000000000000 INTERPRET: type=CAPSET msg=audit(04/06/2017 05:05:02.371:242) : pid=833 cap_pi=chown,dac_override,dac_read_search,fowner,fsetid,kill, setgid,setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource,sys_time, sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pp=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pe=chown,dac_override,dac_read_search,fowner,fsetid,kill,setgid, setuid,setpcap,linux_immutable,net_bind_service,net_broadcast, net_admin,net_raw,ipc_lock,ipc_owner,sys_module,sys_rawio,sys_chroot, sys_ptrace,sys_pacct,sys_admin,sys_boot,sys_nice,sys_resource, sys_time,sys_tty_config,mknod,lease,audit_write,audit_control,setfcap, mac_override,mac_admin,syslog,wake_alarm,block_suspend,audit_read cap_pa=none See: https://github.com/linux-audit/audit-kernel/issues/40 Signed-off-by: Richard Guy Briggs Acked-by: Serge Hallyn Signed-off-by: Paul Moore --- kernel/audit.h | 1 + kernel/auditsc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index ddfce2ea4891..bb3a4e14b7e5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -68,6 +68,7 @@ struct audit_cap_data { unsigned int fE; /* effective bit of file cap */ kernel_cap_t effective; /* effective set of process */ }; + kernel_cap_t ambient; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b2dcbe637b7c..5fa68d10032f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1260,6 +1260,7 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); break; case AUDIT_MMAP: audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, @@ -1381,9 +1382,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); - audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); - audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); - audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); + audit_log_cap(ab, "pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "pe", &axs->new_pcap.effective); + audit_log_cap(ab, "pa", &axs->new_pcap.ambient); break; } } @@ -2341,10 +2344,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->old_pcap.permitted = old->cap_permitted; ax->old_pcap.inheritable = old->cap_inheritable; ax->old_pcap.effective = old->cap_effective; + ax->old_pcap.ambient = old->cap_ambient; ax->new_pcap.permitted = new->cap_permitted; ax->new_pcap.inheritable = new->cap_inheritable; ax->new_pcap.effective = new->cap_effective; + ax->new_pcap.ambient = new->cap_ambient; return 0; } @@ -2363,6 +2368,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; context->capset.cap.permitted = new->cap_permitted; + context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; } -- cgit v1.3-14-g43fede From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:27 -0700 Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode free up BPF_JMP | BPF_CALL | BPF_X opcode to be used by actual indirect call by register and use kernel internal opcode to mark call instruction into bpf_tail_call() helper. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 3 +++ kernel/bpf/core.c | 2 +- kernel/bpf/verifier.c | 2 +- 8 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 71f930501ade..b1d38eeb24f6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -586,7 +586,7 @@ emit_cond_jmp: break; } /* tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: if (emit_bpf_tail_call(ctx)) return -EFAULT; break; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index aee2bb817ac6..a01366584a4b 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -938,7 +938,7 @@ common_load: /* * Tail call */ - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: ctx->seen |= SEEN_TAILCALL; bpf_jit_emit_tail_call(image, ctx, addrs[i + 1]); break; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 6e97a2e3fd8d..42ad3832586c 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -991,7 +991,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i } break; } - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: /* * Implicit input: * B1: pointer to ctx diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 21de77419f48..4a52d34facf9 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1217,7 +1217,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) } /* tail call */ - case BPF_JMP | BPF_CALL |BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_tail_call(ctx); break; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f58939393eef..fec12eaa0dec 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -877,7 +877,7 @@ xadd: if (is_imm8(insn->off)) } break; - case BPF_JMP | BPF_CALL | BPF_X: + case BPF_JMP | BPF_TAIL_CALL: emit_bpf_tail_call(&prog); break; diff --git a/include/linux/filter.h b/include/linux/filter.h index 62d948f80730..a20ba40fcb73 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,6 +57,9 @@ struct bpf_prog_aux; #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_JIT_REG (MAX_BPF_REG + 1) +/* unused opcode to mark special call to bpf_tail_call() helper */ +#define BPF_TAIL_CALL 0xf0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dedf367f59bb..339289402b96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, - [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, + [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 339c8a1371de..28113d0e8e92 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3469,7 +3469,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * that doesn't support bpf_tail_call yet */ insn->imm = 0; - insn->code |= BPF_X; + insn->code = BPF_JMP | BPF_TAIL_CALL; continue; } -- cgit v1.3-14-g43fede From f696b8f471ec987e987e38206b8eb23c39ee5a86 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:28 -0700 Subject: bpf: split bpf core interpreter split __bpf_prog_run() interpreter into stack allocation and execution parts. The code section shrinks which helps interpreter performance in some cases. text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before 25777 10328 624 36729 8f79 kernel/bpf/core.o.after Very short programs got slower (due to extra function call): Before: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 7 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 8 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 7 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 11 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 7 PASS After: test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 11 PASS test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 11 PASS test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 11 PASS test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 14 PASS test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 10 PASS Longer programs got faster: Before: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 20286 20513 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 31853 31768 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9815 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 6 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13959 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 210 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 21724 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19118 PASS After: test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 19008 18827 PASS test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 29238 28450 PASS test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9485 PASS test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 12 PASS test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13257 PASS test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 213 PASS test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 19389 PASS test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19583 PASS For real world production programs the difference is noise. This patch is first step towards reducing interpreter stack consumption. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 339289402b96..abd410d394bc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, + u64 *stack) { - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG], tmp; + u64 tmp; static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ @@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - select_insn: goto *jumptable[insn->code]; @@ -1219,7 +1216,17 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } -STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ +STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ + +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG]; + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + return ___bpf_prog_run(regs, insn, stack); +} bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) -- cgit v1.3-14-g43fede From 8726679a0fa317f8e83d0843b266453f31bff092 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:29 -0700 Subject: bpf: teach verifier to track stack depth teach verifier to track bpf program stack depth Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6bb38d76faf4..fcc80ca11045 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -171,6 +171,7 @@ struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; u32 max_ctx_offset; + u32 stack_depth; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 28113d0e8e92..d96f27ff9f6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("invalid stack off=%d size=%d\n", off, size); return -EACCES; } + + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (t == BPF_WRITE) { if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && @@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } + if (env->prog->aux->stack_depth < -off) + env->prog->aux->stack_depth = -off; + if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -3167,7 +3174,8 @@ process_bpf_exit: insn_idx++; } - verbose("processed %d insns\n", insn_processed); + verbose("processed %d insns, stack depth %d\n", + insn_processed, env->prog->aux->stack_depth); return 0; } -- cgit v1.3-14-g43fede From 80a58d02559465b0ea403ff91c8bca9a733b1b0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:30 -0700 Subject: bpf: reconcile bpf_tail_call and stack_depth The next set of patches will take advantage of stack_depth tracking, so make sure that the program that does bpf_tail_call() has stack depth large enough for the callee. We could have tracked the stack depth of the prog_array owner program and only allow insertion of the programs with stack depth less than the owner, but it will break existing applications. Some of them have trivial root bpf program that only does multiple bpf_tail_calls and at init time the prog array is empty. In the future we may add a flag to do such tracking optionally, but for now play simple and safe. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d96f27ff9f6f..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3470,6 +3470,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; + env->prog->aux->stack_depth = MAX_BPF_STACK; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal -- cgit v1.3-14-g43fede From b870aa901f4be1d32c13faf9e8f40bf2a8562e19 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 May 2017 13:31:33 -0700 Subject: bpf: use different interpreter depending on required stack size 16 __bpf_prog_run() interpreters for various stack sizes add .text but not a lot comparing to run-time stack savings text data bss dec hex filename 26350 10328 624 37302 91b6 kernel/bpf/core.o.before_split 25777 10328 624 36729 8f79 kernel/bpf/core.o.after_split 26970 10328 624 37922 9422 kernel/bpf/core.o.now Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index abd410d394bc..774069ca18a7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1218,16 +1218,38 @@ load_byte: } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ -static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) -{ - u64 stack[MAX_BPF_STACK / sizeof(u64)]; - u64 regs[MAX_BPF_REG]; - - FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; - ARG1 = (u64) (unsigned long) ctx; - return ___bpf_prog_run(regs, insn, stack); +#define PROG_NAME(stack_size) __bpf_prog_run##stack_size +#define DEFINE_BPF_PROG_RUN(stack_size) \ +static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + ARG1 = (u64) (unsigned long) ctx; \ + return ___bpf_prog_run(regs, insn, stack); \ } +#define EVAL1(FN, X) FN(X) +#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) +#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) +#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) +#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) +#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) + +EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); + +#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), + +static unsigned int (*interpreters[])(const void *ctx, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1275,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = (void *) __bpf_prog_run; + fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.3-14-g43fede From fb9a307d11d62749d75b404f15517d73f5d6e148 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:15:59 -0700 Subject: bpf: Allow CGROUP_SKB eBPF program to access sk_buff This allows cgroup eBPF program to classify packet based on their protocol or other detail information. Currently program need CAP_NET_ADMIN privilege to attach a cgroup eBPF program, and A process with CAP_NET_ADMIN can already see all packets on the system, for example, by creating an iptables rules that causes the packet to be passed to userspace via NFLOG. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..8acae64df255 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,6 +2426,7 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit v1.3-14-g43fede From 80b7d81912d807f161d55e9c2c9cc81061666f83 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 31 May 2017 18:16:00 -0700 Subject: bpf: Remove the capability check for cgroup skb eBPF program Currently loading a cgroup skb eBPF program require a CAP_SYS_ADMIN capability while attaching the program to a cgroup only requires the user have CAP_NET_ADMIN privilege. We can escape the capability check when load the program just like socket filter program to make the capability requirement consistent. Change since v1: Change the code style in order to be compliant with checkpatch.pl preference Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 265a0d854e33..59da103adb85 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -815,7 +815,9 @@ static int bpf_prog_load(union bpf_attr *attr) attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; - if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) + if (type != BPF_PROG_TYPE_SOCKET_FILTER && + type != BPF_PROG_TYPE_CGROUP_SKB && + !capable(CAP_SYS_ADMIN)) return -EPERM; /* plain bpf_prog allocation */ -- cgit v1.3-14-g43fede From e5aeee51f6b4fb22e851105ee6d8ad211c40a214 Mon Sep 17 00:00:00 2001 From: Alexander Levin Date: Sat, 3 Jun 2017 03:39:13 +0000 Subject: perf/core: Don't release cred_guard_mutex if not taken If we failed to acquire task's cred_guard_mutex we shouldn't proceed to release it in the error path. Fixes: a63fbed776c ("perf/tracing/cpuhotplug: Fix locking order") Signed-off-by: Alexander Levin Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: mhiramat@kernel.org Cc: paulmck@linux.vnet.ibm.com Cc: bigeasy@linutronix.de Link: http://lkml.kernel.org/r/20170603033903.12056-1-alexander.levin@verizon.com Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b97cda4d1777..1f1b8cdaca2d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9878,7 +9878,7 @@ SYSCALL_DEFINE5(perf_event_open, if (task) { err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); if (err) - goto err_cred; + goto err_task; /* * Reuse ptrace permission checks for now. -- cgit v1.3-14-g43fede From 201d7f47f34bd7cb19161d0426f13b141e381f30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 May 2017 11:58:32 +0200 Subject: genirq: Handle NOAUTOEN interrupt setup proper If an interrupt is marked NOAUTOEN then request_irq() installs the action, but does not enable the interrupt via startup_irq(). The interrupt is enabled via enable_irq() later from the driver. enable_irq() calls irq_enable(). That means that for interrupts which have a irq_startup() callback this callback is never invoked. Neither is irq_domain_activate_irq() invoked for such interrupts. If an interrupt depends on irq_startup() or irq_domain_activate_irq() then the enable via irq_enable() is not enough. Add a status flag IRQD_IRQ_STARTED_UP and use this to select the proper mechanism in enable_irq(). Use the flag also to avoid pointless calls into the low level functions. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: dianders@chromium.org Cc: jeffy Cc: Brian Norris Cc: tfiga@chromium.org Link: http://lkml.kernel.org/r/20170531100212.130986205@linutronix.de --- include/linux/irq.h | 6 +++++ kernel/irq/chip.c | 76 +++++++++++++++++++++++++++++++++++++---------------- kernel/irq/manage.c | 12 ++++++--- 3 files changed, 69 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index f887351aa80e..94d1ad6ffdd4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -216,6 +216,7 @@ enum { IRQD_WAKEUP_ARMED = (1 << 19), IRQD_FORWARDED_TO_VCPU = (1 << 20), IRQD_AFFINITY_MANAGED = (1 << 21), + IRQD_IRQ_STARTED = (1 << 22), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -329,6 +330,11 @@ static inline void irqd_clr_activated(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_ACTIVATED; } +static inline bool irqd_is_started(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_IRQ_STARTED; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c94da688ee9b..e0051d58c909 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -185,37 +185,64 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } +static void irq_state_clr_started(struct irq_desc *desc) +{ + irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); +} + +static void irq_state_set_started(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); +} + int irq_startup(struct irq_desc *desc, bool resend) { int ret = 0; - irq_state_clr_disabled(desc); desc->depth = 0; - irq_domain_activate_irq(&desc->irq_data); - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_masked(desc); - } else { + if (irqd_is_started(&desc->irq_data)) { irq_enable(desc); + } else { + irq_domain_activate_irq(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_disabled(desc); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + irq_state_set_started(desc); } + if (resend) check_irq_resend(desc); + return ret; } +static void __irq_disable(struct irq_desc *desc, bool mask); + void irq_shutdown(struct irq_desc *desc) { - irq_state_set_disabled(desc); - desc->depth = 1; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else if (desc->irq_data.chip->irq_disable) - desc->irq_data.chip->irq_disable(&desc->irq_data); - else - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (irqd_is_started(&desc->irq_data)) { + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) { + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_state_set_disabled(desc); + irq_state_set_masked(desc); + } else { + __irq_disable(desc, true); + } + irq_state_clr_started(desc); + } + /* + * This must be called even if the interrupt was never started up, + * because the activation can happen before the interrupt is + * available for request/startup. It has it's own state tracking so + * it's safe to call it unconditionally. + */ irq_domain_deactivate_irq(&desc->irq_data); - irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -228,6 +255,17 @@ void irq_enable(struct irq_desc *desc) irq_state_clr_masked(desc); } +static void __irq_disable(struct irq_desc *desc, bool mask) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } else if (mask) { + mask_irq(desc); + } +} + /** * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled @@ -250,13 +288,7 @@ void irq_enable(struct irq_desc *desc) */ void irq_disable(struct irq_desc *desc) { - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } else if (irq_settings_disable_unlazy(desc)) { - mask_irq(desc); - } + __irq_disable(desc, irq_settings_disable_unlazy(desc)); } void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..57056109f176 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -533,9 +533,15 @@ void __enable_irq(struct irq_desc *desc) goto err_out; /* Prevent probing on this irq: */ irq_settings_set_noprobe(desc); - irq_enable(desc); - check_irq_resend(desc); - /* fall-through */ + /* + * Call irq_startup() not irq_enable() here because the + * interrupt might be marked NOAUTOEN. So irq_startup() + * needs to be invoked when it gets enabled the first + * time. If it was already started up, then irq_startup() + * will invoke irq_enable() under the hood. + */ + irq_startup(desc, true); + break; } default: desc->depth--; -- cgit v1.3-14-g43fede From 04c848d398797a626608ff48804d809ae6687163 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 May 2017 11:58:33 +0200 Subject: genirq: Warn when IRQ_NOAUTOEN is used with shared interrupts Shared interrupts do not go well with disabling auto enable: 1) The sharing interrupt might request it while it's still disabled and then wait for interrupts forever. 2) The interrupt might have been requested by the driver sharing the line before IRQ_NOAUTOEN has been set. So the driver which expects that disabled state after calling request_irq() will not get what it wants. Even worse, when it calls enable_irq() later, it will trigger the unbalanced enable_irq() warning. Reported-by: Brian Norris Signed-off-by: Thomas Gleixner Cc: dianders@chromium.org Cc: jeffy Cc: Marc Zyngier Cc: tfiga@chromium.org Link: http://lkml.kernel.org/r/20170531100212.210682135@linutronix.de --- kernel/irq/chip.c | 7 +++++++ kernel/irq/manage.c | 12 ++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e0051d58c909..bc1331f84fb5 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -935,6 +935,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; + + /* + * Warn when a driver sets the no autoenable flag on an already + * active interrupt. + */ + WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); + irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 57056109f176..49c37f1e71c0 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1334,11 +1334,19 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (irq_settings_can_autoenable(desc)) + if (irq_settings_can_autoenable(desc)) { irq_startup(desc, true); - else + } else { + /* + * Shared interrupts do not go well with disabling + * auto enable. The sharing interrupt might request + * it while it's still disabled and then wait for + * interrupts forever. + */ + WARN_ON_ONCE(new->flags & IRQF_SHARED); /* Undo nested disables: */ desc->depth = 1; + } /* Exclude IRQ from balancing if requested */ if (new->flags & IRQF_NOBALANCING) { -- cgit v1.3-14-g43fede From 31ea70e0308b73a1b862bd17c06efc3cbcfd2016 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 21:01:00 +0200 Subject: posix-timers: Move the do_schedule_next_timer declaration Having it in asm-generic/siginfo.h doesn't make any sense as it is in no way architecture specific. Move it to posix-timers.h instead. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Fenghua Yu Cc: Tony Luck Cc: linux-ia64@vger.kernel.org Cc: Arnd Bergmann Cc: sparclinux@vger.kernel.org Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20170603190102.28866-4-hch@lst.de --- include/asm-generic/siginfo.h | 1 - include/linux/posix-timers.h | 3 +++ kernel/signal.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index a2508a8f9a9c..5a9394763a66 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -15,7 +15,6 @@ #define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) struct siginfo; -void do_schedule_next_timer(struct siginfo *info); extern int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 34e893a75771..8929f7e8f452 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -7,6 +7,7 @@ #include #include +struct siginfo; struct cpu_timer_list { struct list_head entry; @@ -120,4 +121,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block); void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); +void do_schedule_next_timer(struct siginfo *info); + #endif diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..1f85c843be8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -39,6 +39,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include -- cgit v1.3-14-g43fede From 18c700c4e3d0a37c43a2df8b8f740121d4dac645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:36 +0200 Subject: alarmtimer: Remove pointless config conditional Having a IF_ENABLED(CONFIG_POSIX_TIMERS) inside of a #ifdef CONFIG_POSIX_TIMERS section is pointless. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211655.975218056@linutronix.de --- kernel/time/alarmtimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e645dcc7d4ee..2a8675f9aac5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -520,8 +520,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (IS_ENABLED(CONFIG_POSIX_TIMERS) && - posix_timer_event(ptr, 0) != 0) + if (posix_timer_event(ptr, 0)) ptr->it_overrun++; } -- cgit v1.3-14-g43fede From a81129e5a189973abd661704b261f8aad9325407 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:37 +0200 Subject: posix-timers: Remove unused export of posix_timer_event() Since the removal of the mmtimer driver the export is not longer needed. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.052744418@linutronix.de --- kernel/time/posix-timers.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0c0cccfa3586..44d486590e6e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -442,7 +442,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) /* If we failed to send the signal the timer stops. */ return ret > 0; } -EXPORT_SYMBOL_GPL(posix_timer_event); /* * This function gets called when a POSIX.1b interval timer expires. It -- cgit v1.3-14-g43fede From 3a06c7ac24f9f24ec059cd77c2dbdf7fbfd0aaaf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:38 +0200 Subject: posix-clocks: Remove interval timer facility and mmap/fasync callbacks The only user of this facility is ptp_clock, which does not implement any of those functions. Remove them to prevent accidental users. Especially the interval timer interfaces are now more or less impossible to implement because the necessary infrastructure has been confined to the core code. Aside of that it's really complex to make these callbacks implemented according to spec as the alarm timer implementation demonstrates. If at all then a nanosleep callback might be a reasonable extension. For now keep just what ptp_clock needs. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.145036286@linutronix.de --- include/linux/posix-clock.h | 22 --------- kernel/time/posix-clock.c | 113 -------------------------------------------- 2 files changed, 135 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index 83b22ae9ae12..38d8225510f1 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -42,12 +42,6 @@ struct posix_clock; * @clock_gettime: Read the current time * @clock_getres: Get the clock resolution * @clock_settime: Set the current time value - * @timer_create: Create a new timer - * @timer_delete: Remove a previously created timer - * @timer_gettime: Get remaining time and interval of a timer - * @timer_settime: Set a timer's initial expiration and interval - * @fasync: Optional character device fasync method - * @mmap: Optional character device mmap method * @open: Optional character device open method * @release: Optional character device release method * @ioctl: Optional character device ioctl method @@ -66,28 +60,12 @@ struct posix_clock_operations { int (*clock_settime)(struct posix_clock *pc, const struct timespec64 *ts); - int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); - - int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); - - void (*timer_gettime)(struct posix_clock *pc, - struct k_itimer *kit, struct itimerspec64 *tsp); - - int (*timer_settime)(struct posix_clock *pc, - struct k_itimer *kit, int flags, - struct itimerspec64 *tsp, struct itimerspec64 *old); /* * Optional character device methods: */ - int (*fasync) (struct posix_clock *pc, - int fd, struct file *file, int on); - long (*ioctl) (struct posix_clock *pc, unsigned int cmd, unsigned long arg); - int (*mmap) (struct posix_clock *pc, - struct vm_area_struct *vma); - int (*open) (struct posix_clock *pc, fmode_t f_mode); uint (*poll) (struct posix_clock *pc, diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 7e453005e078..bd4fb785652f 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -82,38 +82,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) return result; } -static int posix_clock_fasync(int fd, struct file *fp, int on) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = 0; - - if (!clk) - return -ENODEV; - - if (clk->ops.fasync) - err = clk->ops.fasync(clk, fd, fp, on); - - put_posix_clock(clk); - - return err; -} - -static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) -{ - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENODEV; - - if (!clk) - return -ENODEV; - - if (clk->ops.mmap) - err = clk->ops.mmap(clk, vma); - - put_posix_clock(clk); - - return err; -} - static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { @@ -199,8 +167,6 @@ static const struct file_operations posix_clock_file_operations = { .unlocked_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, - .fasync = posix_clock_fasync, - .mmap = posix_clock_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = posix_clock_compat_ioctl, #endif @@ -359,88 +325,9 @@ out: return err; } -static int pc_timer_create(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_create) - err = cd.clk->ops.timer_create(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static int pc_timer_delete(struct k_itimer *kit) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_delete) - err = cd.clk->ops.timer_delete(cd.clk, kit); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - - if (get_clock_desc(id, &cd)) - return; - - if (cd.clk->ops.timer_gettime) - cd.clk->ops.timer_gettime(cd.clk, kit, ts); - - put_clock_desc(&cd); -} - -static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec64 *ts, struct itimerspec64 *old) -{ - clockid_t id = kit->it_clock; - struct posix_clock_desc cd; - int err; - - err = get_clock_desc(id, &cd); - if (err) - return err; - - if (cd.clk->ops.timer_settime) - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); - else - err = -EOPNOTSUPP; - - put_clock_desc(&cd); - - return err; -} - const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get = pc_clock_gettime, .clock_adj = pc_clock_adjtime, - .timer_create = pc_timer_create, - .timer_set = pc_timer_settime, - .timer_del = pc_timer_delete, - .timer_get = pc_timer_gettime, }; -- cgit v1.3-14-g43fede From 6631fa12c105e326bbe5fb215eb216e86c90d1ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:39 +0200 Subject: posix-timers: Avoid gazillions of forward declarations Move it below the actual implementations as there are new callbacks coming which would require even more forward declarations. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.238209952@linutronix.de --- kernel/time/posix-timers.c | 190 +++++++++++++++++++++------------------------ 1 file changed, 89 insertions(+), 101 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 44d486590e6e..b60b655dfbcd 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -69,6 +69,9 @@ static struct kmem_cache *posix_timers_cache; static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); +static const struct k_clock * const posix_clocks[]; +static const struct k_clock *clockid_to_kclock(const clockid_t id); + /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other * SIGEV values. Here we put out an error if this assumption fails. @@ -124,20 +127,6 @@ static DEFINE_SPINLOCK(hash_lock); * have is CLOCK_REALTIME and its high res counter part, both of * which we beg off on and pass to do_sys_settimeofday(). */ - -/* - * These ones are defined below. - */ -static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, - struct timespec __user *rmtp); -static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec64 *); -static int common_timer_set(struct k_itimer *, int, - struct itimerspec64 *, struct itimerspec64 *); -static int common_timer_del(struct k_itimer *timer); - -static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); - static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ @@ -278,82 +267,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } - -static const struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, -}; - -static const struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, -}; - -static const struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, -}; - -static const struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, -}; - -static const struct k_clock * const posix_clocks[] = { - [CLOCK_REALTIME] = &clock_realtime, - [CLOCK_MONOTONIC] = &clock_monotonic, - [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, - [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, - [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, - [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, - [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, - [CLOCK_BOOTTIME] = &clock_boottime, - [CLOCK_REALTIME_ALARM] = &alarm_clock, - [CLOCK_BOOTTIME_ALARM] = &alarm_clock, - [CLOCK_TAI] = &clock_tai, -}; - /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -567,17 +480,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } -static const struct k_clock *clockid_to_kclock(const clockid_t id) -{ - if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? - &clock_posix_dynamic : &clock_posix_cpu; - - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) - return NULL; - return posix_clocks[id]; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -1129,3 +1031,89 @@ long clock_nanosleep_restart(struct restart_block *restart_block) return kc->nsleep_restart(restart_block); } + +static const struct k_clock clock_realtime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_monotonic_raw = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, +}; + +static const struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, +}; + +static const struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, +}; + +static const struct k_clock clock_tai = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock clock_boottime = { + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, +}; + +static const struct k_clock * const posix_clocks[] = { + [CLOCK_REALTIME] = &clock_realtime, + [CLOCK_MONOTONIC] = &clock_monotonic, + [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, + [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, + [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, + [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, + [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, + [CLOCK_BOOTTIME] = &clock_boottime, + [CLOCK_REALTIME_ALARM] = &alarm_clock, + [CLOCK_BOOTTIME_ALARM] = &alarm_clock, + [CLOCK_TAI] = &clock_tai, +}; + +static const struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + return NULL; + return posix_clocks[id]; +} -- cgit v1.3-14-g43fede From bab0aae9dcba9466dcc968b8bd21914f8f691631 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:41 +0200 Subject: posix-timers: Move posix-timer internals to core None of these declarations is required outside of kernel/time. Move them to an internal header. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170530211656.394803853@linutronix.de --- include/linux/posix-timers.h | 30 ------------------------------ kernel/time/alarmtimer.c | 2 ++ kernel/time/posix-clock.c | 2 ++ kernel/time/posix-cpu-timers.c | 2 ++ kernel/time/posix-timers.c | 1 + kernel/time/posix-timers.h | 29 +++++++++++++++++++++++++++++ 6 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 kernel/time/posix-timers.h (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index e06062c3967b..a372e7e3a396 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -100,36 +100,6 @@ struct k_itimer { } it; }; -struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_set) (const clockid_t which_clock, - const struct timespec64 *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj) (const clockid_t which_clock, struct timex *tx); - int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); - long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting); - int (*timer_del) (struct k_itimer *timr); -#define TIMER_RETRY 1 - void (*timer_get) (struct k_itimer *timr, - struct itimerspec64 *cur_setting); -}; - -extern const struct k_clock clock_posix_cpu; -extern const struct k_clock clock_posix_dynamic; -extern const struct k_clock clock_process; -extern const struct k_clock clock_thread; -extern const struct k_clock alarm_clock; - -/* function to call to trigger timer event */ -int posix_timer_event(struct k_itimer *timr, int si_private); - -void posix_cpu_timer_schedule(struct k_itimer *timer); - void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2a8675f9aac5..36855d675da5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -28,6 +28,8 @@ #include #include +#include "posix-timers.h" + #define CREATE_TRACE_POINTS #include diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index bd4fb785652f..17cdc554c9fe 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -25,6 +25,8 @@ #include #include +#include "posix-timers.h" + static void delete_clock(struct kref *kref); /* diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index c99434739fd5..a77a792f2570 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -13,6 +13,8 @@ #include #include +#include "posix-timers.h" + /* * Called after updating RLIMIT_CPU to run cpu timer and update * tsk->signal->cputime_expires expiration cache if necessary. Needs diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b60b655dfbcd..dee6a0d911d4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -51,6 +51,7 @@ #include #include "timekeeping.h" +#include "posix-timers.h" /* * Management arrays for POSIX timers. Timers are now kept in static hash table diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 000000000000..ad2dbd29b389 --- /dev/null +++ b/kernel/time/posix-timers.h @@ -0,0 +1,29 @@ +#define TIMER_RETRY 1 + +struct k_clock { + int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); + int (*clock_set) (const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); + int (*clock_adj) (const clockid_t which_clock, struct timex *tx); + int (*timer_create) (struct k_itimer *timer); + int (*nsleep) (const clockid_t which_clock, int flags, + struct timespec64 *, struct timespec __user *); + long (*nsleep_restart) (struct restart_block *restart_block); + int (*timer_set) (struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del) (struct k_itimer *timr); + void (*timer_get) (struct k_itimer *timr, + struct itimerspec64 *cur_setting); +}; + +extern const struct k_clock clock_posix_cpu; +extern const struct k_clock clock_posix_dynamic; +extern const struct k_clock clock_process; +extern const struct k_clock clock_thread; +extern const struct k_clock alarm_clock; + +int posix_timer_event(struct k_itimer *timr, int si_private); + +void posix_cpu_timer_schedule(struct k_itimer *timer); -- cgit v1.3-14-g43fede From af888d677a3f4473c198b4720319dd037f398b51 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:42 +0200 Subject: posix-timers: Unify overrun/requeue_pending handling hrtimer based posix-timers and posix-cpu-timers handle the update of the rearming and overflow related status fields differently. Move that update to the common rearming code. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.484936964@linutronix.de --- kernel/time/posix-cpu-timers.c | 18 +++++++----------- kernel/time/posix-timers.c | 15 ++++++++------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a77a792f2570..1683e503179e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -527,6 +527,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * ticking in case the signal is deliverable next time. */ posix_cpu_timer_schedule(timer); + ++timer->it_requeue_pending; } } @@ -997,12 +998,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); if (unlikely(p->exit_state)) - goto out; + return; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (!sighand) - goto out; + return; } else { /* * Protect arm_timer() and timer sampling in case of call to @@ -1015,11 +1016,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * We can't even collect a sample any more. */ timer->it.cpu.expires = 0; - goto out; + return; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - unlock_task_sighand(p, &flags); - /* Optimizations: if the process is dying, no need to rearm */ - goto out; + /* If the process is dying, no need to rearm */ + goto unlock; } cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); @@ -1031,12 +1031,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); +unlock: unlock_task_sighand(p, &flags); - -out: - timer->it_overrun_last = timer->it_overrun; - timer->it_overrun = -1; - ++timer->it_requeue_pending; } /** diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index dee6a0d911d4..79a00e0f1ef9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -291,10 +291,6 @@ static void schedule_next_timer(struct k_itimer *timr) timr->it_overrun += (unsigned int) hrtimer_forward(timer, timer->base->get_time(), timr->it.real.interval); - - timr->it_overrun_last = timr->it_overrun; - timr->it_overrun = -1; - ++timr->it_requeue_pending; hrtimer_restart(timer); } @@ -315,18 +311,23 @@ void do_schedule_next_timer(struct siginfo *info) unsigned long flags; timr = lock_timer(info->si_tid, &flags); + if (!timr) + return; - if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_requeue_pending == info->si_sys_private) { if (timr->it_clock < 0) posix_cpu_timer_schedule(timr); else schedule_next_timer(timr); + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + info->si_overrun += timr->it_overrun_last; } - if (timr) - unlock_timer(timr, flags); + unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr, int si_private) -- cgit v1.3-14-g43fede From 80105cd0e62ba8a2caf8eebd52f42952c7c04046 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:43 +0200 Subject: posix-timers: Move interval out of the union Preparatory patch to unify the alarm timer and hrtimer based posix interval timer handling. The interval is used as a criteria for rearming decisions so moving it out of the clock specific data structures allows later unification. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.563922908@linutronix.de --- include/linux/posix-timers.h | 4 ++-- kernel/time/alarmtimer.c | 13 ++++++------- kernel/time/posix-timers.c | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a372e7e3a396..908048f488ae 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -63,6 +63,7 @@ struct cpu_timer_list { * @it_requeue_pending: Indicator that timer waits for being requeued on * signal delivery * @it_sigev_notify: The notify word of sigevent struct for signal delivery + * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) @@ -80,6 +81,7 @@ struct k_itimer { int it_overrun_last; int it_requeue_pending; int it_sigev_notify; + ktime_t it_interval; struct signal_struct *it_signal; union { struct pid *it_pid; @@ -89,12 +91,10 @@ struct k_itimer { union { struct { struct hrtimer timer; - ktime_t interval; } real; struct cpu_timer_list cpu; struct { struct alarm alarmtimer; - ktime_t interval; } alarm; struct rcu_head rcu; } it; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 36855d675da5..5b8cf4b61854 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -527,9 +527,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, } /* Re-add periodic timers */ - if (ptr->it.alarm.interval) { - ptr->it_overrun += alarm_forward(alarm, now, - ptr->it.alarm.interval); + if (ptr->it_interval) { + ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval); result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); @@ -613,7 +612,7 @@ static void alarm_timer_get(struct k_itimer *timr, cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it_interval); } /** @@ -662,14 +661,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); /* * Rate limit to the tick as a hot fix to prevent DOS. Will be * mopped up later. */ - if (timr->it.alarm.interval < TICK_NSEC) - timr->it.alarm.interval = TICK_NSEC; + if (timr->it_interval < TICK_NSEC) + timr->it_interval = TICK_NSEC; exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 79a00e0f1ef9..7dd992cc7105 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -285,12 +285,12 @@ static void schedule_next_timer(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - if (timr->it.real.interval == 0) + if (!timr->it_interval) return; timr->it_overrun += (unsigned int) hrtimer_forward(timer, timer->base->get_time(), - timr->it.real.interval); + timr->it_interval); hrtimer_restart(timer); } @@ -375,7 +375,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - if (timr->it.real.interval != 0) + if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { @@ -384,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * we will not get a call back to restart it AND * it should be restarted. */ - if (timr->it.real.interval != 0) { + if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* @@ -413,13 +413,13 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { ktime_t kj = NSEC_PER_SEC / HZ; - if (timr->it.real.interval < kj) + if (timr->it_interval < kj) now = ktime_add(now, kj); } #endif timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, - timr->it.real.interval); + timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; } @@ -631,7 +631,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) memset(cur_setting, 0, sizeof(*cur_setting)); - iv = timr->it.real.interval; + iv = timr->it_interval; /* interval timer ? */ if (iv) @@ -732,7 +732,7 @@ common_timer_set(struct k_itimer *timr, int flags, common_timer_get(timr, old_setting); /* disable the timer */ - timr->it.real.interval = 0; + timr->it_interval = 0; /* * careful here. If smp we could be in the "fire" routine which will * be spinning as we hold the lock. But this is ONLY an SMP issue. @@ -755,7 +755,7 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); + timr->it_interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -820,7 +820,7 @@ retry: static int common_timer_del(struct k_itimer *timer) { - timer->it.real.interval = 0; + timer->it_interval = 0; if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; -- cgit v1.3-14-g43fede From d97bb75ddd2f38068df01da8abf26df78756253c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:44 +0200 Subject: posix-timers: Store k_clock pointer in k_itimer Having the k_clock pointer in the k_itimer struct avoids the lookup in several code pathes and makes the next steps of unification of the hrtimer and alarmtimer based posix timers simpler. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.641222072@linutronix.de --- include/linux/posix-timers.h | 2 ++ kernel/time/posix-cpu-timers.c | 2 ++ kernel/time/posix-timers.c | 7 ++++--- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 908048f488ae..8f9cca390cdb 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -56,6 +56,7 @@ struct cpu_timer_list { * @list: List head for binding the timer to signals->posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer + * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_overrun: The overrun counter for pending signals @@ -75,6 +76,7 @@ struct k_itimer { struct list_head list; struct hlist_node t_hash; spinlock_t it_lock; + const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; int it_overrun; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1683e503179e..0123ece6851b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -324,6 +324,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) return -EINVAL; + new_timer->kclock = &clock_posix_cpu; + INIT_LIST_HEAD(&new_timer->it.cpu.entry); rcu_read_lock(); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 7dd992cc7105..eb007e19811d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -519,6 +519,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, it_id_set = IT_ID_SET; new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; + new_timer->kclock = kc; new_timer->it_overrun = -1; if (timer_event_spec) { @@ -679,7 +680,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else @@ -798,7 +799,7 @@ retry: if (!timr) return -EINVAL; - kc = clockid_to_kclock(timr->it_clock); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else @@ -829,7 +830,7 @@ static int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - const struct k_clock *kc = clockid_to_kclock(timer->it_clock); + const struct k_clock *kc = timer->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; -- cgit v1.3-14-g43fede From 30802945893bc944b5971b408b37511a03b54e5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:45 +0200 Subject: posix-timers: Add timer_rearm() callback Add a timer_rearm() callback which is used to make the rescheduling of posix interval timers independent of the underlying clock implementation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.732632167@linutronix.de --- kernel/time/posix-timers.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ad2dbd29b389..02ffd1b9d230 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -1,21 +1,24 @@ #define TIMER_RETRY 1 struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_set) (const clockid_t which_clock, - const struct timespec64 *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj) (const clockid_t which_clock, struct timex *tx); - int (*timer_create) (struct k_itimer *timer); - int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); - long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting); - int (*timer_del) (struct k_itimer *timr); - void (*timer_get) (struct k_itimer *timr, - struct itimerspec64 *cur_setting); + int (*clock_getres)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_set)(const clockid_t which_clock, + const struct timespec64 *tp); + int (*clock_get)(const clockid_t which_clock, + struct timespec64 *tp); + int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*timer_create)(struct k_itimer *timer); + int (*nsleep)(const clockid_t which_clock, int flags, + struct timespec64 *, struct timespec __user *); + long (*nsleep_restart)(struct restart_block *restart_block); + int (*timer_set)(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del)(struct k_itimer *timr); + void (*timer_get)(struct k_itimer *timr, + struct itimerspec64 *cur_setting); + void (*timer_rearm)(struct k_itimer *timr); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.3-14-g43fede From 96fe3b072f134e4993f829d599eaa1e0eb5a10e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:46 +0200 Subject: posix-timers: Rename do_schedule_next_timer That function is a misnomer. Rename it with a proper prefix to posixtimer_rearm(). Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.811362578@linutronix.de --- include/linux/posix-timers.h | 2 +- kernel/signal.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 8f9cca390cdb..771e5f788c90 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -112,6 +112,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block); void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); -void do_schedule_next_timer(struct siginfo *info); +void posixtimer_rearm(struct siginfo *info); #endif diff --git a/kernel/signal.c b/kernel/signal.c index 1f85c843be8e..d031cd24f8a9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -630,7 +630,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * about to disable them again anyway. */ spin_unlock(&tsk->sighand->siglock); - do_schedule_next_timer(info); + posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0123ece6851b..1ba576d3151a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -981,7 +981,7 @@ static void check_process_timers(struct task_struct *tsk, } /* - * This is called from the signal code (via do_schedule_next_timer) + * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. */ void posix_cpu_timer_schedule(struct k_itimer *timer) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index eb007e19811d..036b7e70c65c 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -305,7 +305,7 @@ static void schedule_next_timer(struct k_itimer *timr) * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ -void do_schedule_next_timer(struct siginfo *info) +void posixtimer_rearm(struct siginfo *info) { struct k_itimer *timr; unsigned long flags; @@ -336,12 +336,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private) int shared, ret = -1; /* * FIXME: if ->sigq is queued we can race with - * dequeue_signal()->do_schedule_next_timer(). + * dequeue_signal()->posixtimer_rearm(). * * If dequeue_signal() sees the "right" value of - * si_sys_private it calls do_schedule_next_timer(). + * si_sys_private it calls posixtimer_rearm(). * We re-queue ->sigq and drop ->it_lock(). - * do_schedule_next_timer() locks the timer + * posixtimer_rearm() locks the timer * and re-schedules it while ->sigq is pending. * Not really bad, but not that we want. */ @@ -701,7 +701,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, * accumulating overruns on the next timer. The overrun is frozen when * the signal is delivered, either at the notify time (if the info block * is not queued) or at the actual delivery time (as we are informed by - * the call back to do_schedule_next_timer(). So all we need to do is + * the call back to posixtimer_rearm(). So all we need to do is * to pick up the frozen overrun. */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) -- cgit v1.3-14-g43fede From f37fb0aa4f453c7c785bbcecc4991ac48c5c0e51 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:47 +0200 Subject: posix-timers: Use timer_rearm() callback in posixtimer_rearm() Use the new timer_rearm() callback to replace the conditional hardcoded calls into the hrtimer and cpu timer code. This allows later to bring the same logic to alarmtimers. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.889661919@linutronix.de --- kernel/time/posix-cpu-timers.c | 7 +++++-- kernel/time/posix-timers.c | 12 ++++++------ kernel/time/posix-timers.h | 2 -- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1ba576d3151a..96c833a61ade 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -15,6 +15,8 @@ #include "posix-timers.h" +static void posix_cpu_timer_rearm(struct k_itimer *timer); + /* * Called after updating RLIMIT_CPU to run cpu timer and update * tsk->signal->cputime_expires expiration cache if necessary. Needs @@ -528,7 +530,7 @@ static void cpu_timer_fire(struct k_itimer *timer) * reload the timer. But we need to keep it * ticking in case the signal is deliverable next time. */ - posix_cpu_timer_schedule(timer); + posix_cpu_timer_rearm(timer); ++timer->it_requeue_pending; } } @@ -984,7 +986,7 @@ static void check_process_timers(struct task_struct *tsk, * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. */ -void posix_cpu_timer_schedule(struct k_itimer *timer) +static void posix_cpu_timer_rearm(struct k_itimer *timer) { struct sighand_struct *sighand; unsigned long flags; @@ -1431,6 +1433,7 @@ const struct k_clock clock_posix_cpu = { .timer_set = posix_cpu_timer_set, .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 036b7e70c65c..b12582a4b122 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -278,10 +278,9 @@ static __init int init_posix_timers(void) NULL); return 0; } - __initcall(init_posix_timers); -static void schedule_next_timer(struct k_itimer *timr) +static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; @@ -315,10 +314,7 @@ void posixtimer_rearm(struct siginfo *info) return; if (timr->it_requeue_pending == info->si_sys_private) { - if (timr->it_clock < 0) - posix_cpu_timer_schedule(timr); - else - schedule_next_timer(timr); + timr->kclock->timer_rearm(timr); timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; @@ -1046,6 +1042,7 @@ static const struct k_clock clock_realtime = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_monotonic = { @@ -1057,6 +1054,7 @@ static const struct k_clock clock_monotonic = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_monotonic_raw = { @@ -1083,6 +1081,7 @@ static const struct k_clock clock_tai = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock clock_boottime = { @@ -1094,6 +1093,7 @@ static const struct k_clock clock_boottime = { .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, }; static const struct k_clock * const posix_clocks[] = { diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 02ffd1b9d230..1f6f6f9a6a37 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -28,5 +28,3 @@ extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; int posix_timer_event(struct k_itimer *timr, int si_private); - -void posix_cpu_timer_schedule(struct k_itimer *timer); -- cgit v1.3-14-g43fede From 21e55c1f83880a56360287c00f2b5cd5e5a4a912 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:48 +0200 Subject: posix-timers: Add active flag to k_itimer Keep track of the activation state of posix timers. This is a preparatory change for making common_timer_get() usable by both hrtimer and alarm timer implementations. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211656.967783982@linutronix.de --- include/linux/posix-timers.h | 2 ++ kernel/time/posix-timers.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 771e5f788c90..667095dbcd37 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -59,6 +59,7 @@ struct cpu_timer_list { * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer + * @it_active: Marker that timer is active * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_requeue_pending: Indicator that timer waits for being requeued on @@ -79,6 +80,7 @@ struct k_itimer { const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; + int it_active; int it_overrun; int it_overrun_last; int it_requeue_pending; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b12582a4b122..795215bba73d 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -316,6 +316,7 @@ void posixtimer_rearm(struct siginfo *info) if (timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); + timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1; ++timr->it_requeue_pending; @@ -371,6 +372,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); + timr->it_active = 0; if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; @@ -418,6 +420,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; + timr->it_active = 1; } } @@ -737,7 +740,8 @@ common_timer_set(struct k_itimer *timr, int flags, if (hrtimer_try_to_cancel(timer) < 0) return TIMER_RETRY; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + timr->it_active = 0; + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; @@ -763,6 +767,7 @@ common_timer_set(struct k_itimer *timr, int flags, return 0; } + timr->it_active = 1; hrtimer_start_expires(timer, mode); return 0; } @@ -821,6 +826,7 @@ static int common_timer_del(struct k_itimer *timer) if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) return TIMER_RETRY; + timer->it_active = 0; return 0; } -- cgit v1.3-14-g43fede From 63841b2a6969501de183efafc14d20175e402804 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:49 +0200 Subject: posix-timers: Add forward/remaining callbacks Add two callbacks to kclock which allow using common_)timer_get() for both hrtimer and alarm timer based clocks. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.044915536@linutronix.de --- kernel/time/posix-timers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 1f6f6f9a6a37..3bc5b74c342f 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -19,6 +19,8 @@ struct k_clock { void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); void (*timer_rearm)(struct k_itimer *timr); + int (*timer_forward)(struct k_itimer *timr, ktime_t now); + ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.3-14-g43fede From 91d57bae08689199c8acc77a8b3b41150cafab1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:50 +0200 Subject: posix-timers: Make use of forward/remaining callbacks Replace the hrtimer calls by calls to the new forward/remaining kclock callbacks and move the hrtimer specific implementation into the corresponding callback functions. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.121437232@linutronix.de --- kernel/time/posix-timers.c | 64 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 795215bba73d..48f6c37ae5df 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -607,6 +607,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) return NULL; } +static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return __hrtimer_expires_remaining_adjusted(timer, now); +} + +static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) +{ + struct hrtimer *timer = &timr->it.real.timer; + + return (int)hrtimer_forward(timer, now, timr->it_interval); +} + /* * Get the time remaining on a POSIX.1b interval timer. This function * is ALWAYS called with spin_lock_irq on the timer, thus it must not @@ -626,42 +640,54 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) static void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { + const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct hrtimer *timer = &timr->it.real.timer; + struct timespec64 ts64; + bool sig_none; memset(cur_setting, 0, sizeof(*cur_setting)); + sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ - if (iv) + if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) - return; + } else if (!timr->it_active) { + /* + * SIGEV_NONE oneshot timers are never queued. Check them + * below. + */ + if (!sig_none) + return; + } - now = timer->base->get_time(); + /* + * The timespec64 based conversion is suboptimal, but it's not + * worth to implement yet another callback. + */ + kc->clock_get(timr->it_clock, &ts64); + now = timespec64_to_ktime(ts64); /* - * When a requeue is pending or this is a SIGEV_NONE - * timer move the expiry time forward by intervals, so - * expiry is > now. + * When a requeue is pending or this is a SIGEV_NONE timer move the + * expiry time forward by intervals, so expiry is > now. */ - if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) - timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) + timr->it_overrun += kc->timer_forward(timr, now); - remaining = __hrtimer_expires_remaining_adjusted(timer, now); + remaining = kc->timer_remaining(timr, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when * it is expired ! */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + if (!sig_none) cur_setting->it_value.tv_nsec = 1; - } else + } else { cur_setting->it_value = ktime_to_timespec64(remaining); + } } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -1049,6 +1075,8 @@ static const struct k_clock clock_realtime = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_monotonic = { @@ -1061,6 +1089,8 @@ static const struct k_clock clock_monotonic = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_monotonic_raw = { @@ -1088,6 +1118,8 @@ static const struct k_clock clock_tai = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock clock_boottime = { @@ -1100,6 +1132,8 @@ static const struct k_clock clock_boottime = { .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining= common_hrtimer_remaining, }; static const struct k_clock * const posix_clocks[] = { -- cgit v1.3-14-g43fede From eabdec04385376d560078992710104cc7be2ce1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:51 +0200 Subject: posix-timers: Zero settings value in common code Zero out the settings struct in the common code so the callbacks do not have to do it themself. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.200870713@linutronix.de --- kernel/time/posix-cpu-timers.c | 5 +---- kernel/time/posix-timers.c | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 96c833a61ade..cb4a4eb44279 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -719,10 +719,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp */ itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); - if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + if (!timer->it.cpu.expires) return; - } /* * Sample the clock to take the difference with the expiry time. @@ -746,7 +744,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 48f6c37ae5df..0332f7a60fd6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -645,8 +645,6 @@ common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - memset(cur_setting, 0, sizeof(*cur_setting)); - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; iv = timr->it_interval; @@ -705,6 +703,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; + memset(&cur_setting64, 0, sizeof(cur_setting64)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; -- cgit v1.3-14-g43fede From 525b8ed91671e29e187dfe02d408b11190ccf494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:52 +0200 Subject: posix-timers: Add cancel/arm callbacks Add timer_try_to_cancel() and timer_arm() callbacks to kclock which allow to make common_timer_set() usable by both hrtimer and alarmtimer based clocks. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.278022962@linutronix.de --- kernel/time/posix-timers.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 3bc5b74c342f..b0ad77e18886 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -21,6 +21,9 @@ struct k_clock { void (*timer_rearm)(struct k_itimer *timr); int (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); + int (*timer_try_to_cancel)(struct k_itimer *timr); + void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none); }; extern const struct k_clock clock_posix_cpu; -- cgit v1.3-14-g43fede From eae1c4ae275fe3e024454c012a548ee0d700f54c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:53 +0200 Subject: posix-timers: Make use of cancel/arm callbacks Replace the hrtimer calls by calls to the new try_to_cancel()/arm() kclock callbacks and move the hrtimer specific implementation into the corresponding callback functions. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.355396667@linutronix.de --- kernel/time/posix-timers.c | 181 +++++++++++++++++++++++++-------------------- 1 file changed, 100 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0332f7a60fd6..8acc9ee2c2d6 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -744,25 +744,49 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return overrun; } +static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + if (!absolute) + expires = ktime_add_safe(expires, timer->base->get_time()); + hrtimer_set_expires(timer, expires); + + if (!sigev_none) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); +} + +static int common_hrtimer_try_to_cancel(struct k_itimer *timr) +{ + return hrtimer_try_to_cancel(&timr->it.real.timer); +} + /* Set a POSIX.1b interval timer. */ -/* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { - struct hrtimer *timer = &timr->it.real.timer; - enum hrtimer_mode mode; + const struct k_clock *kc = timr->kclock; + bool sigev_none; + ktime_t expires; if (old_setting) common_timer_get(timr, old_setting); - /* disable the timer */ + /* Prevent rearming by clearing the interval */ timr->it_interval = 0; /* - * careful here. If smp we could be in the "fire" routine which will - * be spinning as we hold the lock. But this is ONLY an SMP issue. + * Careful here. On SMP systems the timer expiry function could be + * active and spinning on timr->it_lock. */ - if (hrtimer_try_to_cancel(timer) < 0) + if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; timr->it_active = 0; @@ -770,30 +794,16 @@ common_timer_set(struct k_itimer *timr, int flags, ~REQUEUE_PENDING; timr->it_overrun_last = 0; - /* switch off the timer when it_value is zero */ + /* Switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; - - hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); - - /* Convert interval */ timr->it_interval = timespec64_to_ktime(new_setting->it_interval); + expires = timespec64_to_ktime(new_setting->it_value); + sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; - /* SIGEV_NONE timers are not queued ! See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { - /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) { - hrtimer_add_expires(timer, timer->base->get_time()); - } - return 0; - } - - timr->it_active = 1; - hrtimer_start_expires(timer, mode); + kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); + timr->it_active = !sigev_none; return 0; } @@ -847,9 +857,10 @@ retry: static int common_timer_del(struct k_itimer *timer) { - timer->it_interval = 0; + const struct k_clock *kc = timer->kclock; - if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + timer->it_interval = 0; + if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_active = 0; return 0; @@ -1063,76 +1074,84 @@ long clock_nanosleep_restart(struct restart_block *restart_block) } static const struct k_clock clock_realtime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, - .clock_set = posix_clock_realtime_set, - .clock_adj = posix_clock_realtime_adj, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { - .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, - .timer_create = common_timer_create, - .timer_set = common_timer_set, - .timer_get = common_timer_get, - .timer_del = common_timer_del, - .timer_rearm = common_hrtimer_rearm, - .timer_forward = common_hrtimer_forward, - .timer_remaining= common_hrtimer_remaining, + .clock_getres = posix_get_hrtimer_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + .timer_rearm = common_hrtimer_rearm, + .timer_forward = common_hrtimer_forward, + .timer_remaining = common_hrtimer_remaining, + .timer_try_to_cancel = common_hrtimer_try_to_cancel, + .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { -- cgit v1.3-14-g43fede From b3db80f77a95a45dbb2136f7b2a364dc797ea914 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:54 +0200 Subject: alarmtimer: Implement timer_rearm() callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.434598989@linutronix.de --- kernel/time/alarmtimer.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5b8cf4b61854..be85e3cbfe1b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -536,6 +536,18 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, return result; } +/** + * alarm_timer_rearm - Posix timer callback for rearming timer + * @timr: Pointer to the posixtimer data struct + */ +static void alarm_timer_rearm(struct k_itimer *timr) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); + alarm_start(alarm, alarm->node.expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -594,7 +606,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) /** * alarm_timer_get - posix timer_get interface - * @new_timer: k_itimer pointer + * @timr: k_itimer pointer * @cur_setting: itimerspec data to fill * * Copies out the current itimerspec data @@ -863,6 +875,7 @@ const struct k_clock alarm_clock = { .timer_set = alarm_timer_set, .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, + .timer_rearm = alarm_timer_rearm, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.3-14-g43fede From e7561f1633ac735df48c55ad09a2530e9ab9fab1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:55 +0200 Subject: alarmtimer: Implement forward callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.513694229@linutronix.de --- kernel/time/alarmtimer.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index be85e3cbfe1b..6082cf1af876 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -548,6 +548,18 @@ static void alarm_timer_rearm(struct k_itimer *timr) alarm_start(alarm, alarm->node.expires); } +/** + * alarm_timer_forward - Posix timer callback for forwarding timer + * @timr: Pointer to the posixtimer data struct + * @now: Current time to forward the timer against + */ +static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return (int) alarm_forward(alarm, timr->it_interval, now); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -876,6 +888,7 @@ const struct k_clock alarm_clock = { .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.3-14-g43fede From d653d8457c76da11f047af1f66256ac9b8421b69 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:56 +0200 Subject: alarmtimer: Implement remaining callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.592676753@linutronix.de --- kernel/time/alarmtimer.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6082cf1af876..02ddc40f19fe 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -560,6 +560,18 @@ static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) return (int) alarm_forward(alarm, timr->it_interval, now); } +/** + * alarm_timer_remaining - Posix timer callback to retrieve remaining time + * @timr: Pointer to the posixtimer data struct + * @now: Current time to calculate against + */ +static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + + return ktime_sub(now, alarm->node.expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -881,15 +893,16 @@ out: } const struct k_clock alarm_clock = { - .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, - .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, - .timer_rearm = alarm_timer_rearm, - .timer_forward = alarm_timer_forward, - .nsleep = alarm_timer_nsleep, + .clock_getres = alarm_clock_getres, + .clock_get = alarm_clock_get, + .timer_create = alarm_timer_create, + .timer_set = alarm_timer_set, + .timer_del = alarm_timer_del, + .timer_get = alarm_timer_get, + .timer_rearm = alarm_timer_rearm, + .timer_forward = alarm_timer_forward, + .timer_remaining = alarm_timer_remaining, + .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.3-14-g43fede From e344c9e76bc6af997926171bfd90d25bbae0a2c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:57 +0200 Subject: alarmtimer: Implement try_to_cancel callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.670026824@linutronix.de --- kernel/time/alarmtimer.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 02ddc40f19fe..374bd855a488 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -572,6 +572,15 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) return ktime_sub(now, alarm->node.expires); } +/** + * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer + * @timr: Pointer to the posixtimer data struct + */ +static int alarm_timer_try_to_cancel(struct k_itimer *timr) +{ + return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -902,6 +911,7 @@ const struct k_clock alarm_clock = { .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, + .timer_try_to_cancel = alarm_timer_try_to_cancel, .nsleep = alarm_timer_nsleep, }; #endif /* CONFIG_POSIX_TIMERS */ -- cgit v1.3-14-g43fede From b3bf6f369d50ece9dec6338741648005d95c19e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:58 +0200 Subject: alarmtimer: Implement arm callback Preparatory change to utilize the common posix timer mechanisms. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.747567162@linutronix.de --- kernel/time/alarmtimer.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 374bd855a488..c618a44bb054 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -581,6 +581,27 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr) return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); } +/** + * alarm_timer_arm - Posix timer callback to arm a timer + * @timr: Pointer to the posixtimer data struct + * @expires: The new expiry time + * @absolute: Expiry value is absolute time + * @sigev_none: Posix timer does not deliver signals + */ +static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, + bool absolute, bool sigev_none) +{ + struct alarm *alarm = &timr->it.alarm.alarmtimer; + struct alarm_base *base = &alarm_bases[alarm->type]; + + if (!absolute) + expires = ktime_add_safe(expires, base->gettime()); + if (sigev_none) + alarm->node.expires = expires; + else + alarm_start(&timr->it.alarm.alarmtimer, expires); +} + /** * alarm_clock_getres - posix getres interface * @which_clock: clockid @@ -908,6 +929,7 @@ const struct k_clock alarm_clock = { .timer_set = alarm_timer_set, .timer_del = alarm_timer_del, .timer_get = alarm_timer_get, + .timer_arm = alarm_timer_arm, .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, .timer_remaining = alarm_timer_remaining, -- cgit v1.3-14-g43fede From f2c45807d3992fe0f173f34af9c347d907c31686 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 May 2017 23:15:59 +0200 Subject: alarmtimer: Switch over to generic set/get/rearm routine All required callbacks are in place. Switch the alarm timer based posix interval timer callbacks to the common implementation and remove the incorrect private implementation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170530211657.825471962@linutronix.de --- kernel/time/alarmtimer.c | 121 +++++++-------------------------------------- kernel/time/posix-timers.c | 12 ++--- kernel/time/posix-timers.h | 6 +++ 3 files changed, 29 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c618a44bb054..d8a7a7e214de 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -515,20 +515,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { - unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, - it.alarm.alarmtimer); + it.alarm.alarmtimer); enum alarmtimer_restart result = ALARMTIMER_NORESTART; + unsigned long flags; + int si_private = 0; spin_lock_irqsave(&ptr->it_lock, flags); - if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0)) - ptr->it_overrun++; - } - /* Re-add periodic timers */ - if (ptr->it_interval) { - ptr->it_overrun += alarm_forward(alarm, now, ptr->it_interval); + ptr->it_active = 0; + if (ptr->it_interval) + si_private = ++ptr->it_requeue_pending; + + if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + /* + * Handle ignored signals and rearm the timer. This will go + * away once we handle ignored signals proper. + */ + ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); + ++ptr->it_requeue_pending; + ptr->it_active = 1; result = ALARMTIMER_RESTART; } spin_unlock_irqrestore(&ptr->it_lock, flags); @@ -658,97 +664,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return 0; } -/** - * alarm_timer_get - posix timer_get interface - * @timr: k_itimer pointer - * @cur_setting: itimerspec data to fill - * - * Copies out the current itimerspec data - */ -static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec64 *cur_setting) -{ - ktime_t relative_expiry_time = - alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - - if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); - } else { - cur_setting->it_value.tv_sec = 0; - cur_setting->it_value.tv_nsec = 0; - } - - cur_setting->it_interval = ktime_to_timespec64(timr->it_interval); -} - -/** - * alarm_timer_del - posix timer_del interface - * @timr: k_itimer pointer to be deleted - * - * Cancels any programmed alarms for the given timer. - */ -static int alarm_timer_del(struct k_itimer *timr) -{ - if (!rtcdev) - return -ENOTSUPP; - - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - return 0; -} - -/** - * alarm_timer_set - posix timer_set interface - * @timr: k_itimer pointer to be deleted - * @flags: timer flags - * @new_setting: itimerspec to be used - * @old_setting: itimerspec being replaced - * - * Sets the timer to new_setting, and starts the timer. - */ -static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) -{ - ktime_t exp; - - if (!rtcdev) - return -ENOTSUPP; - - if (flags & ~TIMER_ABSTIME) - return -EINVAL; - - if (old_setting) - alarm_timer_get(timr, old_setting); - - /* If the timer was already set, cancel it */ - if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) - return TIMER_RETRY; - - /* start the timer */ - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); - - /* - * Rate limit to the tick as a hot fix to prevent DOS. Will be - * mopped up later. - */ - if (timr->it_interval < TICK_NSEC) - timr->it_interval = TICK_NSEC; - - exp = timespec64_to_ktime(new_setting->it_value); - /* Convert (if necessary) to absolute time */ - if (flags != TIMER_ABSTIME) { - ktime_t now; - - now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add_safe(now, exp); - } - - alarm_start(&timr->it.alarm.alarmtimer, exp); - return 0; -} - /** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired @@ -926,9 +841,9 @@ const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, .clock_get = alarm_clock_get, .timer_create = alarm_timer_create, - .timer_set = alarm_timer_set, - .timer_del = alarm_timer_del, - .timer_get = alarm_timer_get, + .timer_set = common_timer_set, + .timer_del = common_timer_del, + .timer_get = common_timer_get, .timer_arm = alarm_timer_arm, .timer_rearm = alarm_timer_rearm, .timer_forward = alarm_timer_forward, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 8acc9ee2c2d6..6e7a70b1bf37 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -637,8 +637,7 @@ static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) * it is the same as a requeue pending timer WRT to what we should * report. */ -static void -common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; @@ -768,10 +767,9 @@ static int common_hrtimer_try_to_cancel(struct k_itimer *timr) } /* Set a POSIX.1b interval timer. */ -static int -common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec64 *new_setting, - struct itimerspec64 *old_setting) +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { const struct k_clock *kc = timr->kclock; bool sigev_none; @@ -855,7 +853,7 @@ retry: return error; } -static int common_timer_del(struct k_itimer *timer) +int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index b0ad77e18886..b086f5ba2f5b 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -33,3 +33,9 @@ extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; int posix_timer_event(struct k_itimer *timr, int si_private); + +void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); +int common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); +int common_timer_del(struct k_itimer *timer); -- cgit v1.3-14-g43fede From f91840a32deef5cb1bf73338bc5010f843b01426 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Jun 2017 21:03:52 -0700 Subject: perf, bpf: Add BPF support to all perf_event types Allow BPF_PROG_TYPE_PERF_EVENT program types to attach to all perf_event types, including HW_CACHE, RAW, and dynamic pmu events. Only tracepoint/kprobe events are treated differently which require BPF_PROG_TYPE_TRACEPOINT/BPF_PROG_TYPE_KPROBE program types accordingly. Also add support for reading all event counters using bpf_perf_event_read() helper. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/perf_event.h | 7 +++++-- kernel/bpf/arraymap.c | 28 +++++++-------------------- kernel/events/core.c | 47 +++++++++++++++++++++++++++------------------- kernel/trace/bpf_trace.c | 22 ++++++++-------------- 4 files changed, 48 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 24a635887f28..8fc5f0fada5e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -896,7 +896,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -extern u64 perf_event_read_local(struct perf_event *event); +int perf_event_read_local(struct perf_event *event, u64 *value); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1301,7 +1301,10 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline u64 perf_event_read_local(struct perf_event *event) { return -EINVAL; } +static inline int perf_event_read_local(struct perf_event *event, u64 *value) +{ + return -EINVAL; +} static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 172dc8ee0e3b..ecb43542246e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -452,38 +452,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) static void *perf_event_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - const struct perf_event_attr *attr; struct bpf_event_entry *ee; struct perf_event *event; struct file *perf_file; + u64 value; perf_file = perf_event_get(fd); if (IS_ERR(perf_file)) return perf_file; + ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - ee = ERR_PTR(-EINVAL); - - attr = perf_event_attrs(event); - if (IS_ERR(attr) || attr->inherit) + if (perf_event_read_local(event, &value) == -EOPNOTSUPP) goto err_out; - switch (attr->type) { - case PERF_TYPE_SOFTWARE: - if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) - goto err_out; - /* fall-through */ - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - ee = bpf_event_entry_gen(perf_file, map_file); - if (ee) - return ee; - ee = ERR_PTR(-ENOMEM); - /* fall-through */ - default: - break; - } - + ee = bpf_event_entry_gen(perf_file, map_file); + if (ee) + return ee; + ee = ERR_PTR(-ENOMEM); err_out: fput(perf_file); return ee; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..51e40e4876c0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3636,10 +3636,10 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -u64 perf_event_read_local(struct perf_event *event) +int perf_event_read_local(struct perf_event *event, u64 *value) { unsigned long flags; - u64 val; + int ret = 0; /* * Disabling interrupts avoids all counter scheduling (context @@ -3647,25 +3647,37 @@ u64 perf_event_read_local(struct perf_event *event) */ local_irq_save(flags); - /* If this is a per-task event, it must be for current */ - WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && - event->hw.target != current); - - /* If this is a per-CPU event, it must be for this CPU */ - WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()); - /* * It must not be an event with inherit set, we cannot read * all child counters from atomic context. */ - WARN_ON_ONCE(event->attr.inherit); + if (event->attr.inherit) { + ret = -EOPNOTSUPP; + goto out; + } /* * It must not have a pmu::count method, those are not * NMI safe. */ - WARN_ON_ONCE(event->pmu->count); + if (event->pmu->count) { + ret = -EOPNOTSUPP; + goto out; + } + + /* If this is a per-task event, it must be for current */ + if ((event->attach_state & PERF_ATTACH_TASK) && + event->hw.target != current) { + ret = -EINVAL; + goto out; + } + + /* If this is a per-CPU event, it must be for this CPU */ + if (!(event->attach_state & PERF_ATTACH_TASK) && + event->cpu != smp_processor_id()) { + ret = -EINVAL; + goto out; + } /* * If the event is currently on this CPU, its either a per-task event, @@ -3675,10 +3687,11 @@ u64 perf_event_read_local(struct perf_event *event) if (event->oncpu == smp_processor_id()) event->pmu->read(event); - val = local64_read(&event->count); + *value = local64_read(&event->count); +out: local_irq_restore(flags); - return val; + return ret; } static int perf_event_read(struct perf_event *event, bool group) @@ -8037,12 +8050,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) bool is_kprobe, is_tracepoint; struct bpf_prog *prog; - if (event->attr.type == PERF_TYPE_HARDWARE || - event->attr.type == PERF_TYPE_SOFTWARE) - return perf_event_set_bpf_handler(event, prog_fd); - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -EINVAL; + return perf_event_set_bpf_handler(event, prog_fd); if (event->tp_event->prog) return -EEXIST; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 460a031c77e5..08eb072430b9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -234,7 +234,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - struct perf_event *event; + u64 value = 0; + int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -247,21 +248,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - event = ee->event; - if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && - event->attr.type != PERF_TYPE_RAW)) - return -EINVAL; - - /* make sure event is local and doesn't have pmu::count */ - if (unlikely(event->oncpu != cpu || event->pmu->count)) - return -EINVAL; - + err = perf_event_read_local(ee->event, &value); /* - * we don't know if the function is run successfully by the - * return value. It can be judged in other places, such as - * eBPF programs. + * this api is ugly since we miss [-22..-2] range of valid + * counter values, but that's uapi */ - return perf_event_read_local(event); + if (err) + return err; + return value; } static const struct bpf_func_proto bpf_perf_event_read_proto = { -- cgit v1.3-14-g43fede From f99973e18b65ca1fff8c81532e3132b8f622aea8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jun 2017 16:47:09 +0200 Subject: nohz: Fix buggy tick delay on IRQ storms When the tick is stopped and we reach the dynticks evaluation code on IRQ exit, we perform a soft tick restart if we observe an expired timer from there. It means we program the nearest possible tick but we stay in dynticks mode (ts->tick_stopped = 1) because we may need to stop the tick again after that expired timer is handled. Now this solution works most of the time but if we suffer an IRQ storm and those interrupts trigger faster than the hardware clockevents min delay, our tick won't fire until that IRQ storm is finished. Here is the problem: on IRQ exit we reprog the timer to at least NOW() + min_clockevents_delay. Another IRQ fires before the tick so we reschedule again to NOW() + min_clockevents_delay, etc... The tick is eternally rescheduled min_clockevents_delay ahead. A solution is to simply remove this soft tick restart. After all the normal dynticks evaluation path can handle 0 delay just fine. And by doing that we benefit from the optimization branch which avoids clock reprogramming if the clockevents deadline hasn't changed since the last reprog. This fixes our issue because we don't do repetitive clock reprog that always add hardware min delay. As a side effect it should even optimize the 0 delay path in general. Reported-and-tested-by: Octavian Purdila Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496328429-13317-1-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e3043873fcdc..9d31f1e0067b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -713,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { - tick = 0; - /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): @@ -724,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ - if (!ts->tick_stopped) - goto out; - - /* - * If, OTOH, we did stop it, but there's a pending (expired) - * timer reprogram the timer hardware to fire now. - * - * We will not restart the tick proper, just prod the timer - * hardware into firing an interrupt to process the pending - * timers. Just like tick_irq_exit() will not restart the tick - * for 'normal' interrupts. - * - * Only once we exit the idle loop will we re-enable the tick, - * see tick_nohz_idle_exit(). - */ - if (delta == 0) { - tick_nohz_restart(ts, now); + if (!ts->tick_stopped) { + tick = 0; goto out; } } -- cgit v1.3-14-g43fede From 680895d6efe47332d25e49817d2d6781295c1614 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 May 2017 09:51:09 +0200 Subject: sysctl: switch to use uuid_t Signed-off-by: Christoph Hellwig Reviewed-by: Amir Goldstein Reviewed-by: Andy Shevchenko --- kernel/sysctl_binary.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..939a158eab11 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { char buf[UUID_STRING_LEN + 1]; - uuid_be uuid; + uuid_t uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; result = -EIO; - if (uuid_be_to_bin(buf, &uuid)) + if (uuid_parse(buf, &uuid)) goto out; if (oldlen > 16) -- cgit v1.3-14-g43fede From dc4bb0e2356149aee4cdae061936f3bbdd45595c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:46 -0700 Subject: bpf: Introduce bpf_prog ID This patch generates an unique ID for each BPF_PROG_LOAD-ed prog. It is worth to note that each BPF_PROG_LOAD-ed prog will have a different ID even they have the same bpf instructions. The ID is generated by the existing idr_alloc_cyclic(). The ID is ranged from [1, INT_MAX). It is allocated in cyclic manner, so an ID will get reused every 2 billion BPF_PROG_LOAD. The bpf_prog_alloc_id() is done after bpf_prog_select_runtime() because the jit process may have allocated a new prog. Hence, we need to ensure the value of pointer 'prog' will not be changed any more before storing the prog to the prog_idr. After bpf_prog_select_runtime(), the prog is read-only. Hence, the id is stored in 'struct bpf_prog_aux'. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fcc80ca11045..c5946d19f2ca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -172,6 +172,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 stack_depth; + u32 id; struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_verifier_ops *ops; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 59da103adb85..2a1b32b470f1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -22,8 +22,11 @@ #include #include #include +#include DEFINE_PER_CPU(int, bpf_prog_active); +static DEFINE_IDR(prog_idr); +static DEFINE_SPINLOCK(prog_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -650,6 +653,34 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) free_uid(user); } +static int bpf_prog_alloc_id(struct bpf_prog *prog) +{ + int id; + + spin_lock_bh(&prog_idr_lock); + id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + prog->aux->id = id; + spin_unlock_bh(&prog_idr_lock); + + /* id is in [1, INT_MAX) */ + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_prog_free_id(struct bpf_prog *prog) +{ + /* cBPF to eBPF migrations are currently not in the idr store. */ + if (!prog->aux->id) + return; + + spin_lock_bh(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); + spin_unlock_bh(&prog_idr_lock); +} + static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); @@ -663,6 +694,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -857,15 +889,21 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; + err = bpf_prog_alloc_id(prog); + if (err) + goto free_used_maps; + err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ - goto free_used_maps; + goto free_id; bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; +free_id: + bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: -- cgit v1.3-14-g43fede From f3f1c054c288bb6e503005e6d73611151ed20e91 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:47 -0700 Subject: bpf: Introduce bpf_map ID This patch generates an unique ID for each created bpf_map. The approach is similar to the earlier patch for bpf_prog ID. It is worth to note that the bpf_map's ID and bpf_prog's ID are in two independent ID spaces and both have the same valid range: [1, INT_MAX). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c5946d19f2ca..c32bace66d3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,7 @@ struct bpf_map { u32 max_entries; u32 map_flags; u32 pages; + u32 id; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2a1b32b470f1..4c3075b5d840 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,8 @@ DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); +static DEFINE_IDR(map_idr); +static DEFINE_SPINLOCK(map_idr_lock); int sysctl_unprivileged_bpf_disabled __read_mostly; @@ -117,6 +119,29 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) free_uid(user); } +static int bpf_map_alloc_id(struct bpf_map *map) +{ + int id; + + spin_lock_bh(&map_idr_lock); + id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + map->id = id; + spin_unlock_bh(&map_idr_lock); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void bpf_map_free_id(struct bpf_map *map) +{ + spin_lock_bh(&map_idr_lock); + idr_remove(&map_idr, map->id); + spin_unlock_bh(&map_idr_lock); +} + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -141,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -239,14 +265,20 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; + err = bpf_map_alloc_id(map); + if (err) + goto free_map; + err = bpf_map_new_fd(map); if (err < 0) /* failed to allocate fd */ - goto free_map; + goto free_id; trace_bpf_map_create(map, err); return err; +free_id: + bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: -- cgit v1.3-14-g43fede From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:48 -0700 Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID to allow userspace to iterate all bpf_prog IDs and bpf_map IDs. The API is trying to be consistent with the existing BPF_MAP_GET_NEXT_KEY. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e78aece03628..629747a3f273 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,6 +82,8 @@ enum bpf_cmd { BPF_PROG_ATTACH, BPF_PROG_DETACH, BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, }; enum bpf_map_type { @@ -209,6 +211,11 @@ union bpf_attr { __u32 repeat; __u32 duration; } test; + + struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ + __u32 start_id; + __u32 next_id; + }; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4c3075b5d840..2405feedb8c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map) void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { + /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); @@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); + /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); @@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr, return ret; } +#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id + +static int bpf_obj_get_next_id(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct idr *idr, + spinlock_t *lock) +{ + u32 next_id = attr->start_id; + int err = 0; + + if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + next_id++; + spin_lock_bh(lock); + if (!idr_get_next(idr, &next_id)) + err = -ENOENT; + spin_unlock_bh(lock); + + if (!err) + err = put_user(next_id, &uattr->next_id); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); break; + case BPF_PROG_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &prog_idr, &prog_idr_lock); + break; + case BPF_MAP_GET_NEXT_ID: + err = bpf_obj_get_next_id(&attr, uattr, + &map_idr, &map_idr_lock); + break; default: err = -EINVAL; break; -- cgit v1.3-14-g43fede From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:49 -0700 Subject: bpf: Add BPF_PROG_GET_FD_BY_ID Add BPF_PROG_GET_FD_BY_ID command to allow user to get a fd from a bpf_prog's ID. bpf_prog_inc_not_zero() is added and is called with prog_idr_lock held. __bpf_prog_put() is also added which has the 'bool do_idr_lock' param to decide if the prog_idr_lock should be acquired when freeing the prog->id. In the error path of bpf_prog_inc_not_zero(), it may have to call __bpf_prog_put(map, false) which does not need to take the prog_idr_lock when freeing the prog->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 +++-- kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 629747a3f273..d70cfed19d5e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,6 +84,7 @@ enum bpf_cmd { BPF_PROG_TEST_RUN, BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, }; enum bpf_map_type { @@ -212,8 +213,11 @@ union bpf_attr { __u32 duration; } test; - struct { /* anonymous struct used by BPF_*_GET_NEXT_ID */ - __u32 start_id; + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + }; __u32 next_id; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2405feedb8c1..dc6253bb8ebb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog) +static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { /* cBPF to eBPF migrations are currently not in the idr store. */ if (!prog->aux->id) return; - spin_lock_bh(&prog_idr_lock); + if (do_idr_lock) + spin_lock_bh(&prog_idr_lock); + else + __acquire(&prog_idr_lock); + idr_remove(&prog_idr, prog->aux->id); - spin_unlock_bh(&prog_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&prog_idr_lock); + else + __release(&prog_idr_lock); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) bpf_prog_free(aux->prog); } -void bpf_prog_put(struct bpf_prog *prog) +static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ - bpf_prog_free_id(prog); + bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } + +void bpf_prog_put(struct bpf_prog *prog) +{ + __bpf_prog_put(prog, true); +} EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) @@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc); +/* prog_idr_lock should have been held */ +static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +{ + int refold; + + refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_prog_put(prog, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + return prog; +} + static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { struct fd f = fdget(ufd); @@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; err = bpf_prog_new_fd(prog); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_prog_put() is needed because the above + * bpf_prog_alloc_id() has published the prog + * to the userspace and the userspace may + * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. + */ + bpf_prog_put(prog); + return err; + } bpf_prog_kallsyms_add(prog); trace_bpf_prog_load(prog, err); return err; -free_id: - bpf_prog_free_id(prog); free_used_maps: free_used_maps(prog->aux); free_prog: @@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id + +static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_prog *prog; + u32 id = attr->prog_id; + int fd; + + if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&prog_idr_lock); + prog = idr_find(&prog_idr, id); + if (prog) + prog = bpf_prog_inc_not_zero(prog); + else + prog = ERR_PTR(-ENOENT); + spin_unlock_bh(&prog_idr_lock); + + if (IS_ERR(prog)) + return PTR_ERR(prog); + + fd = bpf_prog_new_fd(prog); + if (fd < 0) + bpf_prog_put(prog); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_obj_get_next_id(&attr, uattr, &map_idr, &map_idr_lock); break; + case BPF_PROG_GET_FD_BY_ID: + err = bpf_prog_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.3-14-g43fede From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:50 -0700 Subject: bpf: Add BPF_MAP_GET_FD_BY_ID Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd from a bpf_map's ID. bpf_map_inc_not_zero() is added and is called with map_idr_lock held. __bpf_map_put() is also added which has the 'bool do_idr_lock' param to decide if the map_idr_lock should be acquired when freeing the map->id. In the error path of bpf_map_inc_not_zero(), it may have to call __bpf_map_put(map, false) which does not need to take the map_idr_lock when freeing the map->id. It is currently limited to CAP_SYS_ADMIN which we can consider to lift it in followup patches. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 + kernel/bpf/syscall.c | 95 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d70cfed19d5e..dd23f47ff00c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_cmd { BPF_PROG_GET_NEXT_ID, BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, }; enum bpf_map_type { @@ -217,6 +218,7 @@ union bpf_attr { union { __u32 start_id; __u32 prog_id; + __u32 map_id; }; __u32 next_id; }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dc6253bb8ebb..1802bb9c47d9 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map) +static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { - spin_lock_bh(&map_idr_lock); + if (do_idr_lock) + spin_lock_bh(&map_idr_lock); + else + __acquire(&map_idr_lock); + idr_remove(&map_idr, map->id); - spin_unlock_bh(&map_idr_lock); + + if (do_idr_lock) + spin_unlock_bh(&map_idr_lock); + else + __release(&map_idr_lock); } /* called from workqueue */ @@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map) /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ -void bpf_map_put(struct bpf_map *map) +static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) { if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ - bpf_map_free_id(map); + bpf_map_free_id(map, do_idr_lock); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } +void bpf_map_put(struct bpf_map *map) +{ + __bpf_map_put(map, true); +} + void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); @@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr) goto free_map; err = bpf_map_new_fd(map); - if (err < 0) - /* failed to allocate fd */ - goto free_id; + if (err < 0) { + /* failed to allocate fd. + * bpf_map_put() is needed because the above + * bpf_map_alloc_id() has published the map + * to the userspace and the userspace may + * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. + */ + bpf_map_put(map); + return err; + } trace_bpf_map_create(map, err); return err; -free_id: - bpf_map_free_id(map); free_map: bpf_map_uncharge_memlock(map); free_map_nouncharge: @@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } +/* map_idr_lock should have been held */ +static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, + bool uref) +{ + int refold; + + refold = __atomic_add_unless(&map->refcnt, 1, 0); + + if (refold >= BPF_MAX_REFCNT) { + __bpf_map_put(map, false); + return ERR_PTR(-EBUSY); + } + + if (!refold) + return ERR_PTR(-ENOENT); + + if (uref) + atomic_inc(&map->usercnt); + + return map; +} + int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id + +static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +{ + struct bpf_map *map; + u32 id = attr->map_id; + int fd; + + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock_bh(&map_idr_lock); + map = idr_find(&map_idr, id); + if (map) + map = bpf_map_inc_not_zero(map, true); + else + map = ERR_PTR(-ENOENT); + spin_unlock_bh(&map_idr_lock); + + if (IS_ERR(map)) + return PTR_ERR(map); + + fd = bpf_map_new_fd(map); + if (fd < 0) + bpf_map_put(map); + + return fd; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; + case BPF_MAP_GET_FD_BY_ID: + err = bpf_map_get_fd_by_id(&attr); + break; default: err = -EINVAL; break; -- cgit v1.3-14-g43fede From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 5 Jun 2017 12:15:52 -0700 Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info for both bpf_prog and bpf_map. The kernel can figure out the fd is associated with a bpf_prog or bpf_map. The suggested struct bpf_prog_info and struct bpf_map_info are not meant to be a complete list and it is not the goal of this patch. New fields can be added in the future patch. The focus of this patch is to create the interface, BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and bpf_map's info. The obj's info, which will be extended (and get bigger) over time, is separated from the bpf_attr to avoid bloating the bpf_attr. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 2 - include/uapi/linux/bpf.h | 28 ++++++++ kernel/bpf/syscall.c | 163 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 174 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e2dddf21f3b..1fa26dc562ce 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -69,8 +69,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -#define BPF_TAG_SIZE 8 - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dd23f47ff00c..9b2c10b45733 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_cmd { BPF_MAP_GET_NEXT_ID, BPF_PROG_GET_FD_BY_ID, BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, }; enum bpf_map_type { @@ -222,6 +223,12 @@ union bpf_attr { }; __u32 next_id; }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -686,4 +693,25 @@ struct xdp_md { __u32 data_end; }; +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1802bb9c47d9..8942c820d620 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static int check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) +{ + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + int err; + + if (actual_size <= expected_size) + return 0; + + addr = uaddr + expected_size; + end = uaddr + actual_size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + + return 0; +} + +static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_prog_info info = {}; + u32 info_len = attr->info.info_len; + char __user *uinsns; + u32 ulen; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + if (copy_from_user(&info, uinfo, info_len)) + return err; + + info.type = prog->type; + info.id = prog->aux->id; + + memcpy(info.tag, prog->tag, sizeof(prog->tag)); + + if (!capable(CAP_SYS_ADMIN)) { + info.jited_prog_len = 0; + info.xlated_prog_len = 0; + goto done; + } + + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } + + ulen = info.xlated_prog_len; + info.xlated_prog_len = bpf_prog_size(prog->len); + if (info.xlated_prog_len && ulen) { + uinsns = u64_to_user_ptr(info.xlated_prog_insns); + ulen = min_t(u32, info.xlated_prog_len, ulen); + if (copy_to_user(uinsns, prog->insnsi, ulen)) + return -EFAULT; + } + +done: + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +static int bpf_map_get_info_by_fd(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_map_info info = {}; + u32 info_len = attr->info.info_len; + int err; + + err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + if (err) + return err; + info_len = min_t(u32, sizeof(info), info_len); + + info.type = map->map_type; + info.id = map->id; + info.key_size = map->key_size; + info.value_size = map->value_size; + info.max_entries = map->max_entries; + info.map_flags = map->map_flags; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info + +static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ufd = attr->info.bpf_fd; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) + return -EINVAL; + + f = fdget(ufd); + if (!f.file) + return -EBADFD; + + if (f.file->f_op == &bpf_prog_fops) + err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + uattr); + else if (f.file->f_op == &bpf_map_fops) + err = bpf_map_get_info_by_fd(f.file->private_data, attr, + uattr); + else + err = -EINVAL; + + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz * user-space does not rely on any kernel feature * extensions we dont know about yet. */ - if (size > sizeof(attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - err = get_user(val, addr); - if (err) - return err; - if (val) - return -E2BIG; - } - size = sizeof(attr); - } + err = check_uarg_tail_zero(uattr, sizeof(attr), size); + if (err) + return err; + size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) @@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; + case BPF_OBJ_GET_INFO_BY_FD: + err = bpf_obj_get_info_by_fd(&attr, uattr); + break; default: err = -EINVAL; break; -- cgit v1.3-14-g43fede From 92046578ac88e0a93f8ef03240e6c832b0189aa7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 6 Jun 2017 18:38:04 +0200 Subject: bpf: cgroup skb progs cannot access ld_abs/ind Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB type to use ld_abs/ind instructions. However, at this point, we cannot use them, since offsets relative to SKF_LL_OFF will end up pointing skb_mac_header(skb) out of bounds since in the egress path it is not yet set at that point in time, but only after __dev_queue_xmit() did a general reset on the mac header. bpf_internal_load_pointer_neg_helper() will then end up reading data from a wrong offset. BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes() already to access packet data, which is also more flexible than the insns carried over from cBPF. Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Chenbo Feng Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acae64df255..14ccb0759fa4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2426,7 +2426,6 @@ static bool may_access_skb(enum bpf_prog_type type) case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: - case BPF_PROG_TYPE_CGROUP_SKB: return true; default: return false; -- cgit v1.3-14-g43fede From c3ca46ef719580eb01994fc6032db470fde92d85 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 23 May 2017 15:05:50 +0900 Subject: ftrace/kprobes: selftests: Check kretprobe maxactive is supported Check the kretprobe maxactive is supported by kprobe_events interface. To ensure the kernel feature, this changes ftrace README to describe it. Signed-off-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- kernel/trace/trace.c | 3 ++- tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..dc3f91e70345 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4473,7 +4473,8 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p|r[:[/]] []\n" + "\t Format: p[:[/]] []\n" + "\t r[maxactive][:[/]] []\n" "\t -:[/]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc index 57abdf1caabf..7ec6f2639ad6 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc @@ -2,6 +2,7 @@ # description: Kretprobe dynamic event with maxactive [ -f kprobe_events ] || exit_unsupported # this is configurable +grep -q 'r\[maxactive\]' README || exit_unsupported # this is older version echo > kprobe_events -- cgit v1.3-14-g43fede From ba5213ae6b88fb170c4771fef6553f759c7d8cdd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 May 2017 11:45:12 +0200 Subject: perf/core: Correct event creation with PERF_FORMAT_GROUP Andi was asking about PERF_FORMAT_GROUP vs inherited events, which led to the discovery of a bug from commit: 3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff") - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) is a clear fail :/ While this changes user visible behaviour; it was previously possible to create an inherited event with PERF_SAMPLE_READ; this is deemed acceptible because its results were always incorrect. Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 3dab77fb1bf8 ("perf: Rework/fix the whole read vs group stuff") Link: http://lkml.kernel.org/r/20170530094512.dy2nljns2uq7qa3j@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3de0b98c4414..407dad6cf89a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5729,9 +5729,6 @@ static void perf_output_read_one(struct perf_output_handle *handle, __output_copy(handle, values, n * sizeof(u64)); } -/* - * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. - */ static void perf_output_read_group(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -5776,6 +5773,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ PERF_FORMAT_TOTAL_TIME_RUNNING) +/* + * XXX PERF_SAMPLE_READ vs inherited events seems difficult. + * + * The problem is that its both hard and excessively expensive to iterate the + * child list, not to mention that its impossible to IPI the children running + * on another CPU, from interrupt/NMI context. + */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { @@ -9462,9 +9466,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited events + * We currently do not support PERF_SAMPLE_READ on inherited events. + * See perf_output_read(). */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) goto err_ns; if (!has_branch_stack(event)) -- cgit v1.3-14-g43fede From d0fabd1cb8b70073a0f44f1cf8b663b5e7241c74 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 23 May 2017 14:51:32 -0700 Subject: perf/core: Remove unused perf_cgroup_event_cgrp_time() function The function was added by commit e5d1367f17ba ("perf: Add cgroup support") in 2011 and hasn't been used since then. Removing it fixes the following warning when building with Clang: kernel/events/core.c:696:19: error: unused function 'perf_cgroup_event_cgrp_time' [-Werror,-Wunused-function] Signed-off-by: Matthias Kaehlcke Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Douglas Anderson Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170523215132.189049-1-mka@chromium.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 407dad6cf89a..bc63f8db1b0d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -925,11 +925,6 @@ static inline int is_cgroup_event(struct perf_event *event) return 0; } -static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) -{ - return 0; -} - static inline void update_cgrp_time_from_event(struct perf_event *event) { } -- cgit v1.3-14-g43fede From 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 May 2017 12:53:50 +0200 Subject: sched/core: Implement new approach to scale select_idle_cpu() Hackbench recently suffered a bunch of pain, first by commit: 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive") and then by commit: c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()") which fixed a bug in the initial for_each_cpu_wrap() implementation that made select_idle_cpu() even more expensive. The bug was that it would skip over CPUs when bits were consequtive in the bitmask. This however gave me an idea to fix select_idle_cpu(); where the old scheme was a cliff-edge throttle on idle scanning, this introduces a more gradual approach. Instead of stopping to scan entirely, we limit how many CPUs we scan. Initial benchmarks show that it mostly recovers hackbench while not hurting anything else, except Mason's schbench, but not as bad as the old thing. It also appears to recover the tbench high-end, which also suffered like hackbench. Tested-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Mason Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hpa@zytor.com Cc: kitsunyan Cc: linux-kernel@vger.kernel.org Cc: lvenanci@redhat.com Cc: riel@redhat.com Cc: xiaolong.ye@intel.com Link: http://lkml.kernel.org/r/20170517105350.hk5m4h4jb6dfr65a@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 ++++++++++++++++----- kernel/sched/features.h | 1 + 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47a0c552c77b..396bca9c7996 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5794,27 +5794,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct sched_domain *this_sd; - u64 avg_cost, avg_idle = this_rq()->avg_idle; + u64 avg_cost, avg_idle; u64 time, cost; s64 delta; - int cpu; + int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - avg_cost = this_sd->avg_scan_cost; - /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. */ - if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; + + if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) return -1; + if (sched_feat(SIS_PROP)) { + u64 span_avg = sd->span_weight * avg_idle; + if (span_avg > 4*avg_cost) + nr = div_u64(span_avg, avg_cost); + else + nr = 4; + } + time = local_clock(); for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + if (!--nr) + return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; if (idle_cpu(cpu)) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index dc4d1483b038..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ SCHED_FEAT(SIS_AVG_CPU, false) +SCHED_FEAT(SIS_PROP, true) /* * Issue a WARN when we do multiple update_rq_clock() calls -- cgit v1.3-14-g43fede From e36d8677bfa55054e4194ec3683189b882a538f6 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:28 +0200 Subject: sched/deadline: Track the active utilization Active utilization is defined as the total utilization of active (TASK_RUNNING) tasks queued on a runqueue. Hence, it is increased when a task wakes up and is decreased when a task blocks. When a task is migrated from CPUi to CPUj, immediately subtract the task's utilization from CPUi and add it to CPUj. This mechanism is implemented by modifying the pull and push functions. Note: this is not fully correct from the theoretical point of view (the utilization should be removed from CPUi only at the 0 lag time), a more theoretically sound solution is presented in the next patches. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Claudio Scordino Cc: Joel Fernandes Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-2-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 6 +++++ 2 files changed, 67 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index df6c2912bd60..b36ecc2b1b10 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static inline +void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw += dl_bw; + SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ +} + +static inline +void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->running_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->running_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + if (dl_rq->running_bw > old) + dl_rq->running_bw = 0; +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -83,6 +105,8 @@ void init_dl_rq(struct dl_rq *dl_rq) #else init_dl_bw(&dl_rq->dl_bw); #endif + + dl_rq->running_bw = 0; } #ifdef CONFIG_SMP @@ -946,10 +970,14 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) { + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + add_running_bw(dl_se->dl_bw, dl_rq); update_dl_entity(dl_se, pi_se); - else if (flags & ENQUEUE_REPLENISH) + } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } __enqueue_dl_entity(dl_se); } @@ -998,14 +1026,25 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) dl_check_constrained_dl(&p->dl); + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) + add_running_bw(p->dl.dl_bw, &rq->dl); + /* - * If p is throttled, we do nothing. In fact, if it exhausted + * If p is throttled, we do not enqueue it. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. + * However, the active utilization does not depend on the fact + * that the task is on the runqueue or not (but depends on the + * task's state - in GRUB parlance, "inactive" vs "active contending"). + * In other words, even if a task is throttled its utilization must + * be counted in the active utilization; hence, we need to call + * add_running_bw(). */ - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + add_running_bw(p->dl.dl_bw, &rq->dl); return; + } enqueue_dl_entity(&p->dl, pi_se, flags); @@ -1023,6 +1062,20 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); + + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) + sub_running_bw(p->dl.dl_bw, &rq->dl); + + /* + * This check allows to decrease the active utilization in two cases: + * when the task blocks and when it is terminating + * (p->state == TASK_DEAD). We can handle the two cases in the same + * way, because from GRUB's point of view the same thing is happening + * (the task moves from "active contending" to "active non contending" + * or "inactive") + */ + if (flags & DEQUEUE_SLEEP) + sub_running_bw(p->dl.dl_bw, &rq->dl); } /* @@ -1551,7 +1604,9 @@ retry: } deactivate_task(rq, next_task, 0); + sub_running_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1639,7 +1694,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + sub_running_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f8cf1d87f065..ee26867da339 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -558,6 +558,12 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* + * "Active utilization" for this runqueue: increased when a + * task wakes up (becomes TASK_RUNNING) and decreased when a + * task blocks + */ + u64 running_bw; }; #ifdef CONFIG_SMP -- cgit v1.3-14-g43fede From 209a0cbda7a01d2ea32a8b631d35e873bee498e9 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:29 +0200 Subject: sched/deadline: Improve the tracking of active utilization This patch implements a more theoretically sound algorithm for tracking active utilization: instead of decreasing it when a task blocks, use a timer (the "inactive timer", named after the "Inactive" task state of the GRUB algorithm) to decrease the active utilization at the so called "0-lag time". Tested-by: Claudio Scordino Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-3-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 +++ kernel/sched/core.c | 3 + kernel/sched/deadline.c | 269 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 2 + 4 files changed, 276 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1abaa3728bf7..f1ead2e88d3d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -445,16 +445,33 @@ struct sched_dl_entity { * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. + * + * @dl_non_contending tells if the task is inactive while still + * contributing to the active utilization. In other words, it + * indicates if the inactive timer has been armed and its handler + * has not been executed yet. This flag is useful to avoid race + * conditions between the inactive timer handler and the wakeup + * code. */ int dl_throttled; int dl_boosted; int dl_yielded; + int dl_non_contending; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; + + /* + * Inactive timer, responsible for decreasing the active utilization + * at the "0-lag time". When a -deadline task blocks, it contributes + * to GRUB's active utilization until the "0-lag time", hence a + * timer is needed to decrease the active utilization at the correct + * time. + */ + struct hrtimer inactive_timer; }; union rcu_special { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3e50cada84d..968c655ec5d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2153,6 +2153,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; } /* @@ -2184,6 +2185,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -2506,6 +2508,7 @@ static int dl_overflow(struct task_struct *p, int policy, !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); + dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { __dl_clear(dl_b, p->dl.dl_bw); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b36ecc2b1b10..6480a929417c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -65,6 +65,161 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw = 0; } +void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + if (task_on_rq_queued(p)) + return; + + if (!p->dl.dl_non_contending) + return; + + sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); +} + +/* + * The utilization of a task cannot be immediately removed from + * the rq active utilization (running_bw) when the task blocks. + * Instead, we have to wait for the so called "0-lag time". + * + * If a task blocks before the "0-lag time", a timer (the inactive + * timer) is armed, and running_bw is decreased when the timer + * fires. + * + * If the task wakes up again before the inactive timer fires, + * the timer is cancelled, whereas if the task wakes up after the + * inactive timer fired (and running_bw has been decreased) the + * task's utilization has to be added to running_bw again. + * A flag in the deadline scheduling entity (dl_non_contending) + * is used to avoid race conditions between the inactive timer handler + * and task wakeups. + * + * The following diagram shows how running_bw is updated. A task is + * "ACTIVE" when its utilization contributes to running_bw; an + * "ACTIVE contending" task is in the TASK_RUNNING state, while an + * "ACTIVE non contending" task is a blocked task for which the "0-lag time" + * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" + * time already passed, which does not contribute to running_bw anymore. + * +------------------+ + * wakeup | ACTIVE | + * +------------------>+ contending | + * | add_running_bw | | + * | +----+------+------+ + * | | ^ + * | dequeue | | + * +--------+-------+ | | + * | | t >= 0-lag | | wakeup + * | INACTIVE |<---------------+ | + * | | sub_running_bw | | + * +--------+-------+ | | + * ^ | | + * | t < 0-lag | | + * | | | + * | V | + * | +----+------+------+ + * | sub_running_bw | ACTIVE | + * +-------------------+ | + * inactive timer | non contending | + * fired +------------------+ + * + * The task_non_contending() function is invoked when a task + * blocks, and checks if the 0-lag time already passed or + * not (in the first case, it directly updates running_bw; + * in the second case, it arms the inactive timer). + * + * The task_contending() function is invoked when a task wakes + * up, and checks if the task is still in the "ACTIVE non contending" + * state or not (in the second case, it updates running_bw). + */ +static void task_non_contending(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + struct hrtimer *timer = &dl_se->inactive_timer; + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + s64 zerolag_time; + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); + WARN_ON(dl_se->dl_non_contending); + + zerolag_time = dl_se->deadline - + div64_long((dl_se->runtime * dl_se->dl_period), + dl_se->dl_runtime); + + /* + * Using relative times instead of the absolute "0-lag time" + * allows to simplify the code + */ + zerolag_time -= rq_clock(rq); + + /* + * If the "0-lag time" already passed, decrease the active + * utilization now, instead of starting a timer + */ + if (zerolag_time < 0) { + if (dl_task(p)) + sub_running_bw(dl_se->dl_bw, dl_rq); + if (!dl_task(p) || p->state == TASK_DEAD) + __dl_clear_params(p); + + return; + } + + dl_se->dl_non_contending = 1; + get_task_struct(p); + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); +} + +static void task_contending(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + /* + * If this is a non-deadline task that has been boosted, + * do nothing + */ + if (dl_se->dl_runtime == 0) + return; + + if (dl_se->dl_non_contending) { + dl_se->dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) + put_task_struct(dl_task_of(dl_se)); + } else { + /* + * Since "dl_non_contending" is not set, the + * task's utilization has already been removed from + * active utilization (either when the task blocked, + * when the "inactive timer" fired). + * So, add it back. + */ + add_running_bw(dl_se->dl_bw, dl_rq); + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -617,10 +772,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_from_dl()). */ - if (!dl_task(p)) { - __dl_clear_params(p); + if (!dl_task(p)) goto unlock; - } /* * The task might have been boosted by someone else and might be in the @@ -839,6 +992,49 @@ throttle: } } +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + inactive_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + + if (!dl_task(p) || p->state == TASK_DEAD) { + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + dl_se->dl_non_contending = 0; + } + __dl_clear_params(p); + + goto unlock; + } + if (dl_se->dl_non_contending == 0) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + + sub_running_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_non_contending = 0; +unlock: + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->inactive_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = inactive_task_timer; +} + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -971,9 +1167,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * we want a replenishment of its runtime. */ if (flags & ENQUEUE_WAKEUP) { - struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - - add_running_bw(dl_se->dl_bw, dl_rq); + task_contending(dl_se); update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); @@ -1042,7 +1236,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * add_running_bw(). */ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { - add_running_bw(p->dl.dl_bw, &rq->dl); + if (flags & ENQUEUE_WAKEUP) + task_contending(&p->dl); + return; } @@ -1067,7 +1263,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) sub_running_bw(p->dl.dl_bw, &rq->dl); /* - * This check allows to decrease the active utilization in two cases: + * This check allows to start the inactive timer (or to immediately + * decrease the active utilization, if needed) in two cases: * when the task blocks and when it is terminating * (p->state == TASK_DEAD). We can handle the two cases in the same * way, because from GRUB's point of view the same thing is happening @@ -1075,7 +1272,7 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - sub_running_bw(p->dl.dl_bw, &rq->dl); + task_non_contending(p); } /* @@ -1153,6 +1350,35 @@ out: return cpu; } +static void migrate_task_rq_dl(struct task_struct *p) +{ + struct rq *rq; + + if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending)) + return; + + rq = task_rq(p); + /* + * Since p->state == TASK_WAKING, set_task_cpu() has been called + * from try_to_wake_up(). Hence, p->pi_lock is locked, but + * rq->lock is not... So, lock it + */ + raw_spin_lock(&rq->lock); + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + + raw_spin_unlock(&rq->lock); +} + static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) { /* @@ -1794,13 +2020,23 @@ void __init init_sched_dl_class(void) static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* - * Start the deadline timer; if we switch back to dl before this we'll - * continue consuming our current CBS slice. If we stay outside of - * SCHED_DEADLINE until the deadline passes, the timer will reset the - * task. + * task_non_contending() can start the "inactive timer" (if the 0-lag + * time is in the future). If the task switches back to dl before + * the "inactive timer" fires, it can continue to consume its current + * runtime using its current deadline. If it stays outside of + * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() + * will reset the task parameters. */ - if (!start_dl_timer(p)) - __dl_clear_params(p); + if (task_on_rq_queued(p) && p->dl.dl_runtime) + task_non_contending(p); + + /* + * We cannot use inactive_task_timer() to invoke sub_running_bw() + * at the 0-lag time, because the task could have been migrated + * while SCHED_OTHER in the meanwhile. + */ + if (p->dl.dl_non_contending) + p->dl.dl_non_contending = 0; /* * Since this might be the only -deadline task on the rq, @@ -1819,6 +2055,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) @@ -1893,6 +2131,7 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ee26867da339..c58f38905e0a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -244,6 +244,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } +void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); #ifdef CONFIG_CGROUP_SCHED @@ -1493,6 +1494,7 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); -- cgit v1.3-14-g43fede From 387e31300b5760169e6d3f7a9e1eeed12cc5a30b Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:30 +0200 Subject: sched/deadline: Fix the update of the total -deadline utilization Now that the inactive timer can be armed to fire at the 0-lag time, it is possible to use inactive_task_timer() to update the total -deadline utilization (dl_b->total_bw) at the correct time, fixing dl_overflow() and __setparam_dl(). Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-4-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 ++++++++++++++------------------------ kernel/sched/deadline.c | 28 +++++++++++++--------------- 2 files changed, 27 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 968c655ec5d9..126339daebd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2475,9 +2475,6 @@ static inline int dl_bw_cpus(int i) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. - * - * XXX we should delay bw change until the task's 0-lag point, see - * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2502,16 +2499,29 @@ static int dl_overflow(struct task_struct *p, int policy, cpus = dl_bw_cpus(task_cpu(p)); if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ __dl_clear(dl_b, p->dl.dl_bw); __dl_add(dl_b, new_bw); dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - __dl_clear(dl_b, p->dl.dl_bw); + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ err = 0; } raw_spin_unlock(&dl_b->lock); @@ -4020,26 +4030,6 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - - /* - * Changing the parameters of a task is 'tricky' and we're not doing - * the correct thing -- also see task_dead_dl() and switched_from_dl(). - * - * What we SHOULD do is delay the bandwidth release until the 0-lag - * point. This would include retaining the task_struct until that time - * and change dl_overflow() to not immediately decrement the current - * amount. - * - * Instead we retain the current runtime/deadline and let the new - * parameters take effect after the current reservation period lapses. - * This is safe (albeit pessimistic) because the 0-lag point is always - * before the current scheduling deadline. - * - * We can still have temporary overloads because we do not delay the - * change in bandwidth until that time; so admission control is - * not on the safe side. It does however guarantee tasks will never - * consume more than promised. - */ } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6480a929417c..add9cba1253c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -175,8 +175,14 @@ static void task_non_contending(struct task_struct *p) if (zerolag_time < 0) { if (dl_task(p)) sub_running_bw(dl_se->dl_bw, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) + if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw); __dl_clear_params(p); + raw_spin_unlock(&dl_b->lock); + } return; } @@ -1004,10 +1010,16 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &rf); if (!dl_task(p) || p->state == TASK_DEAD) { + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (p->state == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } + + raw_spin_lock(&dl_b->lock); + __dl_clear(dl_b, p->dl.dl_bw); + raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); goto unlock; @@ -1534,19 +1546,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void task_dead_dl(struct task_struct *p) -{ - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - - /* - * Since we are TASK_DEAD we won't slip out of the domain! - */ - raw_spin_lock_irq(&dl_b->lock); - /* XXX we should retain the bw until 0-lag */ - dl_b->total_bw -= p->dl.dl_bw; - raw_spin_unlock_irq(&dl_b->lock); -} - static void set_curr_task_dl(struct rq *rq) { struct task_struct *p = rq->curr; @@ -2141,7 +2140,6 @@ const struct sched_class dl_sched_class = { .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, - .task_dead = task_dead_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, -- cgit v1.3-14-g43fede From c52f14d384628db0217a7a9080ab800d5ffb2d72 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:31 +0200 Subject: sched/deadline: Implement GRUB accounting According to the GRUB (Greedy Reclaimation of Unused Bandwidth) reclaiming algorithm, the runtime is not decreased as "dq = -dt", but as "dq = -Uact dt" (where Uact is the per-runqueue active utilization). Hence, this commit modifies the runtime accounting rule in update_curr_dl() to implement the GRUB rule. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-5-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/deadline.c | 17 +++++++++++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 126339daebd7..b68a1fa05244 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2423,7 +2423,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 20; + return BW_UNIT; /* * Doing this here saves a lot of checks in all @@ -2433,7 +2433,7 @@ unsigned long to_ratio(u64 period, u64 runtime) if (period == 0) return 0; - return div64_u64(runtime << 20, period); + return div64_u64(runtime << BW_SHIFT, period); } #ifdef CONFIG_SMP diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index add9cba1253c..0bee537554f6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -917,6 +917,22 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); +/* + * This function implements the GRUB accounting rule: + * according to the GRUB reclaiming algorithm, the runtime is + * not decreased as "dq = -dt", but as "dq = -Uact dt", where + * Uact is the (per-runqueue) active utilization. + * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result + * has to be shifted right by BW_SHIFT. + */ +u64 grub_reclaim(u64 delta, struct rq *rq) +{ + delta *= rq->dl.running_bw; + delta >>= BW_SHIFT; + + return delta; +} + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -959,6 +975,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); + delta_exec = grub_reclaim(delta_exec, rq); dl_se->runtime -= delta_exec; throttle: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c58f38905e0a..bb409ef40120 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1496,6 +1496,8 @@ extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +#define BW_SHIFT 20 +#define BW_UNIT (1 << BW_SHIFT) unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.3-14-g43fede From 4da3abcefe178c650033f371e94fa10e80bce167 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:32 +0200 Subject: sched/deadline: Do not reclaim the whole CPU bandwidth Original GRUB tends to reclaim 100% of the CPU time... And this allows a CPU hog to starve non-deadline tasks. To address this issue, allow the scheduler to reclaim only a specified fraction of CPU time, stored in the new "bw_ratio" field of the dl runqueue structure. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-6-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++++ kernel/sched/deadline.c | 12 +++++++++++- kernel/sched/sched.h | 8 ++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b68a1fa05244..7abd06400a98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6759,6 +6759,16 @@ static int sched_dl_global_validate(void) return ret; } +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + } +} + static void sched_dl_do_global(void) { u64 new_bw = -1; @@ -6784,6 +6794,7 @@ static void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0bee537554f6..6a0614b9c8d7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -268,6 +268,7 @@ void init_dl_rq(struct dl_rq *dl_rq) #endif dl_rq->running_bw = 0; + init_dl_rq_bw_ratio(dl_rq); } #ifdef CONFIG_SMP @@ -924,11 +925,20 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * Uact is the (per-runqueue) active utilization. * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result * has to be shifted right by BW_SHIFT. + * To reclaim only a fraction Umax of the CPU time, the + * runtime accounting rule is modified as + * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains + * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted + * right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. */ u64 grub_reclaim(u64 delta, struct rq *rq) { delta *= rq->dl.running_bw; - delta >>= BW_SHIFT; + delta *= rq->dl.bw_ratio; + delta >>= BW_SHIFT + RATIO_SHIFT; return delta; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb409ef40120..878fe757d6ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -565,6 +565,12 @@ struct dl_rq { * task blocks */ u64 running_bw; + + /* + * Inverse of the fraction of CPU utilization that can be reclaimed + * by the GRUB algorithm. + */ + u64 bw_ratio; }; #ifdef CONFIG_SMP @@ -1495,9 +1501,11 @@ extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); +extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) +#define RATIO_SHIFT 8 unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -- cgit v1.3-14-g43fede From 2d4283e9d583a3ee8cfb1cbb9c1270614df4c29d Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:33 +0200 Subject: sched/deadline: Make GRUB a task's flag This patch introduces the SCHED_FLAG_RECLAIM flag to specify that a DL task is allowed to reclaim unused CPU time (using the GRUB algorithm). Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-7-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 1 + kernel/sched/core.c | 3 ++- kernel/sched/deadline.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 5f0fe019a720..e2a6c7b3510b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -47,5 +47,6 @@ * For the sched_{set,get}attr() calls */ #define SCHED_FLAG_RESET_ON_FORK 0x01 +#define SCHED_FLAG_RECLAIM 0x02 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7abd06400a98..8d1a5a625814 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4195,7 +4195,8 @@ recheck: return -EINVAL; } - if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + if (attr->sched_flags & + ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) return -EINVAL; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6a0614b9c8d7..61ea3039cdc1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -985,7 +985,8 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - delta_exec = grub_reclaim(delta_exec, rq); + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) + delta_exec = grub_reclaim(delta_exec, rq); dl_se->runtime -= delta_exec; throttle: -- cgit v1.3-14-g43fede From 8fd27231c3302e0c7e1907df1252db97b65eb241 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:34 +0200 Subject: sched/deadline: Track the "total rq utilization" too The total rq utilization is defined as the sum of the utilisations of tasks that are "assigned" to a runqueue, independently from their state (TASK_RUNNING or blocked) Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Claudio Scordino Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-8-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++-------------- kernel/sched/sched.h | 11 +++++ 2 files changed, 95 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 61ea3039cdc1..6c6a1f099d61 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -51,6 +51,7 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -65,25 +66,52 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw = 0; } +static inline +void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw += dl_bw; + SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ +} + +static inline +void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +{ + u64 old = dl_rq->this_bw; + + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + dl_rq->this_bw -= dl_bw; + SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + if (dl_rq->this_bw > old) + dl_rq->this_bw = 0; + SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { - if (task_on_rq_queued(p)) - return; + struct rq *rq; - if (!p->dl.dl_non_contending) + if (task_on_rq_queued(p)) return; - sub_running_bw(p->dl.dl_bw, &task_rq(p)->dl); - p->dl.dl_non_contending = 0; - /* - * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() - * will see that dl_not_contending is not set, and - * will not touch the rq's active utilization, - * so we are still safe. - */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + rq = task_rq(p); + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(new_bw, &rq->dl); } /* @@ -178,6 +206,8 @@ static void task_non_contending(struct task_struct *p) if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + if (p->state == TASK_DEAD) + sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_clear(dl_b, p->dl.dl_bw); __dl_clear_params(p); @@ -192,7 +222,7 @@ static void task_non_contending(struct task_struct *p) hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); } -static void task_contending(struct sched_dl_entity *dl_se) +static void task_contending(struct sched_dl_entity *dl_se, int flags) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); @@ -203,6 +233,9 @@ static void task_contending(struct sched_dl_entity *dl_se) if (dl_se->dl_runtime == 0) return; + if (flags & ENQUEUE_MIGRATED) + add_rq_bw(dl_se->dl_bw, dl_rq); + if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; /* @@ -268,6 +301,7 @@ void init_dl_rq(struct dl_rq *dl_rq) #endif dl_rq->running_bw = 0; + dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } @@ -1042,6 +1076,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) if (p->state == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1207,7 +1242,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * we want a replenishment of its runtime. */ if (flags & ENQUEUE_WAKEUP) { - task_contending(dl_se); + task_contending(dl_se, flags); update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); @@ -1260,8 +1295,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) dl_check_constrained_dl(&p->dl); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { + add_rq_bw(p->dl.dl_bw, &rq->dl); add_running_bw(p->dl.dl_bw, &rq->dl); + } /* * If p is throttled, we do not enqueue it. In fact, if it exhausted @@ -1277,7 +1314,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) - task_contending(&p->dl); + task_contending(&p->dl, flags); return; } @@ -1299,8 +1336,10 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) + if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(p->dl.dl_bw, &rq->dl); + } /* * This check allows to start the inactive timer (or to immediately @@ -1394,7 +1433,7 @@ static void migrate_task_rq_dl(struct task_struct *p) { struct rq *rq; - if (!(p->state == TASK_WAKING) || !(p->dl.dl_non_contending)) + if (p->state != TASK_WAKING) return; rq = task_rq(p); @@ -1404,18 +1443,20 @@ static void migrate_task_rq_dl(struct task_struct *p) * rq->lock is not... So, lock it */ raw_spin_lock(&rq->lock); - sub_running_bw(p->dl.dl_bw, &rq->dl); - p->dl.dl_non_contending = 0; - /* - * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() - * will see that dl_not_contending is not set, and - * will not touch the rq's active utilization, - * so we are still safe. - */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); - + if (p->dl.dl_non_contending) { + sub_running_bw(p->dl.dl_bw, &rq->dl); + p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be cancelled, inactive_task_timer() + * will see that dl_not_contending is not set, and + * will not touch the rq's active utilization, + * so we are still safe. + */ + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) + put_task_struct(p); + } + sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -1858,7 +1899,9 @@ retry: deactivate_task(rq, next_task, 0); sub_running_bw(next_task->dl.dl_bw, &rq->dl); + sub_rq_bw(next_task->dl.dl_bw, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); add_running_bw(next_task->dl.dl_bw, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1948,7 +1991,9 @@ static void pull_dl_task(struct rq *this_rq) deactivate_task(src_rq, p, 0); sub_running_bw(p->dl.dl_bw, &src_rq->dl); + sub_rq_bw(p->dl.dl_bw, &src_rq->dl); set_task_cpu(p, this_cpu); + add_rq_bw(p->dl.dl_bw, &this_rq->dl); add_running_bw(p->dl.dl_bw, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2057,6 +2102,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && p->dl.dl_runtime) task_non_contending(p); + if (!task_on_rq_queued(p)) + sub_rq_bw(p->dl.dl_bw, &rq->dl); + /* * We cannot use inactive_task_timer() to invoke sub_running_bw() * at the 0-lag time, because the task could have been migrated @@ -2086,9 +2134,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) put_task_struct(p); /* If p is not queued we will update its parameters at next wakeup. */ - if (!task_on_rq_queued(p)) - return; + if (!task_on_rq_queued(p)) { + add_rq_bw(p->dl.dl_bw, &rq->dl); + return; + } /* * If p is boosted we already updated its params in * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 878fe757d6ad..b7321dac03c1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -566,6 +566,17 @@ struct dl_rq { */ u64 running_bw; + /* + * Utilization of the tasks "assigned" to this runqueue (including + * the tasks that are in runqueue and the tasks that executed on this + * CPU and blocked). Increased when a task moves to this runqueue, and + * decreased when the task moves away (migrates, changes scheduling + * policy, or terminates). + * This is needed to compute the "inactive utilization" for the + * runqueue (inactive utilization = this_bw - running_bw). + */ + u64 this_bw; + /* * Inverse of the fraction of CPU utilization that can be reclaimed * by the GRUB algorithm. -- cgit v1.3-14-g43fede From 9f0d1a5077399143aad7e1244bb031e29116074e Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:35 +0200 Subject: sched/deadline: Base GRUB reclaiming on the inactive utilization Instead of decreasing the runtime as "dq = -Uact dt" (eventually divided by the maximum utilization available for deadline tasks), decrease it as "dq = -max{u, (1 - Uinact)} dt", where u is the task utilization and Uinact is the "inactive utilization". In this way, the maximum fraction of CPU time that can be reclaimed is given by the total utilization of deadline tasks. This approach solves a fairness issue with "traditional" global GRUB reclaiming: using the traditional GRUB algorithm, if tasks are allocated to the various cores in a non-uniform way, the reclaiming mechanism allows some tasks to reclaim more time than others. This issue is visible starting 11 time-consuming tasks with runtime 10ms and period 30ms (total utilization 3.666) on a 4-cores system: some tasks will receive much more than the reserved runtime (thanks to the reclaiming mechanism), while other tasks will receive less than the reserved runtime. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-9-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6c6a1f099d61..7d2f05778060 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -955,26 +955,30 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as "dq = -Uact dt", where - * Uact is the (per-runqueue) active utilization. - * Since rq->dl.running_bw contains Uact * 2^BW_SHIFT, the result - * has to be shifted right by BW_SHIFT. - * To reclaim only a fraction Umax of the CPU time, the - * runtime accounting rule is modified as - * "dq = -Uact / Umax dt"; since rq->dl.bw_ratio contains - * 2^RATIO_SHIFT / Umax, delta is multiplied by bw_ratio and shifted - * right by RATIO_SHIFT. - * Since delta is a 64 bit variable, to have an overflow its value - * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. - * So, overflow is not an issue here. + * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt", + * where u is the utilization of the task and Uinact is the + * (per-runqueue) inactive utilization, computed as the difference + * between the "total runqueue utilization" and the runqueue + * active utilization. + * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations + * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. */ -u64 grub_reclaim(u64 delta, struct rq *rq) +u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { - delta *= rq->dl.running_bw; - delta *= rq->dl.bw_ratio; - delta >>= BW_SHIFT + RATIO_SHIFT; + u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ + u64 u_act; - return delta; + /* + * Instead of computing max{u, (1 - u_inact)}, we compare + * u_inact with 1 - u, because u_inact can be larger than 1 + * (so, 1 - u_inact would be negative leading to wrong results) + */ + if (u_inact > BW_UNIT - dl_se->dl_bw) + u_act = dl_se->dl_bw; + else + u_act = BW_UNIT - u_inact; + + return (delta * u_act) >> BW_SHIFT; } /* @@ -1020,7 +1024,7 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq); + delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); dl_se->runtime -= delta_exec; throttle: -- cgit v1.3-14-g43fede From daec5798367012951cdb54fdb5c006e4379c9ae9 Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Thu, 18 May 2017 22:13:36 +0200 Subject: sched/deadline: Reclaim bandwidth not used by dl tasks This commit introduces a per-runqueue "extra utilization" that can be reclaimed by deadline tasks. In this way, the maximum fraction of CPU time that can reclaimed by deadline tasks is fixed (and configurable) and does not depend on the total deadline utilization. The GRUB accounting rule is modified to add this "extra utilization" to the inactive utilization of the runqueue, and to avoid reclaiming more than a maximum fraction of the CPU time. Tested-by: Daniel Bristot de Oliveira Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Claudio Scordino Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/1495138417-6203-10-git-send-email-luca.abeni@santannapisa.it Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 17 ++++++++++------- kernel/sched/deadline.c | 42 +++++++++++++++++++++++++++--------------- kernel/sched/sched.h | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d1a5a625814..799647927c4c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2444,7 +2444,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->rd->dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; @@ -2462,7 +2462,7 @@ inline struct dl_bw *dl_bw_of(int i) return &cpu_rq(i)->dl.dl_bw; } -static inline int dl_bw_cpus(int i) +inline int dl_bw_cpus(int i) { return 1; } @@ -2500,8 +2500,8 @@ static int dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { @@ -2512,8 +2512,8 @@ static int dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. */ - __dl_clear(dl_b, p->dl.dl_bw); - __dl_add(dl_b, new_bw); + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; } else if (!dl_policy(policy) && task_has_dl_policy(p)) { @@ -5515,7 +5515,7 @@ int task_can_attach(struct task_struct *p, * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ - __dl_add(dl_b, p->dl.dl_bw); + __dl_add(dl_b, p->dl.dl_bw, cpus); } raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); @@ -6764,9 +6764,12 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; } else { dl_rq->bw_ratio = to_ratio(global_rt_runtime(), global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7d2f05778060..e3b25dfb74f3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -209,7 +209,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -955,28 +955,40 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is - * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt", - * where u is the utilization of the task and Uinact is the - * (per-runqueue) inactive utilization, computed as the difference - * between the "total runqueue utilization" and the runqueue - * active utilization. + * not decreased as "dq = -dt", but as + * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", + * where u is the utilization of the task, Umax is the maximum reclaimable + * utilization, Uinact is the (per-runqueue) inactive utilization, computed + * as the difference between the "total runqueue utilization" and the + * runqueue active utilization, and Uextra is the (per runqueue) extra + * reclaimable utilization. * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations - * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT. + * multiplied by 2^BW_SHIFT, the result has to be shifted right by + * BW_SHIFT. + * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. + * Since delta is a 64 bit variable, to have an overflow its value + * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. + * So, overflow is not an issue here. */ u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; + u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; /* - * Instead of computing max{u, (1 - u_inact)}, we compare - * u_inact with 1 - u, because u_inact can be larger than 1 - * (so, 1 - u_inact would be negative leading to wrong results) + * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, + * we compare u_inact + rq->dl.extra_bw with + * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because + * u_inact + rq->dl.extra_bw can be larger than + * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative + * leading to wrong results) */ - if (u_inact > BW_UNIT - dl_se->dl_bw) - u_act = dl_se->dl_bw; + if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) + u_act = u_act_min; else - u_act = BW_UNIT - u_inact; + u_act = BW_UNIT - u_inact - rq->dl.extra_bw; return (delta * u_act) >> BW_SHIFT; } @@ -1085,7 +1097,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw); + __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -2054,7 +2066,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw); + __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b7321dac03c1..f1e400c6403c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void) } extern struct dl_bw *dl_bw_of(int i); +extern int dl_bw_cpus(int i); struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; }; +static inline void __dl_update(struct dl_bw *dl_b, s64 bw); + static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); } static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) { dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); } static inline @@ -576,6 +581,7 @@ struct dl_rq { * runqueue (inactive utilization = this_bw - running_bw). */ u64 this_bw; + u64 extra_bw; /* * Inverse of the fraction of CPU utilization that can be reclaimed @@ -1958,6 +1964,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif + +#ifdef CONFIG_SMP +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} +#else +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} +#endif + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING struct irqtime { u64 total; -- cgit v1.3-14-g43fede From ae83b56a56f8d9643dedbee86b457fa1c5d42f59 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 10 May 2017 21:03:37 +0800 Subject: sched/deadline: Zero out positive runtime after throttling constrained tasks When a contrained task is throttled by dl_check_constrained_dl(), it may carry the remaining positive runtime, as a result when dl_task_timer() fires and calls replenish_dl_entity(), it will not be replenished correctly due to the positive dl_se->runtime. This patch assigns its runtime to 0 if positive after throttling. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline) Link: http://lkml.kernel.org/r/1494421417-27550-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e3b25dfb74f3..54302cf68bb9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -941,6 +941,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; } } -- cgit v1.3-14-g43fede From 3effcb4247e74a51f5d8b775a1ee4abf87cc089a Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 29 May 2017 16:24:03 +0200 Subject: sched/deadline: Use the revised wakeup rule for suspending constrained dl tasks We have been facing some problems with self-suspending constrained deadline tasks. The main reason is that the original CBS was not designed for such sort of tasks. One problem reported by Xunlei Pang takes place when a task suspends, and then is awakened before the deadline, but so close to the deadline that its remaining runtime can cause the task to have an absolute density higher than allowed. In such situation, the original CBS assumes that the task is facing an early activation, and so it replenishes the task and set another deadline, one deadline in the future. This rule works fine for implicit deadline tasks. Moreover, it allows the system to adapt the period of a task in which the external event source suffered from a clock drift. However, this opens the window for bandwidth leakage for constrained deadline tasks. For instance, a task with the following parameters: runtime = 5 ms deadline = 7 ms [density] = 5 / 7 = 0.71 period = 1000 ms If the task runs for 1 ms, and then suspends for another 1ms, it will be awakened with the following parameters: remaining runtime = 4 laxity = 5 presenting a absolute density of 4 / 5 = 0.80. In this case, the original CBS would assume the task had an early wakeup. Then, CBS will reset the runtime, and the absolute deadline will be postponed by one relative deadline, allowing the task to run. The problem is that, if the task runs this pattern forever, it will keep receiving bandwidth, being able to run 1ms every 2ms. Following this behavior, the task would be able to run 500 ms in 1 sec. Thus running more than the 5 ms / 1 sec the admission control allowed it to run. Trying to address the self-suspending case, Luca Abeni, Giuseppe Lipari, and Juri Lelli [1] revisited the CBS in order to deal with self-suspending tasks. In the new approach, rather than replenishing/postponing the absolute deadline, the revised wakeup rule adjusts the remaining runtime, reducing it to fit into the allowed density. A revised version of the idea is: At a given time t, the maximum absolute density of a task cannot be higher than its relative density, that is: runtime / (deadline - t) <= dl_runtime / dl_deadline Knowing the laxity of a task (deadline - t), it is possible to move it to the other side of the equality, thus enabling to define max remaining runtime a task can use within the absolute deadline, without over-running the allowed density: runtime = (dl_runtime / dl_deadline) * (deadline - t) For instance, in our previous example, the task could still run: runtime = ( 5 / 7 ) * 5 runtime = 3.57 ms Without causing damage for other deadline tasks. It is note worthy that the laxity cannot be negative because that would cause a negative runtime. Thus, this patch depends on the patch: df8eac8cafce ("sched/deadline: Throttle a constrained deadline task activated after the deadline") Which throttles a constrained deadline task activated after the deadline. Finally, it is also possible to use the revised wakeup rule for all other tasks, but that would require some more discussions about pros and cons. Reported-by: Xunlei Pang Signed-off-by: Daniel Bristot de Oliveira [peterz: replaced dl_is_constrained with dl_is_implicit] Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Romulo Silva de Oliveira Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tommaso Cucinotta Link: http://lkml.kernel.org/r/5c800ab3a74a168a84ee5f3f84d12a02e11383be.1495803804.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 + kernel/sched/deadline.c | 98 +++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 89 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3113c828483b..1f0f427e0292 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -422,6 +422,7 @@ struct sched_dl_entity { u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ + u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 799647927c4c..e5bd587e87f8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2150,6 +2150,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; @@ -4030,6 +4031,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 54302cf68bb9..e12f85975857 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -704,13 +704,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. + * + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehave, the Revisited CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revisited CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments update_dl_revised_wakeup() function to find + * more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -720,6 +791,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -1274,11 +1353,6 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) __dequeue_dl_entity(dl_se); } -static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline < dl_se->dl_period; -} - static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *pi_task = rt_mutex_get_top_task(p); @@ -1310,7 +1384,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * If that is the case, the task will be throttled and * the replenishment timer will be set to the next period. */ - if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { -- cgit v1.3-14-g43fede From f5832c1998af2ca8d9947792d1c8e1816ab58e57 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 29 May 2017 17:02:57 -0400 Subject: sched/core: Omit building stop_sched_class when !SMP The stop class is invoked through stop_machine only. This is dead code on UP builds. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170529210302.26868-3-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 4 ++-- kernel/sched/core.c | 60 +++++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 4 ++++ 3 files changed, 36 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..5e4c2e7a632b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o obj-y += wait.o swait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e5bd587e87f8..c343b8135774 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -788,36 +788,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -void sched_set_stop_task(int cpu, struct task_struct *stop) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; - struct task_struct *old_stop = cpu_rq(cpu)->stop; - - if (stop) { - /* - * Make it appear like a SCHED_FIFO task, its something - * userspace knows about and won't get confused about. - * - * Also, it will make PI more or less work without too - * much confusion -- but then, stop work should not - * rely on PI working anyway. - */ - sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); - - stop->sched_class = &stop_sched_class; - } - - cpu_rq(cpu)->stop = stop; - - if (old_stop) { - /* - * Reset it back to a normal scheduling class so that - * it can die in pieces. - */ - old_stop->sched_class = &rt_sched_class; - } -} - /* * __normal_prio - return the priority that is based on the static prio */ @@ -1588,6 +1558,36 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f1e400c6403c..f2ef759a4cb6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1453,7 +1453,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) curr->sched_class->set_curr_task(rq); } +#ifdef CONFIG_SMP #define sched_class_highest (&stop_sched_class) +#else +#define sched_class_highest (&dl_sched_class) +#endif #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -- cgit v1.3-14-g43fede From ebfa4c02fa4806bfef189e88152b833f2a732bff Mon Sep 17 00:00:00 2001 From: Aubrey Li Date: Wed, 7 Jun 2017 10:49:02 +0800 Subject: sched/idle: Add deferrable vmstat_updater back Deferrable vmstat_updater was missing in commit: c1de45ca831a ("sched/idle: Add support for tasks that inject idle") Add it back. Signed-off-by: Aubrey Li Signed-off-by: Peter Zijlstra (Intel) Cc: Aubrey Li Cc: Christoph Lameter Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1496803742-38274-1-git-send-email-aubrey.li@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ef63adce0c9c..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -219,6 +219,7 @@ static void do_idle(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { -- cgit v1.3-14-g43fede From f5694788ad8da5da41b501f3d6d2ae22379c4ef9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Sep 2016 12:15:37 +0200 Subject: rt_mutex: Add lockdep annotations Now that (PI) futexes have their own private RT-mutex interface and implementation we can easily add lockdep annotations to the existing RT-mutex interface. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/rtmutex.h | 25 +++++++++++++++++++++---- kernel/locking/rtmutex-debug.c | 6 +++++- kernel/locking/rtmutex-debug.h | 2 +- kernel/locking/rtmutex.c | 36 +++++++++++++++++++++++++++++------- kernel/locking/rtmutex.h | 2 +- lib/Kconfig.debug | 3 +++ 6 files changed, 60 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 1abba5ce2a2f..44fd002f7cd5 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -37,6 +37,9 @@ struct rt_mutex { int line; void *magic; #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif }; struct rt_mutex_waiter; @@ -58,19 +61,33 @@ struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ , .name = #mutexname, .file = __FILE__, .line = __LINE__ -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, __func__) + +# define rt_mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + __rt_mutex_init(mutex, __func__, &__key); \ +} while (0) + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL) +# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) # define rt_mutex_debug_task_free(t) do { } while (0) #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ + , .dep_map = { .name = #mutexname } +#else +#define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) +#endif + #define __RT_MUTEX_INITIALIZER(mutexname) \ { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .waiters = RB_ROOT \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} + __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) @@ -86,7 +103,7 @@ static inline int rt_mutex_is_locked(struct rt_mutex *lock) return lock->owner != NULL; } -extern void __rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void rt_mutex_destroy(struct rt_mutex *lock); extern void rt_mutex_lock(struct rt_mutex *lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 58e366ad36f4..ac35e648b0e5 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x22, sizeof(*waiter)); } -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lock->name = name; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif } diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index b585af9a1b50..5078c6ddf4a5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -11,7 +11,7 @@ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); extern void debug_rt_mutex_lock(struct rt_mutex *lock); extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 28cd09e635ed..43123533e9b1 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1481,6 +1481,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) { might_sleep(); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); @@ -1496,9 +1497,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { + int ret; + might_sleep(); - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1526,11 +1534,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) int rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) { + int ret; + might_sleep(); - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, rt_mutex_slowlock); + if (ret) + mutex_release(&lock->dep_map, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1547,10 +1562,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { + int ret; + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; } EXPORT_SYMBOL_GPL(rt_mutex_trylock); @@ -1561,6 +1582,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); */ void __sched rt_mutex_unlock(struct rt_mutex *lock) { + mutex_release(&lock->dep_map, 1, _RET_IP_); rt_mutex_fastunlock(lock, rt_mutex_slowunlock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); @@ -1620,7 +1642,6 @@ void rt_mutex_destroy(struct rt_mutex *lock) lock->magic = NULL; #endif } - EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** @@ -1632,14 +1653,15 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); * * Initializing of a locked rt lock is not allowed */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) +void __rt_mutex_init(struct rt_mutex *lock, const char *name, + struct lock_class_key *key) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name); + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); @@ -1660,7 +1682,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { - __rt_mutex_init(lock, NULL); + __rt_mutex_init(lock, NULL, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 6607802efa8b..5c253caffe91 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -17,7 +17,7 @@ #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_init(m, n, k) do { } while (0) #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) #define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e4587ebe52c7..ca615129aec5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1052,6 +1052,7 @@ config DEBUG_LOCK_ALLOC depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select LOCKDEP help This feature will check whether any held lock (spinlock, rwlock, @@ -1067,6 +1068,7 @@ config PROVE_LOCKING select LOCKDEP select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC select TRACE_IRQFLAGS default n @@ -1121,6 +1123,7 @@ config LOCK_STAT select LOCKDEP select DEBUG_SPINLOCK select DEBUG_MUTEXES + select DEBUG_RT_MUTEXES if RT_MUTEXES select DEBUG_LOCK_ALLOC default n help -- cgit v1.3-14-g43fede From f92c734f02cbf10e40569facff82059ae9b61920 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Apr 2017 15:40:35 -0700 Subject: rcu: Prevent rcu_barrier() from starting needless grace periods Currently rcu_barrier() uses call_rcu() to enqueue new callbacks on each CPU with a non-empty callback list. This works, but means that rcu_barrier() forces grace periods that are not otherwise needed. The key point is that rcu_barrier() never needs to wait for a grace period, but instead only for all pre-existing callbacks to be invoked. This means that rcu_barrier()'s new callbacks should be placed in the callback-list segment containing the last pre-existing callback. This commit makes this change using the new rcu_segcblist_entrain() function. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 1 + kernel/rcu/tree.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e3facb356838..91dc089d65b7 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -742,6 +742,7 @@ TRACE_EVENT(rcu_torture_read, * "OnlineQ": _rcu_barrier() found online CPU with callbacks. * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. + * "IRQNQ": An rcu_barrier_callback() callback found no callbacks. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. * "Inc2": _rcu_barrier() piggyback check counter incremented. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e354e475e645..657056c3e0cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3578,8 +3578,14 @@ static void rcu_barrier_func(void *type) struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); + rdp->barrier_head.func = rcu_barrier_callback; + debug_rcu_head_queue(&rdp->barrier_head); + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + atomic_inc(&rsp->barrier_cpu_count); + } else { + debug_rcu_head_unqueue(&rdp->barrier_head); + _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + } } /* -- cgit v1.3-14-g43fede From 881ec9d209d5371c21db89ca1bb19afd3fcadab3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Apr 2017 15:16:50 -0700 Subject: srcu: Eliminate possibility of destructive counter overflow Earlier versions of Tree SRCU were subject to a counter overflow bug that could theoretically result in too-short grace periods. This commit eliminates this problem by adding an update-side memory barrier. The short explanation is that if the updater sums the unlock counts too late to see a given __srcu_read_unlock() increment, that CPU's next __srcu_read_lock() must see the new value of ->srcu_idx, thus incrementing the other bank of counters. This eliminates the possibility of destructive counter overflow as long as the srcu_read_lock() nesting level does not exceed floor(ULONG_MAX/NR_CPUS/2), which should be an eminently reasonable nesting limit, especially on 64-bit systems. Reported-by: Lance Roy Suggested-by: Lance Roy Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 157654fa436a..fceca84df6b0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -275,15 +275,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * not mean that there are no more readers, as one could have read * the current index but not have incremented the lock counter yet. * - * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were - * counted, meaning that this could return true even if there are - * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->srcu_idx - * before running srcu_readers_unlock_idx(), which means that there - * could be an arbitrarily large number of critical sections that - * execute after srcu_readers_unlock_idx() but use the old value - * of ->srcu_idx. + * So suppose that the updater is preempted here for so long + * that more than ULONG_MAX non-nested readers come and go in + * the meantime. It turns out that this cannot result in overflow + * because if a reader modifies its unlock count after we read it + * above, then that reader's next load of ->srcu_idx is guaranteed + * to get the new value, which will cause it to operate on the + * other bank of counters, where it cannot contribute to the + * overflow of these counters. This means that there is a maximum + * of 2*NR_CPUS increments, which cannot overflow given current + * systems, especially not on 64-bit systems. + * + * OK, how about nesting? This does impose a limit on nesting + * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, + * especially on 64-bit systems. */ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -671,6 +676,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) */ static void srcu_flip(struct srcu_struct *sp) { + /* + * Ensure that if this updater saw a given reader's increment + * from __srcu_read_lock(), that reader was using an old value + * of ->srcu_idx. Also ensure that if a given reader sees the + * new value of ->srcu_idx, this updater's earlier scans cannot + * have seen that reader's increments (which is OK, because this + * grace period need not wait on that reader). + */ + smp_mb(); /* E */ /* Pairs with B and C. */ + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* -- cgit v1.3-14-g43fede From 5b72f9643b52a5148bb8ced126e20563adfa3466 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Apr 2017 15:29:14 -0700 Subject: rcu: Complain if blocking in preemptible RCU read-side critical section Although preemptible RCU allows its read-side critical sections to be preempted, general blocking is forbidden. The reason for this is that excessive preemption times can be handled by CONFIG_RCU_BOOST=y, but a voluntarily blocked task doesn't care how high you boost its priority. Because preemptible RCU is a global mechanism, one ill-behaved reader hurts everyone. Hence the prohibition against general blocking in RCU-preempt read-side critical sections. Preemption yes, blocking no. This commit enforces this prohibition. There is a special exception for the -rt patchset (which they kindly volunteered to implement): It is OK to block (as opposed to merely being preempted) within an RCU-preempt read-side critical section, but only if the blocking is subject to priority inheritance. This exception permits CONFIG_RCU_BOOST=y to get -rt RCU readers out of trouble. Why doesn't this exception also apply to mainline's rt_mutex? Because of the possibility that someone does general blocking while holding an rt_mutex. Yes, the priority boosting will affect the rt_mutex, but it won't help with the task doing general blocking while holding that rt_mutex. Reported-by: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 657056c3e0cd..9ce682242e99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -478,7 +478,7 @@ void rcu_note_context_switch(bool preempt) barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); - rcu_preempt_note_context_switch(); + rcu_preempt_note_context_switch(preempt); /* Load rcu_urgent_qs before other flags. */ if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) goto out; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ba38262c3554..0fa7aee9ef55 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -477,7 +477,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -static void rcu_preempt_note_context_switch(void); +static void rcu_preempt_note_context_switch(bool preempt); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..a421753e8e9c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -286,12 +286,13 @@ static void rcu_preempt_qs(void) * * Caller must disable interrupts. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { struct task_struct *t = current; struct rcu_data *rdp; struct rcu_node *rnp; + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -738,7 +739,7 @@ static void __init rcu_bootup_announce(void) * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. */ -static void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(bool preempt) { } -- cgit v1.3-14-g43fede From 9683937df9ebf4eb62cdedb09c5f20a0760c7d80 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Apr 2017 16:12:52 -0700 Subject: rcuperf: Defer expedited/normal check to end of test Current rcuperf startup checks to see if the user asked to measure only expedited grace periods, yet constrained all grace periods to be normal, or if the user asked to measure only normal grace periods, yet constrained all grace periods to be expedited. Useless tests of this sort are aborted. Unfortunately, making RCU work through the mid-boot dead zone [1] puts RCU into expedited-only mode during that zone. Which happens to also be the exact time that rcuperf carries out the aforementioned check. So if the user asks rcuperf to measure only normal grace periods (the default), rcuperf will now always complain and terminate the test. This commit therefore moves the checks to rcu_perf_cleanup(). This has the disadvantage of failing to abort useless tests, but avoids the need to create yet another kthread and the need to do fiddly checks involving the holdoff time. (Yes, another approach is to do the checks in a late-stage init function, but that would require some way to communicate badness to rcuperf's kthreads, and seems not worth the bother.) [1] https://lwn.net/Articles/716148/ Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..ef5b1faac495 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -452,6 +452,15 @@ rcu_perf_cleanup(void) u64 *wdp; u64 *wdpp; + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (torture_cleanup_begin()) return; @@ -624,16 +633,6 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } - if (rcu_gp_is_normal() && gp_exp) { - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - firsterr = -EINVAL; - goto unwind; - } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), -- cgit v1.3-14-g43fede From e28371c891db29c892d85322ea27ad997cc50f72 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Apr 2017 09:59:53 -0700 Subject: rcu: Remove obsolete reference to synchronize_kernel() The synchronize_kernel() primitive was removed in favor of synchronize_sched() more than a decade ago, and it seems likely that rather few kernel hackers are familiar with it. Its continued presence is therefore providing more confusion than enlightenment. This commit therefore removes the reference from the synchronize_sched() header comment, and adds the corresponding information to the synchronize_rcu(0 header comment. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ------ kernel/rcu/tree_plugin.h | 9 +++++++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9ce682242e99..3bee58fc23b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3280,12 +3280,6 @@ static inline int rcu_blocking_is_gp(void) * to have executed a full memory barrier during the execution of * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. */ void synchronize_sched(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a421753e8e9c..3b432fa4c45b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -664,8 +664,13 @@ EXPORT_SYMBOL_GPL(call_rcu); * synchronize_rcu() was waiting. RCU read-side critical sections are * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. + * See the description of synchronize_sched() for more detailed + * information on memory-ordering guarantees. However, please note + * that -only- the memory-ordering guarantees apply. For example, + * synchronize_rcu() is -not- guaranteed to wait on things like code + * protected by preempt_disable(), instead, synchronize_rcu() is -only- + * guaranteed to wait on RCU read-side critical sections, that is, sections + * of code protected by rcu_read_lock(). */ void synchronize_rcu(void) { -- cgit v1.3-14-g43fede From 881ed593a323c832c2e9383effeb6a0c99859210 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Apr 2017 12:47:10 -0700 Subject: rcuperf: Add ability to performance-test call_rcu() and friends This commit upgrades rcuperf so that it can do performance testing on asynchronous grace-period primitives such as call_srcu(). There is a new rcuperf.gp_async module parameter that specifies this new behavior, with the pre-existing rcuperf.gp_exp testing expedited grace periods such as synchronize_rcu_expedited, and with the default being to test synchronous non-expedited grace periods such as synchronize_rcu(). There is also a new rcuperf.gp_async_max module parameter that specifies the maximum number of outstanding callbacks per writer kthread, defaulting to 1,000. When this limit is exceeded, the writer thread invokes the appropriate flavor of rcu_barrier() to wait for callbacks to drain. Signed-off-by: Paul E. McKenney [ paulmck: Removed the redundant initialization noted by Arnd Bergmann. ] --- Documentation/admin-guide/kernel-parameters.txt | 11 ++++ kernel/rcu/rcuperf.c | 69 +++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 15f79c27748d..3598464ca8ed 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3319,6 +3319,17 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcuperf.gp_async= [KNL] + Measure performance of asynchronous + grace-period primitives such as call_rcu(). + + rcuperf.gp_async_max= [KNL] + Specify the maximum number of outstanding + callbacks per writer thread. When a writer + thread exceeds this limit, it invokes the + corresponding flavor of rcu_barrier() to allow + previously posted callbacks to drain. + rcuperf.gp_exp= [KNL] Measure performance of expedited synchronous grace-period primitives. diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index ef5b1faac495..e1ce97bead94 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -59,6 +59,8 @@ MODULE_AUTHOR("Paul E. McKenney "); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); @@ -86,13 +88,16 @@ static u64 t_rcu_perf_writer_started; static u64 t_rcu_perf_writer_finished; static unsigned long b_rcu_perf_writer_started; static unsigned long b_rcu_perf_writer_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); static int rcu_perf_writer_state; #define RTWS_INIT 0 -#define RTWS_EXP_SYNC 1 -#define RTWS_SYNC 2 -#define RTWS_IDLE 2 -#define RTWS_STOPPING 3 +#define RTWS_ASYNC 1 +#define RTWS_BARRIER 2 +#define RTWS_EXP_SYNC 3 +#define RTWS_SYNC 4 +#define RTWS_IDLE 5 +#define RTWS_STOPPING 6 #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -114,6 +119,8 @@ struct rcu_perf_ops { unsigned long (*started)(void); unsigned long (*completed)(void); unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); const char *name; @@ -153,6 +160,8 @@ static struct rcu_perf_ops rcu_ops = { .started = rcu_batches_started, .completed = rcu_batches_completed, .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .name = "rcu" @@ -181,6 +190,8 @@ static struct rcu_perf_ops rcu_bh_ops = { .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_bh, + .gp_barrier = rcu_barrier_bh, .sync = synchronize_rcu_bh, .exp_sync = synchronize_rcu_bh_expedited, .name = "rcu_bh" @@ -208,6 +219,16 @@ static unsigned long srcu_perf_completed(void) return srcu_batches_completed(srcu_ctlp); } +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + static void srcu_perf_synchronize(void) { synchronize_srcu(srcu_ctlp); @@ -226,6 +247,8 @@ static struct rcu_perf_ops srcu_ops = { .started = NULL, .completed = srcu_perf_completed, .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, .sync = srcu_perf_synchronize, .exp_sync = srcu_perf_synchronize_expedited, .name = "srcu" @@ -254,6 +277,8 @@ static struct rcu_perf_ops sched_ops = { .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .exp_completed = rcu_exp_batches_completed_sched, + .async = call_rcu_sched, + .gp_barrier = rcu_barrier_sched, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, .name = "sched" @@ -281,6 +306,8 @@ static struct rcu_perf_ops tasks_ops = { .readunlock = tasks_perf_read_unlock, .started = rcu_no_completed, .completed = rcu_no_completed, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .name = "tasks" @@ -343,6 +370,15 @@ rcu_perf_reader(void *arg) return 0; } +/* + * Callback function for asynchronous grace periods from rcu_perf_writer(). + */ +static void rcu_perf_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + /* * RCU perf writer kthread. Repeatedly does a grace period. */ @@ -352,6 +388,7 @@ rcu_perf_writer(void *arg) int i = 0; int i_max; long me = (long)arg; + struct rcu_head *rhp = NULL; struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; @@ -382,7 +419,23 @@ rcu_perf_writer(void *arg) do { wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_exp) { + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + rcu_perf_writer_state = RTWS_ASYNC; + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_perf_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { rcu_perf_writer_state = RTWS_EXP_SYNC; cur_ops->exp_sync(); } else { @@ -429,6 +482,10 @@ rcu_perf_writer(void *arg) i++; rcu_perf_wait_shutdown(); } while (!torture_must_stop()); + if (gp_async) { + rcu_perf_writer_state = RTWS_BARRIER; + cur_ops->gp_barrier(); + } rcu_perf_writer_state = RTWS_STOPPING; writer_n_durations[me] = i_max; torture_kthread_stopping("rcu_perf_writer"); @@ -460,6 +517,8 @@ rcu_perf_cleanup(void) VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; -- cgit v1.3-14-g43fede From dcfc315b7b7a5e7668dd9cb2474708b51ab1cdb1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 09:53:07 -0700 Subject: rcu: Make sync_rcu_preempt_exp_done() return bool The sync_rcu_preempt_exp_done() function returns a logical expression, but its return type is nevertheless int. This commit therefore changes the return type to bool. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e513b4ab1197..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * * Caller must hold the rcu_state's exp_mutex. */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; -- cgit v1.3-14-g43fede From f60cb4d4c8e568c2d63d01d72e589c4f75ee4140 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 19 Apr 2017 13:43:21 -0700 Subject: rcuperf: Add test for dynamically initialized srcu_struct This commit adds a perf_type of "srcud", which species that rcuperf test SRCU on a dynamically initialized srcu_struct. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index e1ce97bead94..5158ddba6716 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -254,6 +254,35 @@ static struct rcu_perf_ops srcu_ops = { .name = "srcu" }; +static struct srcu_struct srcud; + +static void srcu_sync_perf_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_perf_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_perf_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_perf_init, + .cleanup = srcu_sync_perf_cleanup, + .readlock = srcu_perf_read_lock, + .readunlock = srcu_perf_read_unlock, + .started = NULL, + .completed = srcu_perf_completed, + .exp_completed = srcu_perf_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_perf_synchronize, + .exp_sync = srcu_perf_synchronize_expedited, + .name = "srcud" +}; + /* * Definitions for sched perf testing. */ @@ -622,7 +651,7 @@ rcu_perf_init(void) long i; int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, RCUPERF_TASKS_OPS }; -- cgit v1.3-14-g43fede From 1f4f6da1c80905830337c3ff46a2d3260dabb864 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 11:16:32 -0700 Subject: srcu: Make Classic and Tree SRCU announce themselves at bootup Currently, the only way to tell whether a given kernel is running Classic, Tiny, or Tree SRCU is to look at the .config file, which can easily be lost or associated with the wrong kernel. This commit therefore has Classic and Tree SRCU identify themselves at boot time. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 7 +++++++ kernel/rcu/srcutree.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index dea03614263f..4e3f558409a0 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -659,3 +659,10 @@ void process_srcu(struct work_struct *work) srcu_reschedule(sp); } EXPORT_SYMBOL_GPL(process_srcu); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Classic SRCU implementation.\n"); + return 0; +} +early_initcall(srcu_bootup_announce); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fceca84df6b0..03d57fe9f094 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1167,3 +1167,10 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); + +static int __init srcu_bootup_announce(void) +{ + pr_info("Hierarchical SRCU implementation.\n"); + return 0; +} +early_initcall(srcu_bootup_announce); -- cgit v1.3-14-g43fede From 3ddf20c953520203c42dbed1f091ed52080e1cd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 13:33:20 -0700 Subject: srcu: Shrink Tiny SRCU a bit more This commit rearranges Tiny SRCU's srcu_struct structure, substitutes u8 for bool, and shrinks counters down to short. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 8 ++++---- kernel/rcu/rcutorture.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 42311ee0334f..b8859179b001 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -27,15 +27,15 @@ #include struct srcu_struct { - int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + short srcu_idx; /* Current reader array element. */ + u8 srcu_gp_running; /* GP workqueue running? */ + u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; /* Last srcu_read_unlock() wakes GP. */ unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ struct rcu_segcblist srcu_cblist; /* Pending SRCU callbacks. */ - int srcu_idx; /* Current reader array element. */ - bool srcu_gp_running; /* GP workqueue running? */ - bool srcu_gp_waiting; /* GP waiting for readers? */ struct work_struct srcu_work; /* For driving grace periods. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ae6e574d4cf5..a58592b73f19 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -609,7 +609,7 @@ static void srcu_torture_stats(void) pr_cont("\n"); #elif defined(CONFIG_TINY_SRCU) idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", torture_type, TORTURE_FLAG, idx, READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -- cgit v1.3-14-g43fede From 492b95e59735998312f678d77a2d5fe20af6b0b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Apr 2017 16:09:15 -0700 Subject: rcuperf: Set more user-friendly defaults Common-case use of rcuperf must set rcuperf.nreaders=0 and if not built as a module, rcuperf.shutdown. This commit therefore sets the default for rcuperf.nreaders to zero and sets the default for rcuperf.shutdown to zero if rcuperf is built as a module and to one otherwise. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuperf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 5158ddba6716..49c8ed6bd2fd 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -63,9 +63,10 @@ torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nreaders, 0, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); +torture_param(bool, shutdown, !IS_ENABLED(MODULE), + "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); static char *perf_type = "rcu"; -- cgit v1.3-14-g43fede From 820687a7b98a5031207893ff265f97c0a0ad403e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 15:12:56 -0700 Subject: rcuperf: Add writer_holdoff boot parameter This commit adds a writer_holdoff boot parameter to rcuperf, which is intended to be used to test Tree SRCU's auto-expediting. This boot parameter is in microseconds, and defaults to zero (that is, disabled). Set it to a bit larger than srcutree.exp_holdoff, keeping the nanosecond/microsecond conversion, to force Tree SRCU to auto-expedite more aggressively. This commit also adds documentation for this parameter, and fixes some alphabetization while in the neighborhood. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 11 ++++++++--- kernel/rcu/rcuperf.c | 3 +++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3598464ca8ed..01b5ab92d251 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3357,17 +3357,22 @@ rcuperf.perf_runnable= [BOOT] Start rcuperf running at boot time. + rcuperf.perf_type= [KNL] + Specify the RCU implementation to test. + rcuperf.shutdown= [KNL] Shut the system down after performance tests complete. This is useful for hands-off automated testing. - rcuperf.perf_type= [KNL] - Specify the RCU implementation to test. - rcuperf.verbose= [KNL] Enable additional printk() statements. + rcuperf.writer_holdoff= [KNL] + Write-side holdoff between grace periods, + in microseconds. The default of zero says + no holdoff. + rcutorture.cbflood_inter_holdoff= [KNL] Set holdoff time (jiffies) between successive callback-flood tests. diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 49c8ed6bd2fd..d80f11d9f8bd 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -68,6 +68,7 @@ torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, !IS_ENABLED(MODULE), "Shutdown at end of performance tests."); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -447,6 +448,8 @@ rcu_perf_writer(void *arg) } do { + if (writer_holdoff) + udelay(writer_holdoff); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async) { -- cgit v1.3-14-g43fede From f4687d2637a4016b2eedfdb777105c95e8d6fe52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Apr 2017 16:13:53 -0700 Subject: rcu: Add preemptibility checks in rcu_sched_qs() and rcu_bh_qs() This commit adds WARN_ON_ONCE() calls that trigger if either rcu_sched_qs() or rcu_bh_qs() are invoked with preemption enabled. In the immortal words of Peter Zijlstra: "these are much harder to ignore than comments". Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3bee58fc23b1..b01a02e7a0b7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,6 +250,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) return; trace_rcu_grace_period(TPS("rcu_sched"), @@ -265,6 +266,7 @@ void rcu_sched_qs(void) void rcu_bh_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), -- cgit v1.3-14-g43fede From 59d80fd8351b7b9a5dc7bbfa8bc4ca19f6ff3dad Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 10:20:28 -0700 Subject: rcu: Print out rcupdate.c non-default boot-time settings This commit adds a rcupdate_announce_bootup_oddness() function to print out non-default values of significant kernel boot parameter settings to aid in debugging. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/tree_plugin.h | 1 + kernel/rcu/update.c | 42 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e1e5d002fdb9..393e461d3ea8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -76,6 +76,7 @@ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. */ void rcu_expedite_gp(void); void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ enum rcutorture_type { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3b432fa4c45b..eb5ebdce25ff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -92,6 +92,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); if (IS_ENABLED(CONFIG_RCU_BOOST)) pr_info("\tRCU kthread priority: %d.\n", kthread_prio); + rcupdate_announce_bootup_oddness(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 273e869ca21d..82a5aa10dbc5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -560,7 +560,8 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; +#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) +static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); @@ -851,6 +852,23 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ +#ifndef CONFIG_TINY_RCU + +/* + * Print any non-default Tasks RCU settings. + */ +static void __init rcu_tasks_bootup_oddness(void) +{ +#ifdef CONFIG_TASKS_RCU + if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) + pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); + else + pr_info("\tTasks RCU enabled.\n"); +#endif /* #ifdef CONFIG_TASKS_RCU */ +} + +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PROVE_RCU /* @@ -935,3 +953,25 @@ late_initcall(rcu_verify_early_boot_tests); #else void rcu_early_boot_tests(void) {} #endif /* CONFIG_PROVE_RCU */ + +#ifndef CONFIG_TINY_RCU + +/* + * Print any significant non-default boot-time settings. + */ +void __init rcupdate_announce_bootup_oddness(void) +{ + if (rcu_normal) + pr_info("\tNo expedited grace period (rcu_normal).\n"); + else if (rcu_normal_after_boot) + pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); + else if (rcu_expedited) + pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); + if (rcu_cpu_stall_suppress) + pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); + if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) + pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); + rcu_tasks_bootup_oddness(); +} + +#endif /* #ifndef CONFIG_TINY_RCU */ -- cgit v1.3-14-g43fede From 17c7798bea64b9d35c7b4cc14d564e6feff73ac3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:12:34 -0700 Subject: rcu: Update rcu_bootup_announce_oddness() This commit updates rcu_bootup_announce_oddness() to check additional Kconfig options and module/boot parameters. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++++--- kernel/rcu/tree_plugin.h | 31 ++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b01a02e7a0b7..ac8dce15fd74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -536,9 +536,12 @@ void rcu_all_qs(void) } EXPORT_SYMBOL_GPL(rcu_all_qs); -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +static long blimit = DEFAULT_RCU_BLIMIT; +#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +static long qhimark = DEFAULT_RCU_QHIMARK; +#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +static long qlowmark = DEFAULT_RCU_QLOMARK; module_param(blimit, long, 0444); module_param(qhimark, long, 0444); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index eb5ebdce25ff..9cb3dff78b6f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,7 +79,9 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); - if (IS_ENABLED(CONFIG_PROVE_RCU)) + if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY)) + pr_info("\tRCU lockdep checking is permanently enabled.\n"); + else if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); @@ -90,8 +92,31 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); - if (IS_ENABLED(CONFIG_RCU_BOOST)) - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); +#ifdef CONFIG_RCU_BOOST + pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); +#endif + if (blimit != DEFAULT_RCU_BLIMIT) + pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); + if (qhimark != DEFAULT_RCU_QHIMARK) + pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); + if (qlowmark != DEFAULT_RCU_QLOMARK) + pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); + if (jiffies_till_first_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); + if (jiffies_till_next_fqs != ULONG_MAX) + pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); + if (rcu_kick_kthreads) + pr_info("\tKick kthreads if too-long grace period.\n"); + if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) + pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT)) + pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP)) + pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) + pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); } -- cgit v1.3-14-g43fede From b5815e6cd3b747cacf7628a32a275aa2c0f61e06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:20:29 -0700 Subject: srcu: Make exp_holdoff module parameter be static Because exp_holdoff is not used outside of srcutree.c, it can be static. This commit therefore makes this change. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 03d57fe9f094..08d43736f72a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,7 +40,7 @@ #include "rcu.h" #include "rcu_segcblist.h" -ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ module_param(exp_holdoff, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); -- cgit v1.3-14-g43fede From 0c8e0e3c37955d17cced37222a10c00ab47efd4b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 11:24:22 -0700 Subject: srcu: Print non-default exp_holdoff values at boot time This commit makes srcu_bootup_announce() check for non-default values of the auto-expedite holdoff time exp_holdoff and print a message if so. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 08d43736f72a..0b6ea105d9f8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -40,7 +40,9 @@ #include "rcu.h" #include "rcu_segcblist.h" -static ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +/* Holdoff in nanoseconds for auto-expediting. */ +#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) +static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); @@ -1171,6 +1173,8 @@ EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); + if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) + pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); return 0; } early_initcall(srcu_bootup_announce); -- cgit v1.3-14-g43fede From c0b334c5bfa98ab104bde38da330a113a6c7dd56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 12:32:15 -0700 Subject: rcu: Add lockdep_assert_held() teeth to tree.c Comments can be helpful, but assertions carry more force. This commit therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN() calls to enforce lock-held and interrupt-disabled preconditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ac8dce15fd74..121c1436a7f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -762,6 +762,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; int *fp = &rnp->need_future_gp[idx]; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); return READ_ONCE(*fp); } @@ -773,6 +774,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) @@ -799,6 +801,7 @@ static void rcu_eqs_enter_common(bool user) struct rcu_data *rdp; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { @@ -972,6 +975,7 @@ static void rcu_eqs_exit(bool user) struct rcu_dynticks *rdtp; long long oldval; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); @@ -1679,6 +1683,8 @@ void rcu_cpu_stall_reset(void) static unsigned long rcu_cbs_completed(struct rcu_state *rsp, struct rcu_node *rnp) { + lockdep_assert_held(&rnp->lock); + /* * If RCU is idle, we just wait for the next grace period. * But we can only be sure that RCU is idle if we are looking @@ -1724,6 +1730,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + lockdep_assert_held(&rnp->lock); + /* * Pick up grace-period number for new callbacks. If this * grace period is already marked as needed, return to the caller. @@ -1850,6 +1858,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, { bool ret = false; + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1888,6 +1898,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; @@ -1914,6 +1926,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, bool ret; bool need_gp; + lockdep_assert_held(&rnp->lock); + /* Handle the ends of any preceding grace periods first. */ if (rdp->completed == rnp->completed && !unlikely(READ_ONCE(rdp->gpwrap))) { @@ -2346,6 +2360,7 @@ static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rnp->lock); if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -2407,6 +2422,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { + lockdep_assert_held(&rcu_get_root(rsp)->lock); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); @@ -2431,6 +2447,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, unsigned long oldmask = 0; struct rcu_node *rnp_c; + lockdep_assert_held(&rnp->lock); + /* Walk up the rcu_node hierarchy. */ for (;;) { if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { @@ -2491,6 +2509,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; + lockdep_assert_held(&rnp->lock); if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2604,6 +2623,8 @@ static void rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs do not have orphanable callbacks. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) return; @@ -2644,6 +2665,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + lockdep_assert_held(&rsp->orphan_lock); + /* No-CBs CPUs are handled specially. */ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) @@ -2710,6 +2733,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) return; @@ -3703,6 +3727,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) long mask; struct rcu_node *rnp = rnp_leaf; + lockdep_assert_held(&rnp->lock); for (;;) { mask = rnp->grpmask; rnp = rnp->parent; -- cgit v1.3-14-g43fede From ea9b0c8a26a2cadfe49382d679eee88d3c4de79c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 13:19:28 -0700 Subject: rcu: Add lockdep_assert_held() teeth to tree_plugin.h Comments can be helpful, but assertions carry more force. This commit therefore adds lockdep_assert_held() and RCU_LOCKDEP_WARN() calls to enforce lock-held and interrupt-disabled preconditions. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9cb3dff78b6f..ee7cea75273e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -181,6 +181,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); struct task_struct *t = current; + lockdep_assert_held(&rnp->lock); + /* * Decide where to queue the newly blocked task. In theory, * this could be an if-statement. In practice, when I tried @@ -289,6 +291,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) */ static void rcu_preempt_qs(void) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), @@ -318,6 +321,7 @@ static void rcu_preempt_note_context_switch(bool preempt) struct rcu_data *rdp; struct rcu_node *rnp; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); if (t->rcu_read_lock_nesting > 0 && !t->rcu_read_unlock_special.b.blocked) { @@ -634,6 +638,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); if (rcu_preempt_has_tasks(rnp)) rnp->gp_tasks = rnp->blkd_tasks.next; @@ -1024,6 +1029,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) { struct task_struct *t; + lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1404,6 +1410,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); unsigned long dj; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { *nextevt = KTIME_MAX; return 0; @@ -1456,6 +1463,7 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || rcu_is_nocb_cpu(smp_processor_id())) return; @@ -1511,6 +1519,7 @@ static void rcu_prepare_for_idle(void) */ static void rcu_cleanup_after_idle(void) { + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || rcu_is_nocb_cpu(smp_processor_id())) return; @@ -2544,6 +2553,8 @@ static void rcu_sysidle_enter(int irq) unsigned long j; struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -2615,6 +2626,8 @@ static void rcu_sysidle_exit(int irq) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, no need to track this. */ if (!tick_nohz_full_enabled()) return; @@ -2674,6 +2687,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long j; struct rcu_dynticks *rdtp = rdp->dynticks; + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!"); + /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ if (!tick_nohz_full_enabled()) return; @@ -2842,6 +2857,8 @@ bool rcu_sys_is_idle(void) static struct rcu_sysidle_head rsh; int rss = READ_ONCE(full_sysidle_state); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!"); + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) return false; -- cgit v1.3-14-g43fede From d4efe6c5ad91f9a1f2f1d66b7fbfc87e320b2abc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 14:16:16 -0700 Subject: srcu: Shrink Tiny SRCU a bit In Tiny SRCU, __srcu_read_lock() is a trivial function, outweighed by its EXPORT_SYMBOL_GPL(), and on many architectures, its call sequence. This commit therefore moves it to srcutiny.h so that it can be inlined. Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 15 +++++++++++++++ kernel/rcu/srcutiny.c | 16 ---------------- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b8859179b001..b6edd9c8fdce 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -63,6 +63,21 @@ void srcu_drive_gp(struct work_struct *wp); void synchronize_srcu(struct srcu_struct *sp); +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Can be invoked from irq/bh handlers, but the matching + * __srcu_read_unlock() must be in the same handler instance. Returns an + * index that must be passed to the matching srcu_read_unlock(). + */ +static inline int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} + static inline void synchronize_srcu_expedited(struct srcu_struct *sp) { synchronize_srcu(sp); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32798eb14853..988543721d5d 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -95,22 +95,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Can be invoked from irq/bh handlers, but the matching - * __srcu_read_unlock() must be in the same handler instance. Returns an - * index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->srcu_idx); - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - /* * Removes the count for the old reader from the appropriate element of * the srcu_struct. -- cgit v1.3-14-g43fede From a602538e46c9c62e75f1f0be9806495c79bcda9f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 15:39:34 -0700 Subject: srcu: Add DEBUG_OBJECTS_RCU_HEAD functionality This commit adds DEBUG_OBJECTS_RCU_HEAD checking to detect call_srcu() counterparts to double-free bugs. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0b6ea105d9f8..c6e2a4a1628b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -761,6 +761,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) return true; /* With reasonable probability, idle! */ } +/* + * SRCU callback function to leak a callback. + */ +static void srcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating @@ -799,6 +806,12 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, struct srcu_data *sdp; check_init_srcu_struct(sp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ + WRITE_ONCE(rhp->func, srcu_leak_callback); + WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); + return; + } rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); @@ -973,9 +986,12 @@ void srcu_barrier(struct srcu_struct *sp) spin_lock_irq(&sdp->lock); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; + debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) + &sdp->srcu_barrier_head, 0)) { + debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); + } spin_unlock_irq(&sdp->lock); } @@ -1100,6 +1116,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) spin_unlock_irq(&sdp->lock); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + debug_rcu_head_unqueue(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); -- cgit v1.3-14-g43fede From 68ab0b4263224157f4d0c0e42854169a183d7534 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 16:19:07 -0700 Subject: rcu: Make synchronize_rcu_mult() check for duplicates Currently, doing synchronize_rcu_mult(call_rcu, call_rcu) might (or might not) wait for two RCU grace periods. One approach is of course "don't do that!", but in CONFIG_PREEMPT=n kernels, synchronize_rcu_mult(call_rcu, call_rcu_sched) does exactly that. This results in an ugly #ifdef in sched_cpu_deactivate(). This commit therefore makes __wait_rcu_gp() check for duplicates, which in turn allows duplicates to be passed to synchronize_rcu_mult() without risk of waiting twice on the same type of grace period. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 82a5aa10dbc5..123a9c4b5055 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -379,6 +379,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; + int j; /* Initialize and register callbacks for each flavor specified. */ for (i = 0; i < n; i++) { @@ -390,7 +391,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, } init_rcu_head_on_stack(&rs_array[i].head); init_completion(&rs_array[i].completion); - (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); } /* Wait for all callbacks to be invoked. */ @@ -399,7 +404,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, (crcu_array[i] == call_rcu || crcu_array[i] == call_rcu_bh)) continue; - wait_for_completion(&rs_array[i].completion); + for (j = 0; j < i; j++) + if (crcu_array[j] == crcu_array[i]) + break; + if (j == i) + wait_for_completion(&rs_array[i].completion); destroy_rcu_head_on_stack(&rs_array[i].head); } } -- cgit v1.3-14-g43fede From d7d34d5e46140a13f86379c87a196dc8a0b9b585 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 16:33:07 -0700 Subject: sched: Rely on synchronize_rcu_mult() de-duplication The synchronize_rcu_mult() function now detects duplicate requests for the same grace-period flavor and waits only once for each flavor. This commit therefore removes the ugly #ifdef from sched_cpu_deactivate() because synchronize_rcu_mult(call_rcu, call_rcu_sched) now does what the #ifdef used to be needed for. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner --- kernel/sched/core.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..e91138fcde86 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5874,15 +5874,9 @@ int sched_cpu_deactivate(unsigned int cpu) * users of this state to go away such that all new such users will * observe it. * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * * Do sync before park smpboot threads to take care the rcu boost case. */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); + synchronize_rcu_mult(call_rcu, call_rcu_sched); if (!sched_smp_initialized) return 0; -- cgit v1.3-14-g43fede From 511324e462a12ea8be1a7e5fc63a992134db80d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 17:04:09 -0700 Subject: rcu: Use RCU_NOCB_WAKE rather than RCU_NOGP_WAKE The RCU_NOGP_WAKE_NOT, RCU_NOGP_WAKE, and RCU_NOGP_WAKE_FORCE flags are used to mediate wakeups for the no-CBs CPU kthreads. The "NOGP" really doesn't make any sense, so this commit does s/NOGP/NOCB/. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 6 +++--- kernel/rcu/tree_plugin.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0fa7aee9ef55..ddfa34d020ba 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -312,9 +312,9 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ -#define RCU_NOGP_WAKE_NOT 0 -#define RCU_NOGP_WAKE 1 -#define RCU_NOGP_WAKE_FORCE 2 +#define RCU_NOCB_WAKE_NOT 0 +#define RCU_NOCB_WAKE 1 +#define RCU_NOCB_WAKE_FORCE 2 #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ee7cea75273e..0b1042545116 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1901,7 +1901,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -1915,7 +1915,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2242,8 +2242,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) if (!rcu_nocb_need_deferred_wakeup(rdp)) return; ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } -- cgit v1.3-14-g43fede From 6b5fc3a1331810db407c9e0e673dc1837afdc9d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Apr 2017 20:11:09 -0700 Subject: rcu: Add memory barriers for NOCB leader wakeup Wait/wakeup operations do not guarantee ordering on their own. Instead, either locking or memory barriers are required. This commit therefore adds memory barriers to wake_nocb_leader() and nocb_leader_wait(). Signed-off-by: Paul E. McKenney Tested-by: Krister Johansen Cc: # 4.6.x --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b1042545116..573fbe9640a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1810,6 +1810,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); } } @@ -2064,6 +2065,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; + smp_mb(); /* wakeup before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) -- cgit v1.3-14-g43fede From a68a2bb28bbf7a6dd4672a25bd87fd1b5db4fa7d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 08:34:57 -0700 Subject: rcu: Move docbook comments out of rcupdate.h The include/linux/rcupdate.h file is included by more than 200 files, so shrinking it should provide some build-time benefits. This commit therefore moves several docbook comments from rcupdate.h to kernel/rcu/update.c, kernel/rcu/tree.c, and kernel/rcu/tree_plugin.h, thus reducing the number of times that the compiler has to scan these comments. This likely provides only a small benefit, but every little bit helps. This commit also fixes a malformed bulleted list noted by the 0day Test Robot. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 117 ++--------------------------------------------- kernel/rcu/tree.c | 42 +++++++++++++++-- kernel/rcu/tree_plugin.h | 33 ++++++++++++- kernel/rcu/update.c | 20 ++++++-- 4 files changed, 89 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 393e461d3ea8..7a206f039fc2 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -140,115 +140,14 @@ void do_trace_rcu_torture_read(const char *rcutorturename, /* Exported common interfaces */ #ifdef CONFIG_PREEMPT_RCU - -/** - * call_rcu() - Queue an RCU callback for invocation after a grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all pre-existing RCU read-side - * critical sections have completed. However, the callback function - * might well execute concurrently with RCU read-side critical sections - * that started after call_rcu() was invoked. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing RCU read-side critical section. On systems with more - * than one CPU, this means that when "func()" is invoked, each CPU is - * guaranteed to have executed a full memory barrier since the end of its - * last RCU read-side critical section whose beginning preceded the call - * to call_rcu(). It also means that each CPU executing an RCU read-side - * critical section that continues beyond the start of "func()" must have - * executed a memory barrier after the call_rcu() but before the beginning - * of that RCU read-side critical section. Note that these guarantees - * include CPUs that are offline, idle, or executing in user mode, as - * well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting RCU callback function "func()", then both CPU A and CPU B are - * guaranteed to execute a full memory barrier during the time interval - * between the call to call_rcu() and the invocation of "func()" -- even - * if CPU A and CPU B are the same CPU (but again only if the system has - * more than one CPU). - */ -void call_rcu(struct rcu_head *head, - rcu_callback_t func); - +void call_rcu(struct rcu_head *head, rcu_callback_t func); #else /* #ifdef CONFIG_PREEMPT_RCU */ - -/* In classic RCU, call_rcu() is just call_rcu_sched(). */ #define call_rcu call_rcu_sched - #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -/** - * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_bh() assumes - * that the read-side critical sections end on completion of a softirq - * handler. This means that read-side critical sections in process - * context must not be interrupted by softirqs. This interface is to be - * used when most of the read-side critical sections are in softirq context. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. - * OR - * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_bh(struct rcu_head *head, - rcu_callback_t func); - -/** - * call_rcu_sched() - Queue an RCU for invocation after sched grace period. - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_sched() assumes - * that the read-side critical sections end on enabling of preemption - * or on voluntary preemption. - * RCU read-side critical sections are delimited by : - * - rcu_read_lock_sched() and rcu_read_unlock_sched(), - * OR - * anything that disables preemption. - * These may be nested. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ -void call_rcu_sched(struct rcu_head *head, - rcu_callback_t func); - +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); void synchronize_sched(void); - -/** - * call_rcu_tasks() - Queue an RCU for invocation task-based grace period - * @head: structure to be used for queueing the RCU updates. - * @func: actual callback function to be invoked after the grace period - * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks() assumes - * that the read-side critical sections end at a voluntary context - * switch (not a preemption!), entry into idle, or transition to usermode - * execution. As such, there are no read-side primitives analogous to - * rcu_read_lock() and rcu_read_unlock() because this primitive is intended - * to determine that all tasks have passed through a safe state, not so - * much for data-strcuture synchronization. - * - * See the description of call_rcu() for more detailed information on - * memory ordering guarantees. - */ void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); @@ -474,18 +373,8 @@ extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); - int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); - -/** - * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? - * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an - * RCU-sched read-side critical section. In absence of - * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side - * critical section unless it can prove otherwise. - */ int rcu_read_lock_sched_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 121c1436a7f3..5ebc830297c1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3223,8 +3223,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, local_irq_restore(flags); } -/* - * Queue an RCU-sched callback for invocation after a grace period. +/** + * call_rcu_sched() - Queue an RCU for invocation after sched grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_sched() assumes + * that the read-side critical sections end on enabling of preemption + * or on voluntary preemption. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR + * - anything that disables preemption. + * + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { @@ -3232,8 +3248,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu_sched); -/* - * Queue an RCU callback for invocation after a quicker grace period. +/** + * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_bh() assumes + * that the read-side critical sections end on completion of a softirq + * handler. This means that read-side critical sections in process + * context must not be interrupted by softirqs. This interface is to be + * used when most of the read-side critical sections are in softirq context. + * RCU read-side critical sections are delimited by : + * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. + * OR + * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. + * These may be nested. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 573fbe9640a0..116cf8339826 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -675,8 +675,37 @@ static void rcu_preempt_do_callbacks(void) #endif /* #ifdef CONFIG_RCU_BOOST */ -/* - * Queue a preemptible-RCU callback for invocation after a grace period. +/** + * call_rcu() - Queue an RCU callback for invocation after a grace period. + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all pre-existing RCU read-side + * critical sections have completed. However, the callback function + * might well execute concurrently with RCU read-side critical sections + * that started after call_rcu() was invoked. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing RCU read-side critical section. On systems with more + * than one CPU, this means that when "func()" is invoked, each CPU is + * guaranteed to have executed a full memory barrier since the end of its + * last RCU read-side critical section whose beginning preceded the call + * to call_rcu(). It also means that each CPU executing an RCU read-side + * critical section that continues beyond the start of "func()" must have + * executed a memory barrier after the call_rcu() but before the beginning + * of that RCU read-side critical section. Note that these guarantees + * include CPUs that are offline, idle, or executing in user mode, as + * well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting RCU callback function "func()", then both CPU A and CPU B are + * guaranteed to execute a full memory barrier during the time interval + * between the call to call_rcu() and the invocation of "func()" -- even + * if CPU A and CPU B are the same CPU (but again only if the system has + * more than one CPU). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 123a9c4b5055..84dec2c8ad1b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -576,9 +576,23 @@ module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); static struct task_struct *rcu_tasks_kthread_ptr; -/* - * Post an RCU-tasks callback. First call must be from process context - * after the scheduler if fully operational. +/** + * call_rcu_tasks() - Queue an RCU for invocation task-based grace period + * @rhp: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { -- cgit v1.3-14-g43fede From 3caec62fbb313946b9be53720bbf2280bb19ec28 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 09:27:15 -0700 Subject: rcu: Move rcu_expedited and rcu_normal externs from rcupdate.h The rcu_expedited and rcu_normal variables are used only by sysctl and kernel/rcu/update.c, so it does not make sense to their extern declarations in rcupdate.h. This commit therefore moves these extern declarations to update.c. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 ------- kernel/rcu/update.c | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7a206f039fc2..6e7e930c1610 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,13 +46,6 @@ #include #include -#include - -#ifndef CONFIG_TINY_RCU -extern int rcu_expedited; /* for sysctl */ -extern int rcu_normal; /* also for sysctl */ -#endif /* #ifndef CONFIG_TINY_RCU */ - #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 84dec2c8ad1b..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,7 +62,9 @@ #define MODULE_PARAM_PREFIX "rcupdate." #ifndef CONFIG_TINY_RCU +extern int rcu_expedited; /* from sysctl */ module_param(rcu_expedited, int, 0); +extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot; module_param(rcu_normal_after_boot, int, 0); -- cgit v1.3-14-g43fede From 25c36329a30c8cac090effe1fbae9bb916fa95fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 09:51:55 -0700 Subject: rcu: Move expediting-related access/control out of rcupdate.h The rcu_gp_is_normal(), rcu_gp_is_expedited(), rcu_expedite_gp(), and rcu_unexpedite_gp() functions are intended only for use within the RCU implementation itself -- the sysfs access is what should be used outside of RCU. This commit therefore moves the declarations for these functions to kernel/rcu/rcu.h, and also includes this file into kernel/rcu/rcutorture.c and kernel/rcu/rcuperf.c. This also has the beneficial effect of shrinking rcupdate.c a bit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 26 -------------------------- kernel/rcu/rcu.h | 26 ++++++++++++++++++++++++++ kernel/rcu/rcuperf.c | 2 ++ kernel/rcu/rcutorture.c | 2 ++ 4 files changed, 30 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6e7e930c1610..049c62c59f1b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,32 +46,6 @@ #include #include -#ifdef CONFIG_TINY_RCU -/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} -#else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_gp_is_normal(void); /* Internal RCU use. */ -bool rcu_gp_is_expedited(void); /* Internal RCU use. */ -void rcu_expedite_gp(void); -void rcu_unexpedite_gp(void); -void rcupdate_announce_bootup_oddness(void); -#endif /* #else #ifdef CONFIG_TINY_RCU */ - enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 73e16ec4054b..ceb78110db1b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -293,4 +293,30 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ +{ + return true; +} +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_normal(void); /* Internal RCU use. */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +void rcupdate_announce_bootup_oddness(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index d80f11d9f8bd..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -48,6 +48,8 @@ #include #include +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a58592b73f19..03cdf79e73d4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,8 @@ #include #include +#include "rcu.h" + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -- cgit v1.3-14-g43fede From cad7b3897279c869de61dc88133037b941f84233 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 10:22:57 -0700 Subject: rcu: Move torture-related definitions from rcupdate.h to rcu.h The include/linux/rcupdate.h file contains a number of definitions that are used only to communicate between rcutorture, rcuperf, and the RCU code itself. There is no point in having these definitions exposed globally throughout the kernel, so this commit moves them to kernel/rcu/rcu.h. This change has the added benefit of shrinking rcupdate.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 52 --------------------------- include/linux/rcutiny.h | 5 +++ include/linux/rcutree.h | 1 + include/linux/srcuclassic.h | 14 -------- include/linux/srcutiny.h | 12 ------- include/linux/srcutree.h | 4 --- kernel/rcu/rcu.h | 85 +++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 91 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 049c62c59f1b..7557499d8e70 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,58 +46,6 @@ #include #include -enum rcutorture_type { - RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, - RCU_TASKS_FLAVOR, - SRCU_FLAVOR, - INVALID_RCU_FLAVOR -}; - -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gpnum, unsigned long *completed); -void rcutorture_record_test_transition(void); -void rcutorture_record_progress(unsigned long vernum); -void do_trace_rcu_torture_read(const char *rcutorturename, - struct rcu_head *rhp, - unsigned long secs, - unsigned long c_old, - unsigned long c); -bool rcu_irq_enter_disabled(void); -#else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - *flags = 0; - *gpnum = 0; - *completed = 0; -} -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} -static inline bool rcu_irq_enter_disabled(void) -{ - return false; -} -#ifdef CONFIG_RCU_TRACE -void do_trace_rcu_torture_read(const char *rcutorturename, - struct rcu_head *rhp, - unsigned long secs, - unsigned long c_old, - unsigned long c); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ - do { } while (0) -#endif -#endif - #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 74d9c3a1feee..ade360e0d58c 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -202,6 +202,11 @@ static inline void rcu_irq_enter(void) { } +static inline bool rcu_irq_enter_disabled(void) +{ + return false; +} + static inline void rcu_irq_exit_irqson(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 0bacb6b2af69..28af91a19573 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -101,6 +101,7 @@ void rcu_irq_enter(void); void rcu_irq_exit(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); +bool rcu_irq_enter_disabled(void); void exit_rcu(void); diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h index 5753f7322262..41cf99930f34 100644 --- a/include/linux/srcuclassic.h +++ b/include/linux/srcuclassic.h @@ -98,18 +98,4 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->completed; - *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) - (*gpnum)++; -} - #endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b6edd9c8fdce..85bddce6a7a6 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -93,16 +93,4 @@ static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) return 0; } -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->srcu_gp_seq; - *gpnum = *completed; -} - #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 32e86d85fd11..f4adfed17b51 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -143,8 +143,4 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, unsigned long *completed); - #endif diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ceb78110db1b..f190fc1c8215 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -319,4 +319,89 @@ void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ +enum rcutorture_type { + RCU_FLAVOR, + RCU_BH_FLAVOR, + RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, + SRCU_FLAVOR, + INVALID_RCU_FLAVOR +}; + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) +void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, + unsigned long *gpnum, unsigned long *completed); +void rcutorture_record_test_transition(void); +void rcutorture_record_progress(unsigned long vernum); +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, + int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + *flags = 0; + *gpnum = 0; + *completed = 0; +} +static inline void rcutorture_record_test_transition(void) +{ +} +static inline void rcutorture_record_progress(unsigned long vernum) +{ +} +#ifdef CONFIG_RCU_TRACE +void do_trace_rcu_torture_read(const char *rcutorturename, + struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, + unsigned long c); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif +#endif + +#ifdef CONFIG_TINY_SRCU + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + +#elif defined(CONFIG_TREE_SRCU) + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#elif defined(CONFIG_CLASSIC_SRCU) + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + (*gpnum)++; +} + +#endif + #endif /* __LINUX_RCU_H */ -- cgit v1.3-14-g43fede From 791875d16e2f6e2e5b90328ccac643f512ac76c4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:06:05 -0700 Subject: rcu: Eliminate the unused __rcu_is_watching() function The __rcu_is_watching() function is currently not used, aside from to implement the rcu_is_watching() function. This commit therefore eliminates __rcu_is_watching(), which has the beneficial side-effect of shrinking include/linux/rcupdate.h a bit. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ---- include/linux/rcutiny.h | 11 ----------- kernel/rcu/tiny.c | 13 ------------- kernel/rcu/tree.c | 19 ++++--------------- 4 files changed, 4 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 415633076cb1..b4edfe0966c6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -204,10 +204,6 @@ do { \ rcu_note_voluntary_context_switch(current); \ } while (0) -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) -bool __rcu_is_watching(void); -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ - /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ade360e0d58c..5ed6934152a6 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -232,22 +232,11 @@ static inline void rcu_scheduler_starting(void) } #endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -static inline bool rcu_is_watching(void) -{ - return __rcu_is_watching(); -} - -#else /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - static inline bool rcu_is_watching(void) { return true; } -#endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index e5385731e391..2306cab2195d 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -59,19 +59,6 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL(rcu_barrier_sched); -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool notrace __rcu_is_watching(void) -{ - return true; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - /* * Helper function for rcu_sched_qs() and rcu_bh_qs(). * Also irqs are disabled to avoid confusion due to interrupt handlers diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5ebc830297c1..61a97164abcc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1138,23 +1138,12 @@ void rcu_nmi_exit(void) rcu_dynticks_eqs_enter(); } -/** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool notrace __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is idle * - * If the current CPU is in its idle loop and is neither in an interrupt + * Return true if RCU is watching the running CPU, which means that this + * CPU can safely enter RCU read-side critical sections. In other words, + * if the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ bool notrace rcu_is_watching(void) @@ -1162,7 +1151,7 @@ bool notrace rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = __rcu_is_watching(); + ret = !rcu_dynticks_curr_cpu_in_eqs(); preempt_enable_notrace(); return ret; } -- cgit v1.3-14-g43fede From 82118249d0ca4078d56d5e43172ada1567fdf946 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:13:24 -0700 Subject: rcu: Move the RCU_SCHEDULER_ definitions from rcupdate.h The RCU_SCHEDULER_INACTIVE, RCU_SCHEDULER_INIT, and RCU_SCHEDULER_RUNNING definitions are used only within RCU, so this commit moves them from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ---- kernel/rcu/rcu.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b4edfe0966c6..9206a28a2d44 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -217,10 +217,6 @@ do { \ #error "Unknown RCU implementation specified to kernel configuration" #endif -#define RCU_SCHEDULER_INACTIVE 0 -#define RCU_SCHEDULER_INIT 1 -#define RCU_SCHEDULER_RUNNING 2 - /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f190fc1c8215..17fee2a667d9 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -319,6 +319,10 @@ void rcu_unexpedite_gp(void); void rcupdate_announce_bootup_oddness(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, -- cgit v1.3-14-g43fede From fa3c66476975abf00c97f27b6c2b3d223f7d57f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 11:38:55 -0700 Subject: rcu: Improve __call_rcu() debug-objects error message The "__call_rcu(): Leaked duplicate callback" error message from __call_rcu() has proven to be unhelpful. This commit therefore changes it to "__call_rcu(): Double-freed CB" and adds the value of the pointer passed in. The value of the pointer improves debuggability by allowing correlation with tracing output, for example, the rcu:rcu_callback trace event. Reported-by: Vegard Nossum Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 61a97164abcc..cac24f5d3fd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3161,9 +3161,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ + /* + * Probable double call_rcu(), so leak the callback. + * Use rcu:rcu_callback trace event to find the previous + * time callback was passed to __call_rcu(). + */ + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); return; } head->func = func; -- cgit v1.3-14-g43fede From 3d54f7983f3e6ac9f444fa20970b1abc8f089b79 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 12:25:50 -0700 Subject: rcu: Move rcu_is_nocb_cpu() from rcupdate.h to rcu.h The rcu_is_nocb_cpu() function is used only internally to RCU. This commit therefore moves its declaration from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 9 --------- kernel/rcu/rcu.h | 8 ++++++++ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f105f0834bbe..003427425e27 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -867,15 +867,6 @@ static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) } #endif /* #ifdef CONFIG_TINY_RCU */ -#if defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline bool rcu_is_nocb_cpu(int cpu) { return true; } -#elif defined(CONFIG_RCU_NOCB_CPU) -bool rcu_is_nocb_cpu(int cpu); -#else -static inline bool rcu_is_nocb_cpu(int cpu) { return false; } -#endif - - /* Only for use by adaptive-ticks code. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE bool rcu_sys_is_idle(void); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 17fee2a667d9..2f344662c568 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -408,4 +408,12 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif +#if defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline bool rcu_is_nocb_cpu(int cpu) { return true; } +#elif defined(CONFIG_RCU_NOCB_CPU) +bool rcu_is_nocb_cpu(int cpu); +#else +static inline bool rcu_is_nocb_cpu(int cpu) { return false; } +#endif + #endif /* __LINUX_RCU_H */ -- cgit v1.3-14-g43fede From b8989b76052eedc99b09322efd6f68816f191a1a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 12:28:59 -0700 Subject: rcu: Move rcu_ftrace_dump() from rcupdate.h to rcu.h The rcu_ftrace_dump() function is used only internally to RCU. This commit therefore moves its declaration from include/linux/rcupdate.h to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 12 ------------ kernel/rcu/rcu.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 003427425e27..ad5e6934dcf3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -883,18 +883,6 @@ static inline void rcu_sysidle_force_exit(void) { } #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -/* - * Dump the ftrace buffer, but only one time per callsite per boot. - */ -#define rcu_ftrace_dump(oops_dump_mode) \ -do { \ - static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ - \ - if (!atomic_read(&___rfd_beenhere) && \ - !atomic_xchg(&___rfd_beenhere, 1)) \ - ftrace_dump(oops_dump_mode); \ -} while (0) - /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2f344662c568..cdbaa441bdac 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); */ #define TPS(x) tracepoint_string(x) +/* + * Dump the ftrace buffer, but only one time per callsite per boot. + */ +#define rcu_ftrace_dump(oops_dump_mode) \ +do { \ + static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ + \ + if (!atomic_read(&___rfd_beenhere) && \ + !atomic_xchg(&___rfd_beenhere, 1)) \ + ftrace_dump(oops_dump_mode); \ +} while (0) + void rcu_early_boot_tests(void); void rcu_test_sync_prims(void); -- cgit v1.3-14-g43fede From e3c8d51e1a58c73a557eb38a9a6afb4f704a3379 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 13:37:16 -0700 Subject: rcu: Move torture-related functions out of rcutiny.h and rcutree.h The various functions similar to rcu_batches_started(), the function show_rcu_gp_kthreads(), the various functions similar to rcu_force_quiescent_state(), and the variables rcutorture_testseq and rcutorture_vernum are used only within RCU. There is therefore no point in exporting them to the kernel at large from include/linux/rcutiny.h and include/linux/rcutree.h. This commit therefore moves all of these to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 80 --------------------------------------- include/linux/rcutree.h | 16 -------- kernel/rcu/rcu.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5ed6934152a6..0d9270913686 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -106,86 +106,6 @@ static inline void rcu_virt_note_context_switch(int cpu) { } -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - static inline void rcu_cpu_stall_reset(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 28af91a19573..43113323ca09 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -79,22 +79,6 @@ void cond_synchronize_rcu(unsigned long oldstate); unsigned long get_state_synchronize_sched(void); void cond_synchronize_sched(unsigned long oldstate); -extern unsigned long rcutorture_testseq; -extern unsigned long rcutorture_vernum; -unsigned long rcu_batches_started(void); -unsigned long rcu_batches_started_bh(void); -unsigned long rcu_batches_started_sched(void); -unsigned long rcu_batches_completed(void); -unsigned long rcu_batches_completed_bh(void); -unsigned long rcu_batches_completed_sched(void); -unsigned long rcu_exp_batches_completed(void); -unsigned long rcu_exp_batches_completed_sched(void); -void show_rcu_gp_kthreads(void); - -void rcu_force_quiescent_state(void); -void rcu_bh_force_quiescent_state(void); -void rcu_sched_force_quiescent_state(void); - void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index cdbaa441bdac..d849b371b32b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -420,6 +420,105 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif +#ifdef CONFIG_TINY_RCU + +/* + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. + */ +static inline unsigned long rcu_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods completed. + */ +static inline unsigned long rcu_batches_completed_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods completed. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + +/* + * Return the number of expedited grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed(void) +{ + return 0; +} + +/* + * Return the number of expedited sched grace periods completed. + */ +static inline unsigned long rcu_exp_batches_completed_sched(void) +{ + return 0; +} + +static inline void rcu_force_quiescent_state(void) +{ +} + +static inline void rcu_bh_force_quiescent_state(void) +{ +} + +static inline void rcu_sched_force_quiescent_state(void) +{ +} + +static inline void show_rcu_gp_kthreads(void) +{ +} + +#else /* #ifdef CONFIG_TINY_RCU */ +extern unsigned long rcutorture_testseq; +extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); +unsigned long rcu_exp_batches_completed(void); +unsigned long rcu_exp_batches_completed_sched(void); +void show_rcu_gp_kthreads(void); +void rcu_force_quiescent_state(void); +void rcu_bh_force_quiescent_state(void); +void rcu_sched_force_quiescent_state(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + #if defined(CONFIG_RCU_NOCB_CPU_ALL) static inline bool rcu_is_nocb_cpu(int cpu) { return true; } #elif defined(CONFIG_RCU_NOCB_CPU) -- cgit v1.3-14-g43fede From fe21a27e8ca0937a5ac298de1f4b46382e9c5c88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 13:45:51 -0700 Subject: rcu: Move rcu_request_urgent_qs_task() out of rcutiny.h and rcutree.h The rcu_request_urgent_qs_task() function is used only within RCU, so there is no point in exporting it to the rest of the kernel from nclude/linux/rcutiny.h and include/linux/rcutree.h. This commit therefore moves this function to kernel/rcu/rcu.h. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 4 ---- include/linux/rcutree.h | 3 --- kernel/rcu/rcu.h | 6 ++++++ 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0d9270913686..f5067941bc27 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -157,10 +157,6 @@ static inline bool rcu_is_watching(void) return true; } -static inline void rcu_request_urgent_qs_task(struct task_struct *t) -{ -} - static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 43113323ca09..d6aa89d15d47 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -91,10 +91,7 @@ void exit_rcu(void); void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; - bool rcu_is_watching(void); -void rcu_request_urgent_qs_task(struct task_struct *t); - void rcu_all_qs(void); /* RCUtree hotplug events */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index d849b371b32b..5b76a5baff2e 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -335,6 +335,12 @@ void rcupdate_announce_bootup_oddness(void); #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 +#ifdef CONFIG_TINY_RCU +static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } +#else /* #ifdef CONFIG_TINY_RCU */ +void rcu_request_urgent_qs_task(struct task_struct *t); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, -- cgit v1.3-14-g43fede From c350c008297643dad3c395c2fd92230142da5cf6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 May 2017 15:35:32 -0700 Subject: srcu: Prevent sdp->srcu_gp_seq_needed counter wrap If a given CPU never happens to ever start an SRCU grace period, the grace-period sequence counter might wrap. If this CPU were to decide to finally start a grace period, the state of its sdp->srcu_gp_seq_needed might make it appear that it has already requested this grace period, which would prevent starting the grace period. If no other CPU ever started a grace period again, this would look like a grace-period hang. Even if some other CPU took pity and started the needed grace period, the leaf rcu_node structure's ->srcu_data_have_cbs field won't have record of the fact that this CPU has a callback pending, which would look like a very localized grace-period hang. This might seem very unlikely, but SRCU grace periods can take less than a microsecond on small systems, which means that overflow can happen in much less than an hour on a 32-bit embedded system. And embedded systems are especially likely to have long-term idle CPUs. Therefore, it makes sense to prevent this scenario from happening. This commit therefore scans each srcu_data structure occasionally, with frequency controlled by the srcutree.counter_wrap_check kernel boot parameter. This parameter can be set to something like 255 in order to exercise the counter-wrap-prevention code. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ kernel/rcu/srcutree.c | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 01b5ab92d251..6671f9b60a86 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3810,6 +3810,15 @@ spia_pedr= spia_peddr= + srcutree.counter_wrap_check [KNL] + Specifies how frequently to check for + grace-period sequence counter wrap for the + srcu_data structure's ->srcu_gp_seq_needed field. + The greater the number of bits set in this kernel + parameter, the less frequently counter wrap will + be checked for. Note that the bottom two bits + are ignored. + srcutree.exp_holdoff [KNL] Specifies how many nanoseconds must elapse since the end of the last SRCU grace period for diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c6e2a4a1628b..cc06dbfc9692 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -45,6 +45,10 @@ static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; module_param(exp_holdoff, ulong, 0444); +/* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ +static ulong counter_wrap_check = (ULONG_MAX >> 2); +module_param(counter_wrap_check, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -496,10 +500,13 @@ static void srcu_gp_end(struct srcu_struct *sp) { unsigned long cbdelay; bool cbs; + int cpu; + unsigned long flags; unsigned long gpseq; int idx; int idxnext; unsigned long mask; + struct srcu_data *sdp; struct srcu_node *snp; /* Prevent more than one additional grace period. */ @@ -538,6 +545,17 @@ static void srcu_gp_end(struct srcu_struct *sp) smp_mb(); /* GP end before CB invocation. */ srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); } + + /* Occasionally prevent srcu_data counter wrap. */ + if (!(gpseq & counter_wrap_check)) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irqsave(&sdp->lock, flags); + if (ULONG_CMP_GE(gpseq, + sdp->srcu_gp_seq_needed + 100)) + sdp->srcu_gp_seq_needed = gpseq; + spin_unlock_irqrestore(&sdp->lock, flags); + } } /* Callback initiation done, allow grace periods after next. */ -- cgit v1.3-14-g43fede From 5a0465e17a18c467b712a816985b7b8dd8d10c16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 11:31:04 -0700 Subject: srcu: Shrink srcu.h by moving docbook and private function The call_srcu() docbook entry is currently in include/linux/srcu.h, which causes needless processing for each include point. This commit therefore moves this entry to kernel/rcu/srcutree.c, which the compiler reads only once. In addition, the srcu_batches_completed() function is used only within RCU and its torture-test suites. This commit therefore also moves this function's declaration from include/linux/srcutiny.h, include/linux/srcutree.h, and include/linux/srcuclassic.h to kernel/rcu/rcu.h. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 20 -------------------- include/linux/srcuclassic.h | 1 - include/linux/srcutiny.h | 5 ----- include/linux/srcutree.h | 1 - kernel/rcu/rcu.h | 6 ++++++ kernel/rcu/srcutree.c | 17 +++++++++++++++++ 6 files changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ea356d800675..5f509018e6b5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -65,32 +65,12 @@ int init_srcu_struct(struct srcu_struct *sp); #elif defined(CONFIG_SRCU) #error "Unknown SRCU implementation specified to kernel configuration" #else - /* Dummy definition for things like notifiers. Actual use gets link error. */ struct srcu_struct { }; - #endif -/** - * call_srcu() - Queue a callback for invocation after an SRCU grace period - * @sp: srcu_struct in queue the callback - * @head: structure to be used for queueing the SRCU callback. - * @func: function to be invoked after the SRCU grace period - * - * The callback function will be invoked some time after a full SRCU - * grace period elapses, in other words after all pre-existing SRCU - * read-side critical sections have completed. However, the callback - * function might well execute concurrently with other SRCU read-side - * critical sections that started after call_srcu() was invoked. SRCU - * read-side critical sections are delimited by srcu_read_lock() and - * srcu_read_unlock(), and may be nested. - * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. - */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)); - void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h index 41cf99930f34..67db4a36ef0d 100644 --- a/include/linux/srcuclassic.h +++ b/include/linux/srcuclassic.h @@ -96,6 +96,5 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); #endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 85bddce6a7a6..4c53e698c6e4 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -88,9 +88,4 @@ static inline void srcu_barrier(struct srcu_struct *sp) synchronize_srcu(sp); } -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f4adfed17b51..24e949bda12a 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -141,6 +141,5 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); #endif diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5b76a5baff2e..74d9fc205313 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -492,6 +492,11 @@ static inline unsigned long rcu_exp_batches_completed_sched(void) return 0; } +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } @@ -519,6 +524,7 @@ unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); unsigned long rcu_exp_batches_completed(void); unsigned long rcu_exp_batches_completed_sched(void); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cc06dbfc9692..66a998f9c5a7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -854,6 +854,23 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, srcu_funnel_exp_start(sp, sdp->mynode, s); } +/** + * call_srcu() - Queue a callback for invocation after an SRCU grace period + * @sp: srcu_struct in queue the callback + * @head: structure to be used for queueing the SRCU callback. + * @func: function to be invoked after the SRCU grace period + * + * The callback function will be invoked some time after a full SRCU + * grace period elapses, in other words after all pre-existing SRCU + * read-side critical sections have completed. However, the callback + * function might well execute concurrently with other SRCU read-side + * critical sections that started after call_srcu() was invoked. SRCU + * read-side critical sections are delimited by srcu_read_lock() and + * srcu_read_unlock(), and may be nested. + * + * The callback will be invoked from process context, but must nevertheless + * be fast and must not block. + */ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { -- cgit v1.3-14-g43fede From 2464dd940e23bad227c387a40eec99f7aa02ed96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 14:29:16 -0700 Subject: srcu: Apply trivial callback lists to shrink Tiny SRCU The rcu_segcblist structure provides quite a bit of functionality, and Tiny SRCU needs almost none of it. So this commit replaces Tiny SRCU's uses of rcu_segcblist with a simple singly linked list with tail pointer. This change significantly reduces Tiny SRCU's memory footprint, more than making up for the growth caused by the creation of rcu_segcblist.c Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 7 +++-- init/Kconfig | 2 +- kernel/rcu/rcu.h | 2 +- kernel/rcu/srcutiny.c | 70 ++++++++++++++++++++++-------------------------- 4 files changed, 37 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4c53e698c6e4..cfbfc540cafc 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -33,9 +33,8 @@ struct srcu_struct { u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; /* Last srcu_read_unlock() wakes GP. */ - unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ - struct rcu_segcblist srcu_cblist; - /* Pending SRCU callbacks. */ + struct rcu_head *srcu_cb_head; /* Pending callbacks: Head. */ + struct rcu_head **srcu_cb_tail; /* Pending callbacks: Tail. */ struct work_struct srcu_work; /* For driving grace periods. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -47,7 +46,7 @@ void srcu_drive_gp(struct work_struct *wp); #define __SRCU_STRUCT_INIT(name) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_cb_tail = &name.srcu_cb_head, \ .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/init/Kconfig b/init/Kconfig index d928a3724af9..a2cfde19e8b8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -573,7 +573,7 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU ) + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) config CONTEXT_TRACKING bool diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 74d9fc205313..6a1e85bd2eac 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -398,7 +398,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, if (test_type != SRCU_FLAVOR) return; *flags = 0; - *completed = sp->srcu_gp_seq; + *completed = sp->srcu_idx; *gpnum = *completed; } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 988543721d5d..1a1c1047d2ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->srcu_lock_nesting[0] = 0; sp->srcu_lock_nesting[1] = 0; init_swait_queue_head(&sp->srcu_wq); - sp->srcu_gp_seq = 0; - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; sp->srcu_gp_running = false; sp->srcu_gp_waiting = false; sp->srcu_idx = 0; @@ -88,10 +88,10 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); flush_work(&sp->srcu_work); - WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); - WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); + WARN_ON(sp->srcu_cb_head); + WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -117,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); void srcu_drive_gp(struct work_struct *wp) { int idx; - struct rcu_cblist ready_cbs; - struct srcu_struct *sp; + struct rcu_head *lh; struct rcu_head *rhp; + struct srcu_struct *sp; sp = container_of(wp, struct srcu_struct, srcu_work); - if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) return; /* Already running or nothing to do. */ - /* Tag recently arrived callbacks and wait for readers. */ + /* Remove recently arrived callbacks and wait for readers. */ WRITE_ONCE(sp->srcu_gp_running, true); - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); + local_irq_disable(); + lh = sp->srcu_cb_head; + sp->srcu_cb_head = NULL; + sp->srcu_cb_tail = &sp->srcu_cb_head; + local_irq_enable(); idx = sp->srcu_idx; WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ - rcu_seq_end(&sp->srcu_gp_seq); - - /* Update callback list based on GP, and invoke ready callbacks. */ - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - rcu_cblist_init(&ready_cbs); - local_irq_disable(); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - } - local_irq_disable(); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - local_irq_enable(); + + /* Invoke the callbacks we removed above. */ + while (lh) { + rhp = lh; + lh = lh->next; + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); } - WRITE_ONCE(sp->srcu_gp_running, false); /* - * If more callbacks, reschedule ourselves. This can race with - * a call_srcu() at interrupt level, but the ->srcu_gp_running - * checks will straighten that out. + * Enable rescheduling, and if there are more callbacks, + * reschedule ourselves. This can race with a call_srcu() + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. */ - if (!rcu_segcblist_empty(&sp->srcu_cblist)) + WRITE_ONCE(sp->srcu_gp_running, false); + if (READ_ONCE(sp->srcu_cb_head)) schedule_work(&sp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); @@ -171,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - head->func = func; + rhp->func = func; + rhp->next = NULL; local_irq_save(flags); - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + *sp->srcu_cb_tail = rhp; + sp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); if (!READ_ONCE(sp->srcu_gp_running)) schedule_work(&sp->srcu_work); -- cgit v1.3-14-g43fede From 681fbec881dea1848e9246d7d1ecb3b97f11026d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 May 2017 15:44:38 -0700 Subject: lockdep: Use consistent printing primitives Commit a5dd63efda3d ("lockdep: Use "WARNING" tag on lockdep splats") substituted pr_warn() for printk() in places called out by Dmitry Vyukov. However, this resulted in an ugly mix of pr_warn() and printk(). This commit therefore changes printk() to pr_warn() or pr_cont(), depending on the absence or presence of KERN_CONT. This is done in all functions that had printk() changed to pr_warn() by the aforementioned commit. Reported-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/locking/lockdep.c | 172 +++++++++++++++++++++++------------------------ 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0e31bfee25c..cceb9534338a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("======================================================\n"); pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); pr_warn("------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); + pr_warn("\nwhich lock already depends on the new lock.\n\n"); + pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); pr_warn("-----------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr, curr->softirqs_enabled); print_lock(next); - printk("\nand this task is already holding:\n"); + pr_warn("\nand this task is already holding:\n"); print_lock(prev); - printk("which would create a new lock dependency:\n"); + pr_warn("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(KERN_CONT " ->"); + pr_cont(" ->"); print_lock_name(hlock_class(next)); - printk(KERN_CONT "\n"); + pr_cont("\n"); - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); + pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - printk("\nto a %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); + pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); + pr_warn("..."); print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - printk("\nother info that might help us debug this:\n\n"); + pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, hlock_class(prev), hlock_class(next)); lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); + pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); + pr_warn("\nthe dependencies between the lock to be acquired"); + pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) return 0; print_shortest_lock_dependencies(forwards_entry, next_root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", + pr_warn("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); - printk("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { - printk("\n"); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); pr_warn("----------------------------\n"); - printk("%s/%d: ", current->comm, task_pid_nr(current)); - printk("Hash chain already cached but the contents don't match!\n"); + pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); + pr_warn("Hash chain already cached but the contents don't match!\n"); - printk("Held locks:"); + pr_warn("Held locks:"); print_chain_keys_held_locks(curr, hlock_next); - printk("Locks in cached chain:"); + pr_warn("Locks in cached chain:"); print_chain_keys_chain(chain); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } #endif @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); pr_warn("--------------------------------\n"); - printk("inconsistent {%s} -> {%s} usage.\n", + pr_warn("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", curr->comm, task_pid_nr(curr), trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, trace_softirqs_enabled(curr)); print_lock(this); - printk("{%s} state was registered at:\n", usage_str[prev_bit]); + pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); print_usage_bug_scenario(this); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); pr_warn("--------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", + pr_warn("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); /* Find a middle lock (if one exists) */ depth = get_lock_depth(other); do { if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); + pr_warn("lockdep:%s bad path found in chain graph\n", __func__); break; } middle = entry; @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) return 0; print_shortest_lock_dependencies(other, root); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); pr_warn("----------------------------------\n"); - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); + pr_warn("\nbut this task is not holding:\n"); + pr_warn("%s\n", hlock->nest_lock->name); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); - printk("\nother info that might help us debug this:\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); pr_warn("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", + pr_warn("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no more locks to release!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n"); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); pr_warn("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", + pr_warn("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(KERN_CONT ") at:\n"); + pr_cont(") at:\n"); print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); + pr_warn("but there are no locks held!\n"); + pr_warn("\nother info that might help us debug this:\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); return 0; @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); pr_warn("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; - printk("\n"); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) int unlock = 1; if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); + pr_warn("INFO: lockdep is turned off.\n"); return; } - printk("\nShowing all locks held in the system:\n"); + pr_warn("\nShowing all locks held in the system:\n"); /* * Here we try to get the tasklist_lock as hard as possible, @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) retry: if (!read_trylock(&tasklist_lock)) { if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); + pr_warn("hm, tasklist_lock locked, retrying... "); if (count) { count--; - printk(" #%d", 10-count); + pr_cont(" #%d", 10-count); mdelay(200); goto retry; } - printk(" ignoring it.\n"); + pr_cont(" ignoring it.\n"); unlock = 0; } else { if (count != 10) - printk(KERN_CONT " locked it.\n"); + pr_cont(" locked it.\n"); } do_each_thread(g, p) { @@ -4445,7 +4445,7 @@ retry: unlock = 1; } while_each_thread(g, p); - printk("\n"); + pr_warn("\n"); pr_warn("=============================================\n\n"); if (unlock) @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n"); + pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); pr_warn("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", + pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } @@ -4495,14 +4495,14 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) return; #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); pr_warn("-----------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("%s:%d %s!\n", file, line, s); + pr_warn("\nother info that might help us debug this:\n\n"); + pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() @@ -4529,10 +4529,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) * rcu_read_lock_bh() and so on from extended quiescent states. */ if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); + pr_warn("RCU used illegally from extended quiescent state!\n"); lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); + pr_warn("\nstack backtrace:\n"); dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); -- cgit v1.3-14-g43fede From bf32c76540257f9f5f2cf661dbdd8bb4a4bd8c82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 12:05:46 -0700 Subject: rcu: Convert rnp->lock wrappers to macros for SRCU use Use of smp_mb__after_unlock_lock() would allow SRCU to omit a full memory barrier during callback execution, so this commit converts raw_spin_lock_rcu_node() from inline functions to type-generic macros to allow them to handle locks in srcu_node structures as well as rcu_node structures. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ddfa34d020ba..a7f63f1074b4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -580,27 +580,22 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * As ->lock of struct rcu_node is a __private field, therefore one should use * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ -static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) -static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); -} +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) -static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); - smp_mb__after_unlock_lock(); -} +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) -static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) -{ - raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); -} +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) #define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ do { \ @@ -615,11 +610,11 @@ do { \ raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) -static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) -{ - bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); - - if (locked) - smp_mb__after_unlock_lock(); - return locked; -} +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) -- cgit v1.3-14-g43fede From 83d40bd3bc3ab3d6b5a4a331f7667d627948a099 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 13:28:51 -0700 Subject: rcu: Move rnp->lock wrappers for SRCU use This commit moves the now-generic rnp->lock wrapper macros from kernel/rcu/tree.h to kernel/rcu/rcu.h, thus allowing SRCU to use them. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.h | 53 ----------------------------------------------------- 2 files changed, 53 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6a1e85bd2eac..2a75beb883c8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -303,6 +303,59 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) cpu <= rnp->grphi; \ cpu = cpumask_next((cpu), cpu_possible_mask)) +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +#define raw_spin_lock_rcu_node(p) \ +do { \ + raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irq_rcu_node(p) \ +do { \ + raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irq_rcu_node(p) \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ +} while (0) + +#define raw_spin_trylock_rcu_node(p) \ +({ \ + bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ #ifdef CONFIG_TINY_RCU diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a7f63f1074b4..baa0bac8da2a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -565,56 +565,3 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ - -/* - * Wrappers for the rcu_node::lock acquire and release. - * - * Because the rcu_nodes form a tree, the tree traversal locking will observe - * different lock values, this in turn means that an UNLOCK of one level - * followed by a LOCK of another level does not imply a full memory barrier; - * and most importantly transitivity is lost. - * - * In order to restore full ordering between tree levels, augment the regular - * lock acquire functions with smp_mb__after_unlock_lock(). - * - * As ->lock of struct rcu_node is a __private field, therefore one should use - * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. - */ -#define raw_spin_lock_rcu_node(p) \ -do { \ - raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) - -#define raw_spin_lock_irq_rcu_node(p) \ -do { \ - raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) - -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) - -#define raw_spin_trylock_rcu_node(p) \ -({ \ - bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ - \ - if (___locked) \ - smp_mb__after_unlock_lock(); \ - ___locked; \ -}) -- cgit v1.3-14-g43fede From a3883df3935e10caa8297719d85fa8eaff7cabbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 May 2017 15:00:14 -0700 Subject: srcu: Use rnp->lock wrappers to replace explicit memory barriers This commit uses TREE RCU's rnp->lock wrappers to replace a few explicit memory barriers. This change also has the advantage of making SRCU's memory-ordering properties be implemented in roughly the same way as they are in Tree RCU. Signed-off-by: Paul E. McKenney --- include/linux/srcutree.h | 8 ++--- kernel/rcu/srcutree.c | 91 +++++++++++++++++++++++------------------------- 2 files changed, 47 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 24e949bda12a..42973f787e7e 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -40,7 +40,7 @@ struct srcu_data { unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ /* Update-side state. */ - spinlock_t lock ____cacheline_internodealigned_in_smp; + raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ @@ -58,7 +58,7 @@ struct srcu_data { * Node in SRCU combining tree, similar in function to rcu_data. */ struct srcu_node { - spinlock_t lock; + raw_spinlock_t __private lock; unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ @@ -78,7 +78,7 @@ struct srcu_struct { struct srcu_node *level[RCU_NUM_LVLS + 1]; /* First node at each level. */ struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ - spinlock_t gp_lock; /* protect ->srcu_cblist */ + raw_spinlock_t __private lock; /* Protect counters */ struct mutex srcu_gp_mutex; /* Serialize GP work. */ unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ @@ -109,7 +109,7 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ - .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 66a998f9c5a7..d0ca524bf042 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -76,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_init(&snp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -110,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_init(&sdp->lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -169,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -186,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - spin_lock_init(&sp->gp_lock); + raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * First-use initialization of statically allocated srcu_struct * structure. Wiring up the combining tree is more than can be * done with compile-time initialization, so this check is added - * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * to each update-side SRCU primitive. Use sp->lock, which -is- * compile-time initialized, to resolve races involving multiple * CPUs trying to garner first-use privileges. */ @@ -209,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -411,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp) struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), - "Invoked srcu_gp_start() without ->gp_lock!"); + lockdep_assert_held(&sp->lock); WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -513,7 +512,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - spin_lock_irq(&snp->lock); + raw_spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,21 +539,19 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq(&snp->lock); - if (cbs) { - smp_mb(); /* GP end before CB invocation. */ + raw_spin_unlock_irq_rcu_node(snp); + if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); - } /* Occasionally prevent srcu_data counter wrap. */ if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irqsave(&sdp->lock, flags); + raw_spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -562,17 +559,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); } } @@ -592,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -625,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - spin_lock_irqsave(&snp->lock, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { - smp_mb(); /* CBs after GP! */ srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); @@ -647,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - spin_unlock_irqrestore(&snp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave(&sp->gp_lock, flags); + raw_spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -670,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - spin_unlock_irqrestore(&sp->gp_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -833,7 +829,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - spin_lock(&sdp->lock); + raw_spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -847,7 +843,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore(&sdp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -1018,7 +1014,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1027,7 +1023,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1076,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1135,20 +1131,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - spin_lock_irq(&sdp->lock); - smp_mb(); /* Old grace periods before callback invocation! */ + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1161,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - spin_lock_irq(&sdp->lock); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq(&sdp->lock); + raw_spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1180,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - spin_lock_irq(&sp->gp_lock); + raw_spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1190,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - spin_unlock_irq(&sp->gp_lock); + raw_spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); -- cgit v1.3-14-g43fede From 90040c9e3015054db7efa0101afdd446d1167fe8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 May 2017 14:36:55 -0700 Subject: rcu: Remove *_SLOW_* Kconfig options The RCU_TORTURE_TEST_SLOW_PREINIT, RCU_TORTURE_TEST_SLOW_PREINIT_DELAY, RCU_TORTURE_TEST_SLOW_PREINIT_DELAY, RCU_TORTURE_TEST_SLOW_INIT, RCU_TORTURE_TEST_SLOW_INIT_DELAY, RCU_TORTURE_TEST_SLOW_CLEANUP, and RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY Kconfig options are only useful for torture testing, and there are the rcutree.gp_cleanup_delay, rcutree.gp_init_delay, and rcutree.gp_preinit_delay kernel boot parameters that rcutorture can use instead. The effect of these parameters is to artificially slow down grace period initialization and cleanup in order to make some types of race conditions happen more often. This commit therefore simplifies Tree RCU a bit by removing the Kconfig options and adding the corresponding kernel parameters to rcutorture's .boot files instead. However, this commit also leaves out the kernel parameters for TREE02, TREE04, and TREE07 in order to have about the same number of tests slowed as not slowed. TREE01, TREE03, TREE05, and TREE06 are slowed, and the rest are not slowed. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 10 +-- kernel/rcu/tree.c | 26 ++------ kernel/rcu/tree_plugin.h | 6 +- lib/Kconfig.debug | 75 ---------------------- .../selftests/rcutorture/configs/rcu/TREE01 | 3 - .../selftests/rcutorture/configs/rcu/TREE01.boot | 3 + .../selftests/rcutorture/configs/rcu/TREE02 | 3 - .../selftests/rcutorture/configs/rcu/TREE03 | 3 - .../selftests/rcutorture/configs/rcu/TREE03.boot | 3 + .../selftests/rcutorture/configs/rcu/TREE04 | 3 - .../selftests/rcutorture/configs/rcu/TREE05 | 3 - .../selftests/rcutorture/configs/rcu/TREE05.boot | 3 + .../selftests/rcutorture/configs/rcu/TREE06 | 3 - .../selftests/rcutorture/configs/rcu/TREE06.boot | 3 + .../selftests/rcutorture/configs/rcu/TREE07 | 3 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 9 --- 16 files changed, 24 insertions(+), 135 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6671f9b60a86..f85bfe02f052 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3229,21 +3229,17 @@ rcutree.gp_cleanup_delay= [KNL] Set the number of jiffies to delay each step of - RCU grace-period cleanup. This only has effect - when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set. + RCU grace-period cleanup. rcutree.gp_init_delay= [KNL] Set the number of jiffies to delay each step of - RCU grace-period initialization. This only has - effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT - is set. + RCU grace-period initialization. rcutree.gp_preinit_delay= [KNL] Set the number of jiffies to delay each step of RCU grace-period pre-initialization, that is, the propagation of recent CPU-hotplug changes up - the rcu_node combining tree. This only has effect - when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set. + the rcu_node combining tree. rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cac24f5d3fd2..bbbddd85906b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -177,26 +177,12 @@ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; -module_param(gp_preinit_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ -static const int gp_preinit_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT -static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; -module_param(gp_init_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ -static const int gp_init_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ - -#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; -module_param(gp_cleanup_delay, int, 0644); -#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ -static const int gp_cleanup_delay; -#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ +static int gp_preinit_delay; +module_param(gp_preinit_delay, int, 0444); +static int gp_init_delay; +module_param(gp_init_delay, int, 0444); +static int gp_cleanup_delay; +module_param(gp_cleanup_delay, int, 0444); /* * Number of grace periods between delays, normalized by the duration of diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 116cf8339826..0553d9fed7d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -109,11 +109,11 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT)) + if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)) + if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP)) + if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e4587ebe52c7..960c5d2d3c03 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1373,81 +1373,6 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. -config RCU_TORTURE_TEST_SLOW_PREINIT - bool "Slow down RCU grace-period pre-initialization to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period pre-initialization (the - propagation of CPU-hotplug changes up the rcu_node combining - tree) for a few jiffies between initializing each pair of - consecutive rcu_node structures. This helps to expose races - involving grace-period pre-initialization, in other words, it - makes your kernel less stable. It can also greatly increase - grace-period latency, especially on systems with large numbers - of CPUs. This is useful when torture-testing RCU, but in - almost no other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_PREINIT_DELAY - int "How much to slow down RCU grace-period pre-initialization" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_PREINIT - help - This option specifies the number of jiffies to wait between - each rcu_node structure pre-initialization step. - -config RCU_TORTURE_TEST_SLOW_INIT - bool "Slow down RCU grace-period initialization to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period initialization for a few - jiffies between initializing each pair of consecutive - rcu_node structures. This helps to expose races involving - grace-period initialization, in other words, it makes your - kernel less stable. It can also greatly increase grace-period - latency, especially on systems with large numbers of CPUs. - This is useful when torture-testing RCU, but in almost no - other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_INIT_DELAY - int "How much to slow down RCU grace-period initialization" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_INIT - help - This option specifies the number of jiffies to wait between - each rcu_node structure initialization. - -config RCU_TORTURE_TEST_SLOW_CLEANUP - bool "Slow down RCU grace-period cleanup to expose races" - depends on RCU_TORTURE_TEST - help - This option delays grace-period cleanup for a few jiffies - between cleaning up each pair of consecutive rcu_node - structures. This helps to expose races involving grace-period - cleanup, in other words, it makes your kernel less stable. - It can also greatly increase grace-period latency, especially - on systems with large numbers of CPUs. This is useful when - torture-testing RCU, but in almost no other circumstance. - - Say Y here if you want your system to crash and hang more often. - Say N if you want a sane system. - -config RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY - int "How much to slow down RCU grace-period cleanup" - range 0 5 - default 3 - depends on RCU_TORTURE_TEST_SLOW_CLEANUP - help - This option specifies the number of jiffies to wait between - each rcu_node structure cleanup operation. - config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index cc6c5815236e..92ca49f90ef9 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -17,6 +17,3 @@ CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot index adc3abc82fb8..89705ed79596 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot @@ -1 +1,4 @@ rcutorture.torture_type=rcu_bh maxcpus=8 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 index 1cecab330ba0..35e639e39366 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02 @@ -19,8 +19,5 @@ CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 index 3b93ee544e70..7a17c503b382 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 @@ -17,6 +17,3 @@ CONFIG_RCU_BOOST=y CONFIG_RCU_KTHREAD_PRIO=2 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 120c0c88d100..9ef3aed126e9 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -1 +1,4 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index 851c01ae2cea..27d22695d64c 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -18,7 +18,4 @@ CONFIG_RCU_FANOUT_LEAF=3 CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y CONFIG_RCU_EQS_DEBUG=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index d4cdc0d74e16..1257d3227b1e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -19,6 +19,3 @@ CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot index 15b3e1a86f74..c7fd050dfcd9 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot @@ -1,2 +1,5 @@ rcutorture.torture_type=sched rcupdate.rcu_self_test_sched=1 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 index 9215827649bd..05a4eec3f27b 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06 @@ -21,6 +21,3 @@ CONFIG_PROVE_LOCKING=y CONFIG_DEBUG_OBJECTS=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=y CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot index dd90f28ed700..ad18b52a2cad 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot @@ -2,3 +2,6 @@ rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1 rcutree.rcu_fanout_exact=1 +rcutree.gp_preinit_delay=3 +rcutree.gp_init_delay=3 +rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index 99f04e4c5162..b9ddd3beeb9a 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -17,6 +17,3 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP=y -CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT=y diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 364801b1a230..1dfec4657d95 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -28,9 +28,6 @@ CONFIG_RCU_TRACE -- Do half. CONFIG_SMP -- Need one !SMP for PREEMPT_RCU. CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations. CONFIG_RCU_EQS_DEBUG -- Do at least one for CONFIG_NO_HZ_FULL and not. -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP -- Do for all but a couple TREE scenarios. -CONFIG_RCU_TORTURE_TEST_SLOW_INIT -- Do for all but a couple TREE scenarios. -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT -- Do for all but a couple TREE scenarios. RCU-bh: Do one with PREEMPT and one with !PREEMPT. RCU-sched: Do one with PREEMPT but not BOOST. @@ -78,12 +75,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE Always used in KVM testing. -CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY -CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY -CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY - - Inspection suffices, ignore. - CONFIG_PREEMPT_RCU CONFIG_TREE_RCU CONFIG_TINY_RCU -- cgit v1.3-14-g43fede From f7a10a975036ef9ca957bfe12ab2d4b1a46cccd1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 May 2017 15:57:16 -0700 Subject: rcu: Remove the RCU_KTHREAD_PRIO Kconfig option Anything that can be done with the RCU_KTHREAD_PRIO Kconfig option can also be done with the rcutree.kthread_prio kernel boot parameter. This commit therefore removes this Kconfig option. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Rik van Riel --- init/Kconfig | 31 ---------------------- kernel/rcu/tree.c | 4 --- .../selftests/rcutorture/configs/rcu/TREE03 | 1 - .../selftests/rcutorture/configs/rcu/TREE03.boot | 1 + .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 5 files changed, 1 insertion(+), 37 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index a2cfde19e8b8..6f257d51f582 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -697,37 +697,6 @@ config RCU_BOOST Say Y here if you are working with real-time apps or heavy loads Say N here if you are unsure. -config RCU_KTHREAD_PRIO - int "Real-time priority to use for RCU worker threads" - range 1 99 if RCU_BOOST - range 0 99 if !RCU_BOOST - default 1 if RCU_BOOST - default 0 if !RCU_BOOST - depends on RCU_EXPERT - help - This option specifies the SCHED_FIFO priority value that will be - assigned to the rcuc/n and rcub/n threads and is also the value - used for RCU_BOOST (if enabled). If you are working with a - real-time application that has one or more CPU-bound threads - running at a real-time priority level, you should set - RCU_KTHREAD_PRIO to a priority higher than the highest-priority - real-time CPU-bound application thread. The default RCU_KTHREAD_PRIO - value of 1 is appropriate in the common case, which is real-time - applications that do not have any CPU-bound threads. - - Some real-time applications might not have a single real-time - thread that saturates a given CPU, but instead might have - multiple real-time threads that, taken together, fully utilize - that CPU. In this case, you should set RCU_KTHREAD_PRIO to - a priority higher than the lowest-priority thread that is - conspiring to prevent the CPU from running any non-real-time - tasks. For example, if one thread at priority 10 and another - thread at priority 5 are between themselves fully consuming - the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be - set to priority 6 or higher. - - Specify the real-time priority, or take the default if unsure. - config RCU_BOOST_DELAY int "Milliseconds to delay boosting after RCU grace-period start" range 0 3000 diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbbddd85906b..187ac3f41526 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -168,11 +168,7 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ -#ifdef CONFIG_RCU_KTHREAD_PRIO -static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; -#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ module_param(kthread_prio, int, 0644); /* Delay in jiffies for grace-period initialization delays, debug only. */ diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 index 7a17c503b382..2dc31b16e506 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03 @@ -14,6 +14,5 @@ CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=y -CONFIG_RCU_KTHREAD_PRIO=2 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 9ef3aed126e9..5d2cc0bd50a0 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -2,3 +2,4 @@ rcutorture.onoff_interval=1 rcutorture.onoff_holdoff=30 rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 +rcutree.kthread_prio=2 diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 1dfec4657d95..b5ea8489969a 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -16,7 +16,6 @@ CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. CONFIG_PROVE_RCU_REPEATEDLY -- Do one. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. -CONFIG_RCU_KTHREAD_PRIO -- set to 2 for _BOOST testing. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL. -- cgit v1.3-14-g43fede From fe5ac724d81a3c7803e60c2232718f212f3f38d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 May 2017 11:26:22 -0700 Subject: rcu: Remove nohz_full full-system-idle state machine The NO_HZ_FULL_SYSIDLE full-system-idle capability was added in 2013 by commit 0edd1b1784cb ("nohz_full: Add full-system-idle state machine"), but has not been used. This commit therefore removes it. If it turns out to be needed later, this commit can always be reverted. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Rik van Riel Cc: Ingo Molnar Acked-by: Linus Torvalds --- .../RCU/Design/Requirements/Requirements.html | 6 +- include/linux/rcupdate.h | 9 - kernel/rcu/tree.c | 41 +- kernel/rcu/tree.h | 16 - kernel/rcu/tree_plugin.h | 429 --------------------- kernel/time/Kconfig | 50 --- .../selftests/rcutorture/configs/rcu/TREE07 | 1 - .../testing/selftests/rcutorture/doc/TINY_RCU.txt | 1 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 7 +- 9 files changed, 9 insertions(+), 551 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index cb614f2a69c2..8c94fc1d1c84 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2520,11 +2520,7 @@ It is similarly socially unacceptable to interrupt an nohz_full CPU running in userspace. RCU must therefore track nohz_full userspace execution. -And in -CONFIG_NO_HZ_FULL_SYSIDLE=y -kernels, RCU must separately track idle CPUs on the one hand and -CPUs that are either idle or executing in userspace on the other. -In both cases, RCU must be able to sample state at two points in +RCU must therefore be able to sample state at two points in time, and be able to determine whether or not some other CPU spent any time idle and/or executing in userspace. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ee40d7eba741..7f24a5e673f5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -854,15 +854,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -/* Only for use by adaptive-ticks code. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE -bool rcu_sys_is_idle(void); -void rcu_sysidle_force_exit(void); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -static inline bool rcu_sys_is_idle(void) { return false; } -static inline void rcu_sysidle_force_exit(void) { } -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Place this after a lock-acquisition primitive to guarantee that diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 187ac3f41526..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -270,10 +270,6 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; /* @@ -546,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -854,7 +847,6 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); - rcu_sysidle_enter(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -904,7 +896,6 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); rdtp->dynticks_nesting--; } - rcu_sysidle_enter(1); } /* @@ -986,7 +977,6 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); - rcu_sysidle_exit(0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -1038,7 +1028,6 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(oldval, true); - rcu_sysidle_exit(1); } /* @@ -1217,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int dyntick_save_progress_counter(struct rcu_data *rdp) { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, @@ -1238,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; bool *rnhqp; @@ -2105,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) */ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - bool isidle = false; - unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; if (first_time) { /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = true; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); + force_qs_rnp(rsp, dyntick_save_progress_counter); } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = true; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) { int cpu; unsigned long flags; @@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + if (f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } } @@ -3793,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index baa0bac8da2a..2c112bb11aa8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -45,14 +45,6 @@ struct rcu_dynticks { bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -529,15 +521,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(int irq); -static void rcu_sysidle_exit(int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0553d9fed7d7..f524d967f7b6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2563,429 +2563,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #endif /* #ifdef CONFIG_NO_HZ_FULL */ } - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(int irq) -{ - unsigned long j; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_enter() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = READ_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(int irq) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_exit() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts, - * and must be running on tick_do_timer_cpu. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sysidle_check_cpu() invoked with irqs enabled!!!"); - - /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ - if (!tick_nohz_full_enabled()) - return; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_state_p || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - /* Verify affinity of current kthread. */ - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = READ_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_state_p; -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (READ_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - if (full_sysidle_state > RCU_SYSIDLE_SHORT) - WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_state_p) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - /* If there are no nohz_full= CPUs, no need to track this. */ - if (!tick_nohz_full_enabled()) - return; - - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - WRITE_ONCE(rshp->inuse, 0); -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. This is not intended to be - * called unless tick_nohz_full_enabled(). - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = READ_ONCE(full_sysidle_state); - - RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_sys_is_idle() invoked with irqs enabled!!!"); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_state_p->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_state_p, isidle, maxj, false); - oldrss = rss; - rss = READ_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_state_p) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(int irq) -{ -} - -static void rcu_sysidle_exit(int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -3016,13 +2593,7 @@ static void rcu_bind_gp_kthread(void) if (!tick_nohz_full_enabled()) return; -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ housekeeping_affine(current); -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } /* Record the current task on dyntick-idle entry. */ diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. -config NO_HZ_FULL_SYSIDLE - bool "Detect full-system idle state for full dynticks system" - depends on NO_HZ_FULL - default n - help - At least one CPU must keep the scheduling-clock tick running for - timekeeping purposes whenever there is a non-idle CPU, where - "non-idle" also includes dynticks CPUs as long as they are - running non-idle tasks. Because the underlying adaptive-tick - support cannot distinguish between all CPUs being idle and - all CPUs each running a single task in dynticks mode, the - underlying support simply ensures that there is always a CPU - handling the scheduling-clock tick, whether or not all CPUs - are idle. This Kconfig option enables scalable detection of - the all-CPUs-idle state, thus allowing the scheduling-clock - tick to be disabled when all CPUs are idle. Note that scalable - detection of the all-CPUs-idle state means that larger systems - will be slower to declare the all-CPUs-idle state. - - Say Y if you would like to help debug all-CPUs-idle detection. - - Say N if you are unsure. - -config NO_HZ_FULL_SYSIDLE_SMALL - int "Number of CPUs above which large-system approach is used" - depends on NO_HZ_FULL_SYSIDLE - range 1 NR_CPUS - default 8 - help - The full-system idle detection mechanism takes a lazy approach - on large systems, as is required to attain decent scalability. - However, on smaller systems, scalability is not anywhere near as - large a concern as is energy efficiency. The sysidle subsystem - therefore uses a fast but non-scalable algorithm for small - systems and a lazier but scalable algorithm for large systems. - This Kconfig parameter defines the number of CPUs in the largest - system that will be considered to be "small". - - The default value will be fine in most cases. Battery-powered - systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger - numbers of CPUs, and (3) are suffering from battery-lifetime - problems due to long sysidle latencies might wish to experiment - with larger values for this Kconfig parameter. On the other - hand, they might be even better served by disabling NO_HZ_FULL - entirely, given that NO_HZ_FULL is intended for HPC and - real-time workloads that at present do not tend to be run on - battery-powered systems. - - Take the default if you are unsure. - config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index b9ddd3beeb9a..0f4759f4232e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -8,7 +8,6 @@ CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ_FULL_ALL=n -CONFIG_NO_HZ_FULL_SYSIDLE=y CONFIG_RCU_FAST_NO_HZ=n CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt index 24396ae8355b..a75b16991a92 100644 --- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt +++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt @@ -18,7 +18,6 @@ CONFIG_PROVE_RCU In common code tested by TREE_RCU test cases. -CONFIG_NO_HZ_FULL_SYSIDLE CONFIG_RCU_NOCB_CPU Meaningless for TINY_RCU. diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index b5ea8489969a..519e06d34d0b 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -9,8 +9,7 @@ CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one. CONFIG_HOTPLUG_CPU -- Do half. (Every second.) CONFIG_HZ_PERIODIC -- Do one. CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.) -CONFIG_NO_HZ_FULL -- Do two, one with CONFIG_NO_HZ_FULL_SYSIDLE. -CONFIG_NO_HZ_FULL_SYSIDLE -- Do one. +CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement. CONFIG_PREEMPT -- Do half. (First three and #8.) CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. @@ -48,10 +47,6 @@ CONFIG_64BIT Used only to check CONFIG_RCU_FANOUT value, inspection suffices. -CONFIG_NO_HZ_FULL_SYSIDLE_SMALL - - Defer until Frederic uses this. - CONFIG_PREEMPT_COUNT CONFIG_PREEMPT_RCU -- cgit v1.3-14-g43fede From 4e4bea7427062ec15df7084f97728e2a44d912e3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 May 2017 15:33:23 -0700 Subject: rcu: Remove typecheck() from RCU locking wrapper functions Because raw_spin_lock_irqsave() and raw_spin_unlock_irqrestore() both do typecheck() on their flags argument, there is no point in duplicating this check in raw_spin_lock_irqsave_rcu_node() and raw_spin_unlock_irqrestore_rcu_node(). This commit therefore saves a few lines by removing this duplicated check. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 2a75beb883c8..bc55b5716c37 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -334,18 +334,14 @@ do { \ #define raw_spin_unlock_irq_rcu_node(p) \ raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +#define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ -} while (0) +#define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ #define raw_spin_trylock_rcu_node(p) \ ({ \ -- cgit v1.3-14-g43fede From c4a09ff752e164c020bced6513e2008f992a02e6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 May 2017 14:37:19 -0700 Subject: rcu: Remove the now-obsolete PROVE_RCU_REPEATEDLY Kconfig option The PROVE_RCU_REPEATEDLY Kconfig option was initially added due to the volume of messages from PROVE_RCU: Doing just one per boot would have required excessive numbers of boots to locate them all. However, PROVE_RCU messages are now relatively rare, so there is no longer any reason to need more than one such message per boot. This commit therefore removes the PROVE_RCU_REPEATEDLY Kconfig option. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar --- kernel/locking/lockdep.c | 4 ---- kernel/rcu/tree_plugin.h | 4 +--- lib/Kconfig.debug | 14 -------------- tools/testing/selftests/rcutorture/configs/rcu/TINY02 | 1 - .../testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 5 files changed, 1 insertion(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index cceb9534338a..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4490,10 +4490,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); pr_warn("=============================\n"); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f524d967f7b6..7f5919ab24c4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,9 +79,7 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); - if (IS_ENABLED(CONFIG_PROVE_RCU_REPEATEDLY)) - pr_info("\tRCU lockdep checking is permanently enabled.\n"); - else if (IS_ENABLED(CONFIG_PROVE_RCU)) + if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 960c5d2d3c03..762deab304fe 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1306,20 +1306,6 @@ menu "RCU Debugging" config PROVE_RCU def_bool PROVE_LOCKING -config PROVE_RCU_REPEATEDLY - bool "RCU debugging: don't disable PROVE_RCU on first splat" - depends on PROVE_RCU - default n - help - By itself, PROVE_RCU will disable checking upon issuing the - first warning (or "splat"). This feature prevents such - disabling, allowing multiple RCU-lockdep warnings to be printed - on a single reboot. - - Say Y to allow multiple RCU-lockdep warnings per boot. - - Say N if you are unsure. - config SPARSE_RCU_POINTER bool "RCU debugging: sparse-based checks for pointer usage" default n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 index 9007cd979df7..1f6bebbf5da8 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 @@ -8,7 +8,6 @@ CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=y CONFIG_PROVE_LOCKING=y -CONFIG_PROVE_RCU_REPEATEDLY=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_DEBUG_OBJECTS=y diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 519e06d34d0b..b778a28f1386 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -13,7 +13,6 @@ CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement. CONFIG_PREEMPT -- Do half. (First three and #8.) CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not. CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. -CONFIG_PROVE_RCU_REPEATEDLY -- Do one. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. -- cgit v1.3-14-g43fede From 7f0cd6333086ae09962791c31f0d4845a3329df9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 14 May 2017 17:06:30 -0700 Subject: srcu: Fix rcutorture-statistics typo The function srcutorture_get_gp_data() duplicated the check for sp->batch_check0.head instead of also checking sp->batch_check1.head. The only effect of this typo would be for rcutorture statistics to understate the fraction of time that an SRCU grace period was in flight, and only for Classic SRCU. This commit fixes this typo. Reported-by: David Binderman Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bc55b5716c37..27f871c88e0a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -469,7 +469,7 @@ static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, *flags = 0; *completed = sp->completed; *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check0.head) + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) (*gpnum)++; } -- cgit v1.3-14-g43fede From bd8cc5a062f41e334596edbe823e2fa0adddd1b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 14:57:01 -0700 Subject: srcu: Remove Classic SRCU Classic SRCU was only ever intended to be a fallback in case of issues with Tree/Tiny SRCU, and the latter two are doing quite well in testing. This commit therefore removes Classic SRCU. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 - include/linux/srcuclassic.h | 100 --- init/Kconfig | 21 +- kernel/rcu/Makefile | 1 - kernel/rcu/rcu.h | 16 - kernel/rcu/rcutorture.c | 17 - kernel/rcu/srcu.c | 668 --------------------- .../selftests/rcutorture/configs/rcu/CFLIST | 1 - .../selftests/rcutorture/configs/rcu/SRCU-C | 11 - .../rcutorture/configs/rcuperf/SRCUCLASSIC | 16 - 10 files changed, 2 insertions(+), 851 deletions(-) delete mode 100644 include/linux/srcuclassic.h delete mode 100644 kernel/rcu/srcu.c delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/SRCU-C delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC (limited to 'kernel') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 5f509018e6b5..39af9bc0f653 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,8 +60,6 @@ int init_srcu_struct(struct srcu_struct *sp); #include #elif defined(CONFIG_TREE_SRCU) #include -#elif defined(CONFIG_CLASSIC_SRCU) -#include #elif defined(CONFIG_SRCU) #error "Unknown SRCU implementation specified to kernel configuration" #else diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h deleted file mode 100644 index 67db4a36ef0d..000000000000 --- a/include/linux/srcuclassic.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion, - * classic v4.11 variant. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2017 - * - * Author: Paul McKenney - */ - -#ifndef _LINUX_SRCU_CLASSIC_H -#define _LINUX_SRCU_CLASSIC_H - -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - -struct srcu_struct { - unsigned long completed; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) - -void synchronize_srcu_expedited(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); - -#endif diff --git a/init/Kconfig b/init/Kconfig index 6f257d51f582..2aa14ff40e88 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,32 +526,15 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. -config CLASSIC_SRCU - bool "Use v4.11 classic SRCU implementation" - default n - depends on RCU_EXPERT && SRCU - help - This option selects the traditional well-tested classic SRCU - implementation from v4.11, as might be desired for enterprise - Linux distributions. Without this option, the shiny new - Tiny SRCU and Tree SRCU implementations are used instead. - At some point, it is hoped that Tiny SRCU and Tree SRCU - will accumulate enough test time and confidence to allow - Classic SRCU to be dropped entirely. - - Say Y if you need a rock-solid SRCU. - - Say N if you would like help test Tree SRCU. - config TINY_SRCU bool - default y if SRCU && TINY_RCU && !CLASSIC_SRCU + default y if SRCU && TINY_RCU help This option selects the single-CPU non-preemptible version of SRCU. config TREE_SRCU bool - default y if SRCU && !TINY_RCU && !CLASSIC_SRCU + default y if SRCU && !TINY_RCU help This option selects the full-fledged version of SRCU. diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 23803c7d5180..3945337c8ce4 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,6 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_CLASSIC_SRCU) += srcu.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 27f871c88e0a..d06c42deee0b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -457,22 +457,6 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, unsigned long *gpnum, unsigned long *completed); -#elif defined(CONFIG_CLASSIC_SRCU) - -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) -{ - if (test_type != SRCU_FLAVOR) - return; - *flags = 0; - *completed = sp->completed; - *gpnum = *completed; - if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) - (*gpnum)++; -} - #endif #ifdef CONFIG_TINY_RCU diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 03cdf79e73d4..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,31 +564,19 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) #ifdef CONFIG_TREE_SRCU idx = srcu_ctlp->srcu_idx & 0x1; -#else /* #ifdef CONFIG_TREE_SRCU */ - idx = srcu_ctlp->completed & 0x1; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; -#ifdef CONFIG_TREE_SRCU struct srcu_data *counts; counts = per_cpu_ptr(srcu_ctlp->sda, cpu); u0 = counts->srcu_unlock_count[!idx]; u1 = counts->srcu_unlock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - struct srcu_array *counts; - - counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - u0 = counts->unlock_count[!idx]; - u1 = counts->unlock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -596,13 +584,8 @@ static void srcu_torture_stats(void) */ smp_rmb(); -#ifdef CONFIG_TREE_SRCU l0 = counts->srcu_lock_count[!idx]; l1 = counts->srcu_lock_count[idx]; -#else /* #ifdef CONFIG_TREE_SRCU */ - l0 = counts->lock_count[!idx]; - l1 = counts->lock_count[idx]; -#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index 4e3f558409a0..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null @@ -1,668 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney - * Lai Jiangshan - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[idx]); - } - return sum; -} - -/* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->unlock_count[idx]); - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be zero. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long unlocks; - - unlocks = srcu_readers_unlock_idx(sp, idx); - - /* - * Make sure that a lock is always counted if the corresponding unlock - * is counted. Needs to be a smp_mb() as the read side may contain a - * read from a variable that is written to before the synchronize_srcu() - * in the write side. In this case smp_mb()s A and B act like the store - * buffering pattern. - * - * This smp_mb() also pairs with smp_mb() C to prevent accesses after the - * synchronize_srcu() from being executed before the grace period ends. - */ - smp_mb(); /* A */ - - /* - * If the locks are the same as the unlocks, then there must have - * been no readers on this index at some time in between. This does not - * mean that there are no more readers, as one could have read the - * current index but not have incremented the lock counter yet. - * - * Possible bug: There is no guarantee that there haven't been ULONG_MAX - * increments of ->lock_count[] since the unlocks were counted, meaning - * that this could return true even if there are still active readers. - * Since there are no memory barriers around srcu_flip(), the CPU is not - * required to increment ->completed before running - * srcu_readers_unlock_idx(), which means that there could be an - * arbitrarily large number of critical sections that execute after - * srcu_readers_unlock_idx() but use the old value of ->completed. - */ - return srcu_readers_lock_idx(sp, idx) == unlocks; -} - -/** - * srcu_readers_active - returns true if there are readers. and false - * otherwise - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. - */ -static bool srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); - - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this only after you are finished using a given srcu_struct - * that was initialized via init_srcu_struct(). This code does some - * probabalistic checking, spotting late uses of srcu_read_lock(), - * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). - * If any such late uses are detected, the per-CPU memory associated with - * the srcu_struct is simply leaked and WARN_ON() is invoked. If the - * caller frees the srcu_struct itself, a use-after-free crash will likely - * ensue, but at least there will be a warning printed. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = READ_ONCE(sp->completed) & 0x1; - this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. - */ -static void srcu_flip(struct srcu_struct *sp) -{ - WRITE_ONCE(sp->completed, sp->completed + 1); - - /* - * Ensure that if the updater misses an __srcu_read_unlock() - * increment, that task's next __srcu_read_lock() will see the - * above counter update. Note that both this memory barrier - * and the one in srcu_readers_active_idx_check() provide the - * guarantee for __srcu_read_lock(). - */ - smp_mb(); /* D */ /* Pairs with C. */ -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - * - * Note that all CPUs must agree that the grace period extended beyond - * all pre-existing SRCU read-side critical section. On systems with - * more than one CPU, this means that when "func()" is invoked, each CPU - * is guaranteed to have executed a full memory barrier since the end of - * its last corresponding SRCU read-side critical section whose beginning - * preceded the call to call_rcu(). It also means that each CPU executing - * an SRCU read-side critical section that continues beyond the start of - * "func()" must have executed a memory barrier after the call_rcu() - * but before the beginning of that SRCU read-side critical section. - * Note that these guarantees include CPUs that are offline, idle, or - * executing in user mode, as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the - * resulting SRCU callback function "func()", then both CPU A and CPU - * B are guaranteed to execute a full memory barrier during the time - * interval between the call to call_rcu() and the invocation of "func()". - * This guarantee applies even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * Of course, these guarantees apply only for invocations of call_srcu(), - * srcu_read_lock(), and srcu_read_unlock() that are all passed the same - * srcu_struct structure. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - rcu_callback_t func) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || - lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) { - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ - } - -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section, - * as long as the resulting graph of srcu_structs is acyclic. - * - * There are memory-ordering constraints implied by synchronize_srcu(). - * On systems with more than one CPU, when synchronize_srcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last corresponding SRCU-sched read-side critical section - * whose beginning preceded the call to synchronize_srcu(). In addition, - * each CPU having an SRCU read-side critical section that extends beyond - * the return from synchronize_srcu() is guaranteed to have executed a - * full memory barrier after the beginning of synchronize_srcu() and before - * the beginning of that SRCU read-side critical section. Note that these - * guarantees include CPUs that are offline, idle, or executing in user mode, - * as well as CPUs that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_srcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_srcu(). This guarantee applies even if CPU A and CPU B - * are the same CPU, but again only if the system has more than one CPU. - * - * Of course, these memory-ordering guarantees apply only when - * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are - * passed the same srcu_struct structure. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - * @sp: srcu_struct on which to wait for in-flight callbacks. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ -unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. Note that needed memory barriers have been executed - * in this task's context by srcu_readers_active_idx_check(). - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. - */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - queue_delayed_work(system_power_efficient_wq, - &sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); - -static int __init srcu_bootup_announce(void) -{ - pr_info("Classic SRCU implementation.\n"); - return 0; -} -early_initcall(srcu_bootup_announce); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index 0c1da784b8cb..6a0b9f69faad 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -7,7 +7,6 @@ TREE06 TREE07 TREE08 TREE09 -SRCU-C SRCU-N SRCU-P SRCU-t diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C deleted file mode 100644 index d4e19c087c21..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C +++ /dev/null @@ -1,11 +0,0 @@ -CONFIG_RCU_TRACE=n -CONFIG_SMP=y -CONFIG_NR_CPUS=3 -CONFIG_HOTPLUG_CPU=y -CONFIG_RCU_EXPERT=y -CONFIG_CLASSIC_SRCU=y -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC b/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC deleted file mode 100644 index a1395af60ef4..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/SRCUCLASSIC +++ /dev/null @@ -1,16 +0,0 @@ -CONFIG_SMP=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_RCU_EXPERT=y -CONFIG_RCU_TRACE=y -CONFIG_CLASSIC_SRCU=y -- cgit v1.3-14-g43fede From ae91aa0adb14dc33114d566feca2f7cb7a96b8b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 15:30:32 -0700 Subject: rcu: Remove debugfs tracing RCU's debugfs tracing used to be the only reasonable low-level debug information available, but ftrace and event tracing has since surpassed the RCU debugfs level of usefulness. This commit therefore removes RCU's debugfs tracing. Signed-off-by: Paul E. McKenney --- Documentation/RCU/00-INDEX | 2 - .../RCU/Design/Requirements/Requirements.html | 2 +- Documentation/RCU/trace.txt | 535 --------------------- init/Kconfig | 8 - kernel/rcu/Makefile | 1 - kernel/rcu/tiny_plugin.h | 45 -- kernel/rcu/tree.h | 27 -- kernel/rcu/tree_plugin.h | 31 +- kernel/rcu/tree_trace.c | 494 ------------------- lib/Kconfig.debug | 5 +- .../selftests/rcutorture/configs/rcu/TREE02-T | 21 - .../selftests/rcutorture/configs/rcu/TREE08-T | 21 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 4 - 13 files changed, 4 insertions(+), 1192 deletions(-) delete mode 100644 Documentation/RCU/trace.txt delete mode 100644 kernel/rcu/tree_trace.c delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE02-T delete mode 100644 tools/testing/selftests/rcutorture/configs/rcu/TREE08-T (limited to 'kernel') diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index 1672573b037a..f46980c060aa 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -28,8 +28,6 @@ stallwarn.txt - RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress) torture.txt - RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST) -trace.txt - - CONFIG_RCU_TRACE debugfs files and formats UP.txt - RCU on Uniprocessor Systems whatisRCU.txt diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 0e6550a8c926..95b30fa25d56 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -2034,7 +2034,7 @@ guard against mishaps and misuse: some other synchronization mechanism, for example, reference counting.
  • In kernels built with CONFIG_RCU_TRACE=y, RCU-related - information is provided via both debugfs and event tracing. + information is provided via event tracing.
  • Open-coded use of rcu_assign_pointer() and rcu_dereference() to create typical linked data structures can be surprisingly error-prone. diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt deleted file mode 100644 index 6549012033f9..000000000000 --- a/Documentation/RCU/trace.txt +++ /dev/null @@ -1,535 +0,0 @@ -CONFIG_RCU_TRACE debugfs Files and Formats - - -The rcutree and rcutiny implementations of RCU provide debugfs trace -output that summarizes counters and state. This information is useful for -debugging RCU itself, and can sometimes also help to debug abuses of RCU. -The following sections describe the debugfs files and formats, first -for rcutree and next for rcutiny. - - -CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats - -These implementations of RCU provide several debugfs directories under the -top-level directory "rcu": - -rcu/rcu_bh -rcu/rcu_preempt -rcu/rcu_sched - -Each directory contains files for the corresponding flavor of RCU. -Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU. -For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor, -so that activity for both appears in rcu/rcu_sched. - -In addition, the following file appears in the top-level directory: -rcu/rcutorture. This file displays rcutorture test progress. The output -of "cat rcu/rcutorture" looks as follows: - -rcutorture test sequence: 0 (test in progress) -rcutorture update version number: 615 - -The first line shows the number of rcutorture tests that have completed -since boot. If a test is currently running, the "(test in progress)" -string will appear as shown above. The second line shows the number of -update cycles that the current test has started, or zero if there is -no test in progress. - - -Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly -also rcu/rcu_preempt) the following files will be present: - -rcudata: - Displays fields in struct rcu_data. -rcuexp: - Displays statistics for expedited grace periods. -rcugp: - Displays grace-period counters. -rcuhier: - Displays the struct rcu_node hierarchy. -rcu_pending: - Displays counts of the reasons rcu_pending() decided that RCU had - work to do. -rcuboost: - Displays RCU boosting statistics. Only present if - CONFIG_RCU_BOOST=y. - -The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - - 0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 - -This file has one line per CPU, or eight for this 8-CPU system. -The fields are as follows: - -o The number at the beginning of each line is the CPU number. - CPUs numbers followed by an exclamation mark are offline, - but have been online at least once since boot. There will be - no output for CPUs that have never been online, which can be - a good thing in the surprisingly common case where NR_CPUS is - substantially larger than the number of actual CPUs. - -o "c" is the count of grace periods that this CPU believes have - completed. Offlined CPUs and CPUs in dynticks idle mode may lag - quite a ways behind, for example, CPU 4 under "rcu_sched" above, - which has been offline through 16 RCU grace periods. It is not - unusual to see offline CPUs lagging by thousands of grace periods. - Note that although the grace-period number is an unsigned long, - it is printed out as a signed long to allow more human-friendly - representation near boot time. - -o "g" is the count of grace periods that this CPU believes have - started. Again, offlined CPUs and CPUs in dynticks idle mode - may lag behind. If the "c" and "g" values are equal, this CPU - has already reported a quiescent state for the last RCU grace - period that it is aware of, otherwise, the CPU believes that it - owes RCU a quiescent state. - -o "pq" indicates that this CPU has passed through a quiescent state - for the current grace period. It is possible for "pq" to be - "1" and "c" different than "g", which indicates that although - the CPU has passed through a quiescent state, either (1) this - CPU has not yet reported that fact, (2) some other CPU has not - yet reported for this grace period, or (3) both. - -o "qp" indicates that RCU still expects a quiescent state from - this CPU. Offlined CPUs and CPUs in dyntick idle mode might - well have qp=1, which is OK: RCU is still ignoring them. - -o "dt" is the current value of the dyntick counter that is incremented - when entering or leaving idle, either due to a context switch or - due to an interrupt. This number is even if the CPU is in idle - from RCU's viewpoint and odd otherwise. The number after the - first "/" is the interrupt nesting depth when in idle state, - or a large number added to the interrupt-nesting depth when - running a non-idle task. Some architectures do not accurately - count interrupt nesting when running in non-idle kernel context, - which can result in interesting anomalies such as negative - interrupt-nesting levels. The number after the second "/" - is the NMI nesting depth. - -o "df" is the number of times that some other CPU has forced a - quiescent state on behalf of this CPU due to this CPU being in - idle state. - -o "of" is the number of times that some other CPU has forced a - quiescent state on behalf of this CPU due to this CPU being - offline. In a perfect world, this might never happen, but it - turns out that offlining and onlining a CPU can take several grace - periods, and so there is likely to be an extended period of time - when RCU believes that the CPU is online when it really is not. - Please note that erring in the other direction (RCU believing a - CPU is offline when it is really alive and kicking) is a fatal - error, so it makes sense to err conservatively. - -o "ql" is the number of RCU callbacks currently residing on - this CPU. The first number is the number of "lazy" callbacks - that are known to RCU to only be freeing memory, and the number - after the "/" is the total number of callbacks, lazy or not. - These counters count callbacks regardless of what phase of - grace-period processing that they are in (new, waiting for - grace period to start, waiting for grace period to end, ready - to invoke). - -o "qs" gives an indication of the state of the callback queue - with four characters: - - "N" Indicates that there are callbacks queued that are not - ready to be handled by the next grace period, and thus - will be handled by the grace period following the next - one. - - "R" Indicates that there are callbacks queued that are - ready to be handled by the next grace period. - - "W" Indicates that there are callbacks queued that are - waiting on the current grace period. - - "D" Indicates that there are callbacks queued that have - already been handled by a prior grace period, and are - thus waiting to be invoked. Note that callbacks in - the process of being invoked are not counted here. - Callbacks in the process of being invoked are those - that have been removed from the rcu_data structures - queues by rcu_do_batch(), but which have not yet been - invoked. - - If there are no callbacks in a given one of the above states, - the corresponding character is replaced by ".". - -o "b" is the batch limit for this CPU. If more than this number - of RCU callbacks is ready to invoke, then the remainder will - be deferred. - -o "ci" is the number of RCU callbacks that have been invoked for - this CPU. Note that ci+nci+ql is the number of callbacks that have - been registered in absence of CPU-hotplug activity. - -o "nci" is the number of RCU callbacks that have been offloaded from - this CPU. This will always be zero unless the kernel was built - with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot - parameter was specified. - -o "co" is the number of RCU callbacks that have been orphaned due to - this CPU going offline. These orphaned callbacks have been moved - to an arbitrarily chosen online CPU. - -o "ca" is the number of RCU callbacks that have been adopted by this - CPU due to other CPUs going offline. Note that ci+co-ca+ql is - the number of RCU callbacks registered on this CPU. - - -Kernels compiled with CONFIG_RCU_BOOST=y display the following from -/debug/rcu/rcu_preempt/rcudata: - - 0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 - -This is similar to the output discussed above, but contains the following -additional fields: - -o "kt" is the per-CPU kernel-thread state. The digit preceding - the first slash is zero if there is no work pending and 1 - otherwise. The character between the first pair of slashes is - as follows: - - "S" The kernel thread is stopped, in other words, all - CPUs corresponding to this rcu_node structure are - offline. - - "R" The kernel thread is running. - - "W" The kernel thread is waiting because there is no work - for it to do. - - "O" The kernel thread is waiting because it has been - forced off of its designated CPU or because its - ->cpus_allowed mask permits it to run on other than - its designated CPU. - - "Y" The kernel thread is yielding to avoid hogging CPU. - - "?" Unknown value, indicates a bug. - - The number after the final slash is the CPU that the kthread - is actually running on. - - This field is displayed only for CONFIG_RCU_BOOST kernels. - -o "ktl" is the low-order 16 bits (in hexadecimal) of the count of - the number of times that this CPU's per-CPU kthread has gone - through its loop servicing invoke_rcu_cpu_kthread() requests. - - This field is displayed only for CONFIG_RCU_BOOST kernels. - - -The output of "cat rcu/rcu_preempt/rcuexp" looks as follows: - -s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872 - -These fields are as follows: - -o "s" is the sequence number, with an odd number indicating that - an expedited grace period is in progress. - -o "wd1", "wd2", and "wd3" are the number of times that an attempt - to start an expedited grace period found that someone else had - completed an expedited grace period that satisfies the attempted - request. "Our work is done." - -o "enq" is the number of quiescent states still outstanding. - -o "sc" is the number of times that the attempt to start a - new expedited grace period succeeded. - - -The output of "cat rcu/rcu_preempt/rcugp" looks as follows: - -completed=31249 gpnum=31250 age=1 max=18 - -These fields are taken from the rcu_state structure, and are as follows: - -o "completed" is the number of grace periods that have completed. - It is comparable to the "c" field from rcu/rcudata in that a - CPU whose "c" field matches the value of "completed" is aware - that the corresponding RCU grace period has completed. - -o "gpnum" is the number of grace periods that have started. It is - similarly comparable to the "g" field from rcu/rcudata in that - a CPU whose "g" field matches the value of "gpnum" is aware that - the corresponding RCU grace period has started. - - If these two fields are equal, then there is no grace period - in progress, in other words, RCU is idle. On the other hand, - if the two fields differ (as they are above), then an RCU grace - period is in progress. - -o "age" is the number of jiffies that the current grace period - has extended for, or zero if there is no grace period currently - in effect. - -o "max" is the age in jiffies of the longest-duration grace period - thus far. - -The output of "cat rcu/rcu_preempt/rcuhier" looks as follows: - -c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0 -3/3 ..>. 0:7 ^0 -e/e ..>. 0:3 ^0 d/d ..>. 4:7 ^1 - -The fields are as follows: - -o "c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp. - -o "g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp. - -o "s" is the current state of the force_quiescent_state() - state machine. - -o "jfq" is the number of jiffies remaining for this grace period - before force_quiescent_state() is invoked to help push things - along. Note that CPUs in idle mode throughout the grace period - will not report on their own, but rather must be check by some - other CPU via force_quiescent_state(). - -o "j" is the low-order four hex digits of the jiffies counter. - Yes, Paul did run into a number of problems that turned out to - be due to the jiffies counter no longer counting. Why do you ask? - -o "nfqs" is the number of calls to force_quiescent_state() since - boot. - -o "nfqsng" is the number of useless calls to force_quiescent_state(), - where there wasn't actually a grace period active. This can - no longer happen due to grace-period processing being pushed - into a kthread. The number in parentheses is the difference - between "nfqs" and "nfqsng", or the number of times that - force_quiescent_state() actually did some real work. - -o "fqlh" is the number of calls to force_quiescent_state() that - exited immediately (without even being counted in nfqs above) - due to contention on ->fqslock. - -o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node - structure. Each line represents one level of the hierarchy, - from root to leaves. It is best to think of the rcu_data - structures as forming yet another level after the leaves. - Note that there might be either one, two, three, or even four - levels of rcu_node structures, depending on the relationship - between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly - adjusted using the rcu_fanout_leaf kernel boot parameter), and - CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of - possible CPUs for the booting hardware). - - o The numbers separated by the "/" are the qsmask followed - by the qsmaskinit. The qsmask will have one bit - set for each entity in the next lower level that has - not yet checked in for the current grace period ("e" - indicating CPUs 5, 6, and 7 in the example above). - The qsmaskinit will have one bit for each entity that is - currently expected to check in during each grace period. - The value of qsmaskinit is assigned to that of qsmask - at the beginning of each grace period. - - o The characters separated by the ">" indicate the state - of the blocked-tasks lists. A "G" preceding the ">" - indicates that at least one task blocked in an RCU - read-side critical section blocks the current grace - period, while a "E" preceding the ">" indicates that - at least one task blocked in an RCU read-side critical - section blocks the current expedited grace period. - A "T" character following the ">" indicates that at - least one task is blocked within an RCU read-side - critical section, regardless of whether any current - grace period (expedited or normal) is inconvenienced. - A "." character appears if the corresponding condition - does not hold, so that "..>." indicates that no tasks - are blocked. In contrast, "GE>T" indicates maximal - inconvenience from blocked tasks. CONFIG_TREE_RCU - builds of the kernel will always show "..>.". - - o The numbers separated by the ":" are the range of CPUs - served by this struct rcu_node. This can be helpful - in working out how the hierarchy is wired together. - - For example, the example rcu_node structure shown above - has "0:7", indicating that it covers CPUs 0 through 7. - - o The number after the "^" indicates the bit in the - next higher level rcu_node structure that this rcu_node - structure corresponds to. For example, the "d/d ..>. 4:7 - ^1" has a "1" in this position, indicating that it - corresponds to the "1" bit in the "3" shown in the - "3/3 ..>. 0:7 ^0" entry on the next level up. - - -The output of "cat rcu/rcu_sched/rcu_pending" looks as follows: - - 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0 - 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0 - 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0 - 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0 - 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0 - 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0 - 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0 - 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0 - -The fields are as follows: - -o The leading number is the CPU number, with "!" indicating - an offline CPU. - -o "np" is the number of times that __rcu_pending() has been invoked - for the corresponding flavor of RCU. - -o "qsp" is the number of times that the RCU was waiting for a - quiescent state from this CPU. - -o "rpq" is the number of times that the CPU had passed through - a quiescent state, but not yet reported it to RCU. - -o "cbr" is the number of times that this CPU had RCU callbacks - that had passed through a grace period, and were thus ready - to be invoked. - -o "cng" is the number of times that this CPU needed another - grace period while RCU was idle. - -o "gpc" is the number of times that an old grace period had - completed, but this CPU was not yet aware of it. - -o "gps" is the number of times that a new grace period had started, - but this CPU was not yet aware of it. - -o "ndw" is the number of times that a wakeup of an rcuo - callback-offload kthread had to be deferred in order to avoid - deadlock. - -o "nn" is the number of times that this CPU needed nothing. - - -The output of "cat rcu/rcuboost" looks as follows: - -0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 - balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0 -4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894 - balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0 - -This information is output only for rcu_preempt. Each two-line entry -corresponds to a leaf rcu_node structure. The fields are as follows: - -o "n:m" is the CPU-number range for the corresponding two-line - entry. In the sample output above, the first entry covers - CPUs zero through three and the second entry covers CPUs four - through seven. - -o "tasks=TNEB" gives the state of the various segments of the - rnp->blocked_tasks list: - - "T" This indicates that there are some tasks that blocked - while running on one of the corresponding CPUs while - in an RCU read-side critical section. - - "N" This indicates that some of the blocked tasks are preventing - the current normal (non-expedited) grace period from - completing. - - "E" This indicates that some of the blocked tasks are preventing - the current expedited grace period from completing. - - "B" This indicates that some of the blocked tasks are in - need of RCU priority boosting. - - Each character is replaced with "." if the corresponding - condition does not hold. - -o "kt" is the state of the RCU priority-boosting kernel - thread associated with the corresponding rcu_node structure. - The state can be one of the following: - - "S" The kernel thread is stopped, in other words, all - CPUs corresponding to this rcu_node structure are - offline. - - "R" The kernel thread is running. - - "W" The kernel thread is waiting because there is no work - for it to do. - - "Y" The kernel thread is yielding to avoid hogging CPU. - - "?" Unknown value, indicates a bug. - -o "ntb" is the number of tasks boosted. - -o "neb" is the number of tasks boosted in order to complete an - expedited grace period. - -o "nnb" is the number of tasks boosted in order to complete a - normal (non-expedited) grace period. When boosting a task - that was blocking both an expedited and a normal grace period, - it is counted against the expedited total above. - -o "j" is the low-order 16 bits of the jiffies counter in - hexadecimal. - -o "bt" is the low-order 16 bits of the value that the jiffies - counter will have when we next start boosting, assuming that - the current grace period does not end beforehand. This is - also in hexadecimal. - -o "balk: nt" counts the number of times we didn't boost (in - other words, we balked) even though it was time to boost because - there were no blocked tasks to boost. This situation occurs - when there is one blocked task on one rcu_node structure and - none on some other rcu_node structure. - -o "egt" counts the number of times we balked because although - there were blocked tasks, none of them were blocking the - current grace period, whether expedited or otherwise. - -o "bt" counts the number of times we balked because boosting - had already been initiated for the current grace period. - -o "nb" counts the number of times we balked because there - was at least one task blocking the current non-expedited grace - period that never had blocked. If it is already running, it - just won't help to boost its priority! - -o "ny" counts the number of times we balked because it was - not yet time to start boosting. - -o "nos" counts the number of times we balked for other - reasons, e.g., the grace period ended first. - - -CONFIG_TINY_RCU debugfs Files and Formats - -These implementations of RCU provides a single debugfs file under the -top-level directory RCU, namely rcu/rcudata, which displays fields in -rcu_bh_ctrlblk and rcu_sched_ctrlblk. - -The output of "cat rcu/rcudata" is as follows: - -rcu_sched: qlen: 0 -rcu_bh: qlen: 0 - -This is split into rcu_sched and rcu_bh sections. The field is as -follows: - -o "qlen" is the number of RCU callbacks currently waiting either - for an RCU grace period or waiting to be invoked. This is the - only field present for rcu_sched and rcu_bh, due to the - short-circuiting of grace period in those two cases. diff --git a/init/Kconfig b/init/Kconfig index 2aa14ff40e88..3025383ab443 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -659,14 +659,6 @@ config RCU_FAST_NO_HZ Say N if you are unsure. -config TREE_RCU_TRACE - def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) - select DEBUG_FS - help - This option provides tracing for the TREE_RCU and - PREEMPT_RCU implementations, permitting Makefile to - trivially select kernel/rcutree_trace.c. - config RCU_BOOST bool "Enable RCU priority boosting" depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 3945337c8ce4..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -9,6 +9,5 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o -obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 371034e77f87..c642f23f1582 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -24,8 +24,6 @@ #include #include -#include -#include /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { @@ -87,49 +85,6 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) local_irq_restore(flags); } -/* - * Dump statistics for TINY_RCU, such as they are. - */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutiny_trace_init); - static void check_cpu_stall(struct rcu_ctrlblk *rcp) { unsigned long j; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2c112bb11aa8..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -152,19 +152,6 @@ struct rcu_node { /* Number of tasks boosted for expedited GP. */ unsigned long n_normal_boosts; /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ @@ -535,17 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { } #endif /* #else #ifdef CONFIG_SRCU */ #endif /* #ifndef RCU_TREE_NONCORE */ - -#ifdef CONFIG_RCU_TRACE -/* Read out queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ -#ifdef CONFIG_RCU_NOCB_CPU - *ql = atomic_long_read(&rdp->nocb_q_count); - *qll = atomic_long_read(&rdp->nocb_q_count_lazy); -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - *ql = 0; - *qll = 0; -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -} -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7f5919ab24c4..43f2f8026b4a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ static void __init rcu_bootup_announce_oddness(void) { if (IS_ENABLED(CONFIG_RCU_TRACE)) - pr_info("\tRCU debugfs-based tracing is enabled.\n"); + pr_info("\tRCU event tracing is enabled.\n"); if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", @@ -899,33 +899,6 @@ void exit_rcu(void) #include "../locking/rtmutex_common.h" -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (!rcu_preempt_has_tasks(rnp)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - static void rcu_wake_cond(struct task_struct *t, int status) { /* @@ -1058,7 +1031,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) lockdep_assert_held(&rnp->lock); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -1074,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { - rcu_initiate_boost_trace(rnp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 6cea17a1ea30..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Read-Copy Update tracing for hierarchical implementation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2008 - * Author: Paul E. McKenney - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RCU_TREE_NONCORE -#include "tree.h" -#include "rcu.h" - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d bseq: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->barrier_sequence); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), - rdp->core_needs_qs); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - rcu_dynticks_snap(rdp->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); - ql += rcu_segcblist_n_cbs(&rdp->cblist); - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], - ".R"[!rcu_segcblist_segempty(&rdp->cblist, - RCU_NEXT_READY_TAIL)], - ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], - ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - int cpu; - struct rcu_state *rsp = (struct rcu_state *)m->private; - struct rcu_data *rdp; - unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; - - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - s0 += atomic_long_read(&rdp->exp_workdone0); - s1 += atomic_long_read(&rdp->exp_workdone1); - s2 += atomic_long_read(&rdp->exp_workdone2); - s3 += atomic_long_read(&rdp->exp_workdone3); - } - seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, s0, s1, s2, s3, - atomic_read(&rsp->expedited_need_qs), - rsp->expedited_sequence / 2); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->gp_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), - rsp->orphan_done.len_lazy, - rsp->orphan_done.len); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - completed = READ_ONCE(rsp->completed); - gpnum = READ_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_core_needs_qs, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_nocb_defer_wakeup, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} -device_initcall(rcutree_trace_init); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 498d5dd63bf4..8c10b5a97b9e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1361,9 +1361,8 @@ config RCU_TRACE default y if TREE_RCU select TRACE_CLOCK help - This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. It also enables - additional tracepoints for ftrace-style event tracing. + This option enables additional tracepoints for ftrace-style + event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T deleted file mode 100644 index 917d2517b5b5..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE02-T +++ /dev/null @@ -1,21 +0,0 @@ -CONFIG_SMP=y -CONFIG_NR_CPUS=8 -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=y -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_FANOUT=3 -CONFIG_RCU_FANOUT_LEAF=3 -CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T b/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T deleted file mode 100644 index 2ad13f0d29cc..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08-T +++ /dev/null @@ -1,21 +0,0 @@ -CONFIG_SMP=y -CONFIG_NR_CPUS=16 -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_TRACE=y -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_FANOUT=3 -CONFIG_RCU_FANOUT_LEAF=2 -CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ALL=y -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index c5c29fb7438c..928fadaecc25 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -79,9 +79,5 @@ CONFIG_TASKS_RCU Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable. -CONFIG_RCU_TRACE - - Implied by CONFIG_RCU_TRACE for Tree RCU. - boot parameters ignored: TBD -- cgit v1.3-14-g43fede From 44c65ff2e3b0b48250a970183ab53b0602c25764 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 May 2017 16:26:34 -0700 Subject: rcu: Eliminate NOCBs CPU-state Kconfig options The CONFIG_RCU_NOCB_CPU_ALL, CONFIG_RCU_NOCB_CPU_NONE, and CONFIG_RCU_NOCB_CPU_ZERO Kconfig options are used only in testing and are redundant with the rcu_nocbs= boot parameter. This commit therefore removes these three Kconfig options and adjusts the rcutorture scripts to use the boot parameter instead. Signed-off-by: Paul E. McKenney --- Documentation/kernel-per-CPU-kthreads.txt | 31 ++++++------- Documentation/timers/NO_HZ.txt | 29 ++---------- init/Kconfig | 53 ---------------------- kernel/rcu/rcu.h | 4 +- kernel/rcu/tree_plugin.h | 27 ++--------- .../selftests/rcutorture/configs/rcu/TREE01 | 1 - .../selftests/rcutorture/configs/rcu/TREE01.boot | 1 + .../selftests/rcutorture/configs/rcu/TREE05 | 1 - .../selftests/rcutorture/configs/rcu/TREE08 | 1 - .../selftests/rcutorture/configs/rcu/TREE08.boot | 1 + .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 8 ++-- 11 files changed, 26 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt index df31e30b6a02..2cb7dc5c0e0d 100644 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ b/Documentation/kernel-per-CPU-kthreads.txt @@ -109,13 +109,12 @@ SCHED_SOFTIRQ: Do all of the following: on that CPU. If a thread that expects to run on the de-jittered CPU awakens, the scheduler will send an IPI that can result in a subsequent SCHED_SOFTIRQ. -2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, - CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU - to be de-jittered is marked as an adaptive-ticks CPU using the - "nohz_full=" boot parameter. This reduces the number of - scheduler-clock interrupts that the de-jittered CPU receives, - minimizing its chances of being selected to do the load balancing - work that runs in SCHED_SOFTIRQ context. +2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered + is marked as an adaptive-ticks CPU using the "nohz_full=" + boot parameter. This reduces the number of scheduler-clock + interrupts that the de-jittered CPU receives, minimizing its + chances of being selected to do the load balancing work that + runs in SCHED_SOFTIRQ context. 3. To the extent possible, keep the CPU out of the kernel when it is non-idle, for example, by avoiding system calls and by forcing both kernel threads and interrupts to execute elsewhere. @@ -135,11 +134,10 @@ HRTIMER_SOFTIRQ: Do all of the following: RCU_SOFTIRQ: Do at least one of the following: 1. Offload callbacks and keep the CPU in either dyntick-idle or adaptive-ticks state by doing all of the following: - a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, - CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU - to be de-jittered is marked as an adaptive-ticks CPU using - the "nohz_full=" boot parameter. Bind the rcuo kthreads - to housekeeping CPUs, which can tolerate OS jitter. + a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be + de-jittered is marked as an adaptive-ticks CPU using the + "nohz_full=" boot parameter. Bind the rcuo kthreads to + housekeeping CPUs, which can tolerate OS jitter. b. To the extent possible, keep the CPU out of the kernel when it is non-idle, for example, by avoiding system calls and by forcing both kernel threads and interrupts @@ -236,11 +234,10 @@ To reduce its OS jitter, do at least one of the following: is feasible only if your workload never requires RCU priority boosting, for example, if you ensure frequent idle time on all CPUs that might execute within the kernel. -3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y, - which offloads all RCU callbacks to kthreads that can be moved - off of CPUs susceptible to OS jitter. This approach prevents the - rcuc/%u kthreads from having any work to do, so that they are - never awakened. +3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= + boot parameter offloading RCU callbacks from all CPUs susceptible + to OS jitter. This approach prevents the rcuc/%u kthreads from + having any work to do, so that they are never awakened. 4. Ensure that the CPU never enters the kernel, and, in particular, avoid initiating any CPU hotplug operations on this CPU. This is another way of preventing any callbacks from being queued on the diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt index 6eaf576294f3..2dcaf9adb7a7 100644 --- a/Documentation/timers/NO_HZ.txt +++ b/Documentation/timers/NO_HZ.txt @@ -194,32 +194,9 @@ that the RCU callbacks are processed in a timely fashion. Another approach is to offload RCU callback processing to "rcuo" kthreads using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to -offload may be selected via several methods: - -1. One of three mutually exclusive Kconfig options specify a - build-time default for the CPUs to offload: - - a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in - no CPUs being offloaded. - - b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes - CPU 0 to be offloaded. - - c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all - CPUs to be offloaded. Note that the callbacks will be - offloaded to "rcuo" kthreads, and that those kthreads - will in fact run on some CPU. However, this approach - gives fine-grained control on exactly which CPUs the - callbacks run on, along with their scheduling priority - (including the default of SCHED_OTHER), and it further - allows this control to be varied dynamically at runtime. - -2. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated - list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1, - 3, 4, and 5. The specified CPUs will be offloaded in addition to - any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or - CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot - parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y. +offload may be selected using The "rcu_nocbs=" kernel boot parameter, +which takes a comma-separated list of CPUs and CPU ranges, for example, +"1,3-5" selects CPUs 1, 3, 4, and 5. The offloaded CPUs will never queue RCU callbacks, and therefore RCU never prevents offloaded CPUs from entering either dyntick-idle mode diff --git a/init/Kconfig b/init/Kconfig index 3025383ab443..dc431c6109f2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -709,59 +709,6 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. -choice - prompt "Build-forced no-CBs CPUs" - default RCU_NOCB_CPU_NONE - depends on RCU_NOCB_CPU - help - This option allows no-CBs CPUs (whose RCU callbacks are invoked - from kthreads rather than from softirq context) to be specified - at build time. Additional no-CBs CPUs may be specified by - the rcu_nocbs= boot parameter. - -config RCU_NOCB_CPU_NONE - bool "No build_forced no-CBs CPUs" - help - This option does not force any of the CPUs to be no-CBs CPUs. - Only CPUs designated by the rcu_nocbs= boot parameter will be - no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU - kthreads whose names begin with "rcuo". All other CPUs will - invoke their own RCU callbacks in softirq context. - - Select this option if you want to choose no-CBs CPUs at - boot time, for example, to allow testing of different no-CBs - configurations without having to rebuild the kernel each time. - -config RCU_NOCB_CPU_ZERO - bool "CPU 0 is a build_forced no-CBs CPU" - help - This option forces CPU 0 to be a no-CBs CPU, so that its RCU - callbacks are invoked by a per-CPU kthread whose name begins - with "rcuo". Additional CPUs may be designated as no-CBs - CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs. - All other CPUs will invoke their own RCU callbacks in softirq - context. - - Select this if CPU 0 needs to be a no-CBs CPU for real-time - or energy-efficiency reasons, but the real reason it exists - is to ensure that randconfig testing covers mixed systems. - -config RCU_NOCB_CPU_ALL - bool "All CPUs are build_forced no-CBs CPUs" - help - This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= - boot parameter will be ignored. All CPUs' RCU callbacks will - be executed in the context of per-CPU rcuo kthreads created for - this purpose. Assuming that the kthreads whose names start with - "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter - on the remaining CPUs, but might decrease memory locality during - RCU-callback invocation, thus potentially degrading throughput. - - Select this if all CPUs need to be no-CBs CPUs for real-time - or energy-efficiency reasons. - -endchoice - endmenu # "RCU Subsystem" config BUILD_BIN2C diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index d06c42deee0b..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -564,9 +564,7 @@ void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); #endif /* #else #ifdef CONFIG_TINY_RCU */ -#if defined(CONFIG_RCU_NOCB_CPU_ALL) -static inline bool rcu_is_nocb_cpu(int cpu) { return true; } -#elif defined(CONFIG_RCU_NOCB_CPU) +#ifdef CONFIG_RCU_NOCB_CPU bool rcu_is_nocb_cpu(int cpu); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 43f2f8026b4a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1296,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) - ? 0 : rcu_cpu_has_callbacks(NULL); + return rcu_cpu_has_callbacks(NULL); } /* @@ -1409,10 +1408,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) unsigned long dj; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *nextevt = KTIME_MAX; - return 0; - } /* Snapshot to detect later posting of non-lazy callback. */ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; @@ -1462,8 +1457,7 @@ static void rcu_prepare_for_idle(void) int tne; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1518,8 +1512,7 @@ static void rcu_prepare_for_idle(void) static void rcu_cleanup_after_idle(void) { RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - rcu_is_nocb_cpu(smp_processor_id())) + if (rcu_is_nocb_cpu(smp_processor_id())) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1786,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_swait_queue_head(&rnp->nocb_gp_wq[1]); } -#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -1794,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Kick the leader kthread for this NOCB group. @@ -2253,10 +2244,6 @@ void __init rcu_init_nohz(void) bool need_rcu_nocb_mask = true; struct rcu_state *rsp; -#ifdef CONFIG_RCU_NOCB_CPU_NONE - need_rcu_nocb_mask = false; -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) need_rcu_nocb_mask = true; @@ -2272,14 +2259,6 @@ void __init rcu_init_nohz(void) if (!have_rcu_nocb_mask) return; -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index 92ca49f90ef9..b5b53973c01e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -12,7 +12,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_MAXSMP=y CONFIG_CPUMASK_OFFSTACK=y CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ZERO=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_RCU_BOOST=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot index 89705ed79596..1d14e1383016 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=rcu_bh maxcpus=8 rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 +rcu_nocbs=0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 index 1257d3227b1e..2dde0d9964e3 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05 @@ -13,7 +13,6 @@ CONFIG_HOTPLUG_CPU=y CONFIG_RCU_FANOUT=6 CONFIG_RCU_FANOUT_LEAF=6 CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_NONE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 index 099cc63c6a3b..fb1c763c10c5 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08 @@ -15,7 +15,6 @@ CONFIG_HIBERNATION=n CONFIG_RCU_FANOUT=3 CONFIG_RCU_FANOUT_LEAF=2 CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_ALL=y CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_PROVE_LOCKING=n CONFIG_RCU_BOOST=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot index fb066dc82769..1bd8efc4141e 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=sched rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_sched=1 rcutree.rcu_fanout_exact=1 +rcu_nocbs=0-7 diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 928fadaecc25..9ad3f89c8dc7 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -16,11 +16,9 @@ CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. -CONFIG_RCU_FAST_NO_HZ -- Do one, but not with CONFIG_RCU_NOCB_CPU_ALL. -CONFIG_RCU_NOCB_CPU -- Do three, see below. -CONFIG_RCU_NOCB_CPU_ALL -- Do one. -CONFIG_RCU_NOCB_CPU_NONE -- Do one. -CONFIG_RCU_NOCB_CPU_ZERO -- Do one. +CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs. +CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with + rcu_nocbs=0, and one with all rcu_nocbs CPUs. CONFIG_RCU_TRACE -- Do half. CONFIG_SMP -- Need one !SMP for PREEMPT_RCU. CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations. -- cgit v1.3-14-g43fede From 0af92d46098a092aa5817dfeb6d24a8d85b66205 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 08:43:40 -0700 Subject: rcu: Move RCU non-debug Kconfig options to kernel/rcu RCU's Kconfig options are scattered, and there are enough of them that it would be good for them to be more centralized. This commit therefore extracts RCU's Kconfig options from init/Kconfig into a new kernel/rcu/Kconfig file. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- init/Kconfig | 239 +--------------------------------------------------- kernel/rcu/Kconfig | 242 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 238 deletions(-) create mode 100644 kernel/rcu/Kconfig (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index dc431c6109f2..bc4c180c66a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -472,244 +472,7 @@ config TASK_IO_ACCOUNTING endmenu # "CPU/Task time and stats accounting" -menu "RCU Subsystem" - -config TREE_RCU - bool - default y if !PREEMPT && SMP - help - This option selects the RCU implementation that is - designed for very large SMP system with hundreds or - thousands of CPUs. It also scales down nicely to - smaller systems. - -config PREEMPT_RCU - bool - default y if PREEMPT - help - This option selects the RCU implementation that is - designed for very large SMP systems with hundreds or - thousands of CPUs, but for which real-time response - is also required. It also scales down nicely to - smaller systems. - - Select this option if you are unsure. - -config TINY_RCU - bool - default y if !PREEMPT && !SMP - help - This option selects the RCU implementation that is - designed for UP systems from which real-time response - is not required. This option greatly reduces the - memory footprint of RCU. - -config RCU_EXPERT - bool "Make expert-level adjustments to RCU configuration" - default n - help - This option needs to be enabled if you wish to make - expert-level adjustments to RCU configuration. By default, - no such adjustments can be made, which has the often-beneficial - side-effect of preventing "make oldconfig" from asking you all - sorts of detailed questions about how you would like numerous - obscure RCU options to be set up. - - Say Y if you need to make expert-level adjustments to RCU. - - Say N if you are unsure. - -config SRCU - bool - help - This option selects the sleepable version of RCU. This version - permits arbitrary sleeping or blocking within RCU read-side critical - sections. - -config TINY_SRCU - bool - default y if SRCU && TINY_RCU - help - This option selects the single-CPU non-preemptible version of SRCU. - -config TREE_SRCU - bool - default y if SRCU && !TINY_RCU - help - This option selects the full-fledged version of SRCU. - -config TASKS_RCU - bool - default n - select SRCU - help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. - -config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) - help - This option enables RCU CPU stall code that is common between - the TINY and TREE variants of RCU. The purpose is to allow - the tiny variants to disable RCU CPU stall warnings, while - making these warnings mandatory for the tree variants. - -config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) - -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - -config RCU_FANOUT - int "Tree-based hierarchical RCU fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 64 if 64BIT - default 32 if !64BIT - help - This option controls the fanout of hierarchical implementations - of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the fourth - root of NR_CPUS, which allows NR_CPUS to be insanely large. - The default value of RCU_FANOUT should be used for production - systems, but if you are stress-testing the RCU implementation - itself, small RCU_FANOUT values allow you to test large-system - code paths on small(er) systems. - - Select a specific number if testing RCU itself. - Take the default if unsure. - -config RCU_FANOUT_LEAF - int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 16 - help - This option controls the leaf-level fanout of hierarchical - implementations of RCU, and allows trading off cache misses - against lock contention. Systems that synchronize their - scheduling-clock interrupts for energy-efficiency reasons will - want the default because the smaller leaf-level fanout keeps - lock contention levels acceptably low. Very large systems - (hundreds or thousands of CPUs) will instead want to set this - value to the maximum value possible in order to reduce the - number of cache misses incurred during RCU's grace-period - initialization. These systems tend to run CPU-bound, and thus - are not helped by synchronized interrupts, and thus tend to - skew them, which reduces lock contention enough that large - leaf-level fanouts work well. That said, setting leaf-level - fanout to a large number will likely cause problematic - lock contention on the leaf-level rcu_node structures unless - you boot with the skew_tick kernel parameter. - - Select a specific number if testing RCU itself. - - Select the maximum permissible value for large systems, but - please understand that you may also need to set the skew_tick - kernel boot parameter to avoid contention on the rcu_node - structure's locks. - - Take the default if unsure. - -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - -config RCU_BOOST - bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n - help - This option boosts the priority of preempted RCU readers that - block the current preemptible RCU grace period for too long. - This option also prevents heavy loads from blocking RCU - callback invocation for all flavors of RCU. - - Say Y here if you are working with real-time apps or heavy loads - Say N here if you are unsure. - -config RCU_BOOST_DELAY - int "Milliseconds to delay boosting after RCU grace-period start" - range 0 3000 - depends on RCU_BOOST - default 500 - help - This option specifies the time to wait after the beginning of - a given grace period before priority-boosting preempted RCU - readers blocking that grace period. Note that any RCU reader - blocking an expedited RCU grace period is boosted immediately. - - Accept the default if unsure. - -config RCU_NOCB_CPU - bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU - depends on RCU_EXPERT || NO_HZ_FULL - default n - help - Use this option to reduce OS jitter for aggressive HPC or - real-time workloads. It can also be used to offload RCU - callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. - - This option offloads callback invocation from the set of - CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuox/N") will be created to - invoke callbacks, where the "N" is the CPU being offloaded, - and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and - "s" for RCU-sched. Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. - Say N here if you are unsure. - -endmenu # "RCU Subsystem" +source "kernel/rcu/Kconfig" config BUILD_BIN2C bool diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..8edff43e8e94 --- /dev/null +++ b/kernel/rcu/Kconfig @@ -0,0 +1,242 @@ +# +# RCU-related configuration options +# + +menu "RCU Subsystem" + +config TREE_RCU + bool + default y if !PREEMPT && SMP + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. It also scales down nicely to + smaller systems. + +config PREEMPT_RCU + bool + default y if PREEMPT + help + This option selects the RCU implementation that is + designed for very large SMP systems with hundreds or + thousands of CPUs, but for which real-time response + is also required. It also scales down nicely to + smaller systems. + + Select this option if you are unsure. + +config TINY_RCU + bool + default y if !PREEMPT && !SMP + help + This option selects the RCU implementation that is + designed for UP systems from which real-time response + is not required. This option greatly reduces the + memory footprint of RCU. + +config RCU_EXPERT + bool "Make expert-level adjustments to RCU configuration" + default n + help + This option needs to be enabled if you wish to make + expert-level adjustments to RCU configuration. By default, + no such adjustments can be made, which has the often-beneficial + side-effect of preventing "make oldconfig" from asking you all + sorts of detailed questions about how you would like numerous + obscure RCU options to be set up. + + Say Y if you need to make expert-level adjustments to RCU. + + Say N if you are unsure. + +config SRCU + bool + help + This option selects the sleepable version of RCU. This version + permits arbitrary sleeping or blocking within RCU read-side critical + sections. + +config TINY_SRCU + bool + default y if SRCU && TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if SRCU && !TINY_RCU + help + This option selects the full-fledged version of SRCU. + +config TASKS_RCU + bool + default n + select SRCU + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + +config RCU_STALL_COMMON + def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) + help + This option enables RCU CPU stall code that is common between + the TINY and TREE variants of RCU. The purpose is to allow + the tiny variants to disable RCU CPU stall warnings, while + making these warnings mandatory for the tree variants. + +config RCU_NEED_SEGCBLIST + def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) + +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the fourth + root of NR_CPUS, which allows NR_CPUS to be insanely large. + The default value of RCU_FANOUT should be used for production + systems, but if you are stress-testing the RCU implementation + itself, small RCU_FANOUT values allow you to test large-system + code paths on small(er) systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_LEAF + int "Tree-based hierarchical RCU leaf-level fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT + default 16 + help + This option controls the leaf-level fanout of hierarchical + implementations of RCU, and allows trading off cache misses + against lock contention. Systems that synchronize their + scheduling-clock interrupts for energy-efficiency reasons will + want the default because the smaller leaf-level fanout keeps + lock contention levels acceptably low. Very large systems + (hundreds or thousands of CPUs) will instead want to set this + value to the maximum value possible in order to reduce the + number of cache misses incurred during RCU's grace-period + initialization. These systems tend to run CPU-bound, and thus + are not helped by synchronized interrupts, and thus tend to + skew them, which reduces lock contention enough that large + leaf-level fanouts work well. That said, setting leaf-level + fanout to a large number will likely cause problematic + lock contention on the leaf-level rcu_node structures unless + you boot with the skew_tick kernel parameter. + + Select a specific number if testing RCU itself. + + Select the maximum permissible value for large systems, but + please understand that you may also need to set the skew_tick + kernel boot parameter to avoid contention on the rcu_node + structure's locks. + + Take the default if unsure. + +config RCU_FAST_NO_HZ + bool "Accelerate last non-dyntick-idle CPU's grace periods" + depends on NO_HZ_COMMON && SMP && RCU_EXPERT + default n + help + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. + + Say N if you are unsure. + +config RCU_BOOST + bool "Enable RCU priority boosting" + depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT + default n + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. + This option also prevents heavy loads from blocking RCU + callback invocation for all flavors of RCU. + + Say Y here if you are working with real-time apps or heavy loads + Say N here if you are unsure. + +config RCU_BOOST_DELAY + int "Milliseconds to delay boosting after RCU grace-period start" + range 0 3000 + depends on RCU_BOOST + default 500 + help + This option specifies the time to wait after the beginning of + a given grace period before priority-boosting preempted RCU + readers blocking that grace period. Note that any RCU reader + blocking an expedited RCU grace period is boosted immediately. + + Accept the default if unsure. + +config RCU_NOCB_CPU + bool "Offload RCU callback processing from boot-selected CPUs" + depends on TREE_RCU || PREEMPT_RCU + depends on RCU_EXPERT || NO_HZ_FULL + default n + help + Use this option to reduce OS jitter for aggressive HPC or + real-time workloads. It can also be used to offload RCU + callback invocation to energy-efficient CPUs in battery-powered + asymmetric multiprocessors. + + This option offloads callback invocation from the set of + CPUs specified at boot time by the rcu_nocbs parameter. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. + + Say Y here if you want to help to debug reduced OS jitter. + Say N here if you are unsure. + +endmenu # "RCU Subsystem" -- cgit v1.3-14-g43fede From 43a0a2a7d725f2ed2547cd656749eb66c093f2c2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 09:19:44 -0700 Subject: rcu: Move RCU debug Kconfig options to kernel/rcu RCU's debugging Kconfig options are in the unintuitive location lib/Kconfig.debug, and there are enough of them that it would be good for them to be more centralized. This commit therefore extracts RCU's Kconfig options from init/Kconfig into a new kernel/rcu/Kconfig.debug file. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 79 +--------------------------------------------- 2 files changed, 83 insertions(+), 78 deletions(-) create mode 100644 kernel/rcu/Kconfig.debug (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug @@ -0,0 +1,82 @@ +# +# RCU-related debugging configuration options +# + +menu "RCU Debugging" + +config PROVE_RCU + def_bool PROVE_LOCKING + +config TORTURE_TEST + tristate + default n + +config RCU_PERF_TEST + tristate "performance tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs performance + tests on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU performance tests to be built into + the kernel. + Say M if you want the RCU performance tests to build as a module. + Say N if you are unsure. + +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + select TORTURE_TEST + select SRCU + select TASKS_RCU + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to be built into + the kernel. + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. + +config RCU_CPU_STALL_TIMEOUT + int "RCU CPU stall timeout in seconds" + depends on RCU_STALL_COMMON + range 3 300 + default 21 + help + If a given RCU grace period extends more than the specified + number of seconds, a CPU stall warning is printed. If the + RCU grace period persists, additional CPU stall warnings are + printed at more widely spaced intervals. + +config RCU_TRACE + bool "Enable tracing for RCU" + depends on DEBUG_KERNEL + default y if TREE_RCU + select TRACE_CLOCK + help + This option enables additional tracepoints for ftrace-style + event tracing. + + Say Y here if you want to enable RCU tracing + Say N if you are unsure. + +config RCU_EQS_DEBUG + bool "Provide debugging asserts for adding NO_HZ support to an arch" + depends on DEBUG_KERNEL + help + This option provides consistency checks in RCU's handling of + NO_HZ. These checks have proven quite helpful in detecting + bugs in arch-specific NO_HZ code. + + Say N here if you need ultimate kernel/user switch latencies + Say Y if you are unsure + +endmenu # "RCU Debugging" diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c10b5a97b9e..a7a751a75cfd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1301,84 +1301,7 @@ config DEBUG_CREDENTIALS If unsure, say N. -menu "RCU Debugging" - -config PROVE_RCU - def_bool PROVE_LOCKING - -config TORTURE_TEST - tristate - default n - -config RCU_PERF_TEST - tristate "performance tests for RCU" - depends on DEBUG_KERNEL - select TORTURE_TEST - select SRCU - select TASKS_RCU - default n - help - This option provides a kernel module that runs performance - tests on the RCU infrastructure. The kernel module may be built - after the fact on the running kernel to be tested, if desired. - - Say Y here if you want RCU performance tests to be built into - the kernel. - Say M if you want the RCU performance tests to build as a module. - Say N if you are unsure. - -config RCU_TORTURE_TEST - tristate "torture tests for RCU" - depends on DEBUG_KERNEL - select TORTURE_TEST - select SRCU - select TASKS_RCU - default n - help - This option provides a kernel module that runs torture tests - on the RCU infrastructure. The kernel module may be built - after the fact on the running kernel to be tested, if desired. - - Say Y here if you want RCU torture tests to be built into - the kernel. - Say M if you want the RCU torture tests to build as a module. - Say N if you are unsure. - -config RCU_CPU_STALL_TIMEOUT - int "RCU CPU stall timeout in seconds" - depends on RCU_STALL_COMMON - range 3 300 - default 21 - help - If a given RCU grace period extends more than the specified - number of seconds, a CPU stall warning is printed. If the - RCU grace period persists, additional CPU stall warnings are - printed at more widely spaced intervals. - -config RCU_TRACE - bool "Enable tracing for RCU" - depends on DEBUG_KERNEL - default y if TREE_RCU - select TRACE_CLOCK - help - This option enables additional tracepoints for ftrace-style - event tracing. - - Say Y here if you want to enable RCU tracing - Say N if you are unsure. - -config RCU_EQS_DEBUG - bool "Provide debugging asserts for adding NO_HZ support to an arch" - depends on DEBUG_KERNEL - help - This option provides consistency checks in RCU's handling of - NO_HZ. These checks have proven quite helpful in detecting - bugs in arch-specific NO_HZ code. - - Say N here if you need ultimate kernel/user switch latencies - Say Y if you are unsure - -endmenu # "RCU Debugging" +source "kernel/rcu/Kconfig.debug" config DEBUG_WQ_FORCE_RR_CPU bool "Force round-robin CPU selection for unbound work items" -- cgit v1.3-14-g43fede From c23484f0e7bc89e1facb04103ce24efeebee76b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 10:17:03 -0700 Subject: rcu: Remove event tracing from Tiny RCU This commit saves a few lines by getting rid of Tiny RCU's event tracing. Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 2306cab2195d..595cb1bf944f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "rcu.h" @@ -139,7 +138,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -161,10 +159,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) RCU_TRACE(cb_count++;) } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -- cgit v1.3-14-g43fede From 6d48152eafde1f0d0a4a9e0584fa7d9ff4fbfdac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 May 2017 10:54:29 -0700 Subject: rcu: Remove RCU CPU stall warnings from Tiny RCU Tiny RCU's job is to be tiny, so this commit removes its RCU CPU stall warning code. After this, there is no longer any need for rcu_sched_ctrlblk and rcu_bh_ctrlblk to be in tiny_plugin.h, so this commit also moves them to tiny.c. Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 2 +- kernel/rcu/tiny.c | 35 +++++----- kernel/rcu/tiny_plugin.h | 78 ---------------------- .../selftests/rcutorture/configs/rcu/TINY02 | 1 - 4 files changed, 19 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 8edff43e8e94..be90c945063f 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -78,7 +78,7 @@ config TASKS_RCU user-mode execution as quiescent states. config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) + def_bool ( TREE_RCU || PREEMPT_RCU ) help This option enables RCU CPU stall code that is common between the TINY and TREE variants of RCU. The purpose is to allow diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 595cb1bf944f..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -38,11 +38,23 @@ #include "rcu.h" -/* Forward declarations for tiny_plugin.h. */ -struct rcu_ctrlblk; -static void __call_rcu(struct rcu_head *head, - rcu_callback_t func, - struct rcu_ctrlblk *rcp); +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; #include "tiny_plugin.h" @@ -65,7 +77,6 @@ EXPORT_SYMBOL(rcu_barrier_sched); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -111,7 +122,6 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -126,10 +136,8 @@ void rcu_check_callbacks(int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -147,18 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(rn, list); + __rcu_reclaim("", list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) } static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) @@ -202,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -235,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) - rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c642f23f1582..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -22,34 +22,6 @@ * Author: Paul E. McKenney */ -#include -#include - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include @@ -73,53 +45,3 @@ void __init rcu_scheduler_starting(void) } #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = READ_ONCE(rcp->jiffies_stall); - if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - WRITE_ONCE(rcp->jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - } else if (ULONG_CMP_GE(j, js)) { - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); - } -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - WRITE_ONCE(rcp->jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 index 1f6bebbf5da8..d8674264318d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02 @@ -6,7 +6,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=y CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=n -CONFIG_RCU_TRACE=y CONFIG_PROVE_LOCKING=y #CHECK#CONFIG_PROVE_RCU=y CONFIG_DEBUG_LOCK_ALLOC=y -- cgit v1.3-14-g43fede From 67e707bd68269aac70904943f07a979eeb163b13 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 8 Jun 2017 18:09:09 +0530 Subject: config: android-recommended: enable fstack-protector-strong If compiler has stack protector support, set CONFIG_CC_STACKPROTECTOR_STRONG. Reviewed-at: https://android-review.googlesource.com/#/c/238388/ Signed-off-by: Jeff Vander Stoep [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 28ee064b6744..a86faa41bfd2 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -11,6 +11,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_STRICT_KERNEL_RWX=y CONFIG_DM_CRYPT=y -- cgit v1.3-14-g43fede From 0c9238c7a1cfd834d8bb96a2b1fabe0b1a5961df Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:10 +0530 Subject: config: android-recommended: enable CONFIG_ARM64_SW_TTBR0_PAN Enable PAN emulation using TTBR0_EL1 switching. Reviewed-at: https://android-review.googlesource.com/#/c/325997/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel and updated the commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a86faa41bfd2..a02c447769f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -6,6 +6,7 @@ # CONFIG_NF_CONNTRACK_SIP is not set # CONFIG_PM_WAKELOCKS_GC is not set # CONFIG_VT is not set +CONFIG_ARM64_SW_TTBR0_PAN=y CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y -- cgit v1.3-14-g43fede From c1ebc2febdb85a73a4f91a9b9eaab6387619eaa6 Mon Sep 17 00:00:00 2001 From: Max Shi Date: Thu, 8 Jun 2017 18:09:11 +0530 Subject: config: android-base: disable CONFIG_USELIB and CONFIG_FHANDLE Turn off the two kernel configs to disable related system ABI. Reviewed-at: https://android-review.googlesource.com/#/c/264976/ Signed-off-by: Max Shi [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 26a06e09a5bd..efe5ff86767e 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,10 +1,12 @@ # KEEP ALPHABETICALLY SORTED # CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set # CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set CONFIG_ANDROID=y CONFIG_ANDROID_BINDER_IPC=y CONFIG_ANDROID_LOW_MEMORY_KILLER=y -- cgit v1.3-14-g43fede From fb0b1538983c1cf7d2a2242b332a34a953753624 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 8 Jun 2017 18:09:12 +0530 Subject: config: android-recommended: enable CONFIG_CPU_SW_DOMAIN_PAN Enable CPU domain PAN to ensure that normal kernel accesses are unable to access userspace addresses. Reviewed-at: https://android-review.googlesource.com/#/c/334035/ Signed-off-by: Sami Tolvanen [AmitP: cherry-picked this change from Android common kernel, updated the commit message and re-placed the CONFIG_STRICT_KERNEL_RWX config in sorted order] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-recommended.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index a02c447769f7..946fb92418f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -14,7 +14,7 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 CONFIG_CC_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y -CONFIG_STRICT_KERNEL_RWX=y +CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DM_CRYPT=y CONFIG_DM_UEVENT=y CONFIG_DM_VERITY=y @@ -107,6 +107,7 @@ CONFIG_SCHEDSTATS=y CONFIG_SMARTJOYPLUS_FF=y CONFIG_SND=y CONFIG_SOUND=y +CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -- cgit v1.3-14-g43fede From 5b89db2fa545b473dc352689ac3afe407367ea34 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:13 +0530 Subject: config: android-base: add CONFIG_IKCONFIG option This adds CONFIG_IKCONFIG and CONFIG_IKCONFIG_PROC options, which are a requirement for the O release. Reviewed-at: https://android-review.googlesource.com/#/c/364553/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index efe5ff86767e..e12cfec25758 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -25,6 +25,8 @@ CONFIG_EMBEDDED=y CONFIG_FB=y CONFIG_HARDENED_USERCOPY=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_INET6_IPCOMP=y -- cgit v1.3-14-g43fede From 2096e1706336d83cd66ca744e4d904af4d63e25c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jun 2017 18:09:14 +0530 Subject: config: android-base: add CONFIG_MODULES option This adds CONFIG_MODULES, CONFIG_MODULE_UNLOAD, and CONFIG_MODVERSIONS which are required by the O release. Reviewed-at: https://android-review.googlesource.com/#/c/364554/ Signed-off-by: Greg Kroah-Hartman [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index e12cfec25758..62cb392fc34b 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,7 +3,6 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set -# CONFIG_MODULES is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set @@ -64,6 +63,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y CONFIG_IP_NF_TARGET_NETMAP=y CONFIG_IP_NF_TARGET_REDIRECT=y CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y CONFIG_NET=y CONFIG_NETDEVICES=y CONFIG_NETFILTER=y -- cgit v1.3-14-g43fede From 2edfe6be206adc4c1055e053322d27267f8952bc Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 8 Jun 2017 18:09:15 +0530 Subject: config: android-base: add CGROUP_BPF Add CONFIG_CGROUP_BPF as a default configuration in android base config since it is used to replace XT_QTAGUID in future. Reviewed-at: https://android-review.googlesource.com/#/c/400374/ Signed-off-by: Chenbo Feng [AmitP: cherry-picked this change from Android common kernel] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 62cb392fc34b..cdde5af6b332 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -14,6 +14,7 @@ CONFIG_ASHMEM=y CONFIG_AUDIT=y CONFIG_BLK_DEV_INITRD=y CONFIG_CGROUPS=y +CONFIG_CGROUP_BPF=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_FREEZER=y -- cgit v1.3-14-g43fede From 9e69dd0179c346dfb5d08b8d46d5f5c9c81ab1b7 Mon Sep 17 00:00:00 2001 From: Roberto Pereira Date: Thu, 8 Jun 2017 18:09:16 +0530 Subject: config: android-base: disable CONFIG_NFSD and CONFIG_NFS_FS Disable Network file system support. Reviewed-at: https://android-review.googlesource.com/#/c/409559/ Signed-off-by: Roberto Pereira [AmitP: cherry-picked this change from Android common kernel and updated commit message] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- kernel/configs/android-base.config | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index cdde5af6b332..d70829033bb7 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -3,6 +3,8 @@ # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set # CONFIG_OABI_COMPAT is not set # CONFIG_SYSVIPC is not set # CONFIG_USELIB is not set -- cgit v1.3-14-g43fede From 4e4cbee93d56137ebff722be022cae5f70ef84fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 3 Jun 2017 09:38:06 +0200 Subject: block: switch bios to blk_status_t Replace bi_error with a new bi_status to allow for a clear conversion. Note that device mapper overloaded bi_error with a private value, which we'll have to keep arround at least for now and thus propagate to a proper blk_status_t value. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 8 ++-- block/bio.c | 8 ++-- block/blk-core.c | 20 ++++++---- block/blk-integrity.c | 4 +- block/bounce.c | 4 +- block/t10-pi.c | 30 +++++++-------- drivers/block/aoe/aoecmd.c | 10 ++--- drivers/block/aoe/aoedev.c | 2 +- drivers/block/drbd/drbd_actlog.c | 2 +- drivers/block/drbd/drbd_bitmap.c | 6 +-- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_receiver.c | 6 +-- drivers/block/drbd/drbd_req.c | 6 +-- drivers/block/drbd/drbd_worker.c | 16 ++++---- drivers/block/floppy.c | 4 +- drivers/block/pktcdvd.c | 18 ++++----- drivers/block/ps3vram.c | 14 +++---- drivers/block/rsxx/dev.c | 14 +++---- drivers/block/rsxx/dma.c | 13 +++---- drivers/block/rsxx/rsxx_priv.h | 2 +- drivers/block/umem.c | 2 +- drivers/block/xen-blkback/blkback.c | 19 ++++------ drivers/block/xen-blkfront.c | 2 +- drivers/lightnvm/pblk-core.c | 4 +- drivers/lightnvm/pblk-read.c | 4 +- drivers/lightnvm/pblk-write.c | 2 +- drivers/lightnvm/rrpc.c | 8 ++-- drivers/md/bcache/bcache.h | 7 ++-- drivers/md/bcache/btree.c | 6 +-- drivers/md/bcache/io.c | 6 +-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/movinggc.c | 10 ++--- drivers/md/bcache/request.c | 28 +++++++------- drivers/md/bcache/request.h | 2 +- drivers/md/bcache/super.c | 6 +-- drivers/md/bcache/writeback.c | 4 +- drivers/md/dm-bio-prison-v1.c | 4 +- drivers/md/dm-bio-prison-v1.h | 2 +- drivers/md/dm-bufio.c | 28 +++++++------- drivers/md/dm-cache-target.c | 34 +++++++++-------- drivers/md/dm-crypt.c | 34 ++++++++--------- drivers/md/dm-flakey.c | 5 ++- drivers/md/dm-integrity.c | 18 ++++----- drivers/md/dm-io.c | 10 ++--- drivers/md/dm-log-writes.c | 7 ++-- drivers/md/dm-mpath.c | 15 ++++---- drivers/md/dm-raid1.c | 13 ++++--- drivers/md/dm-rq.c | 2 +- drivers/md/dm-snap.c | 5 ++- drivers/md/dm-stripe.c | 5 ++- drivers/md/dm-thin.c | 65 ++++++++++++++++---------------- drivers/md/dm-verity-target.c | 10 ++--- drivers/md/dm.c | 40 ++++++++++---------- drivers/md/md.c | 8 ++-- drivers/md/multipath.c | 10 ++--- drivers/md/raid1.c | 36 +++++++++--------- drivers/md/raid10.c | 36 +++++++++--------- drivers/md/raid5-cache.c | 4 +- drivers/md/raid5-ppl.c | 2 +- drivers/md/raid5.c | 22 +++++------ drivers/nvdimm/blk.c | 4 +- drivers/nvdimm/btt.c | 4 +- drivers/nvdimm/pmem.c | 28 +++++++------- drivers/nvme/target/io-cmd.c | 4 +- drivers/target/target_core_iblock.c | 10 ++--- fs/block_dev.c | 18 +++++---- fs/btrfs/btrfs_inode.h | 3 +- fs/btrfs/check-integrity.c | 4 +- fs/btrfs/compression.c | 44 +++++++++++----------- fs/btrfs/compression.h | 4 +- fs/btrfs/ctree.h | 6 +-- fs/btrfs/disk-io.c | 75 ++++++++++++++++++------------------- fs/btrfs/disk-io.h | 12 +++--- fs/btrfs/extent_io.c | 23 +++++++----- fs/btrfs/extent_io.h | 6 +-- fs/btrfs/file-item.c | 14 +++---- fs/btrfs/inode.c | 73 ++++++++++++++++++------------------ fs/btrfs/raid56.c | 16 ++++---- fs/btrfs/scrub.c | 26 ++++++------- fs/btrfs/volumes.c | 11 +++--- fs/buffer.c | 2 +- fs/crypto/bio.c | 2 +- fs/direct-io.c | 8 ++-- fs/ext4/page-io.c | 13 ++++--- fs/ext4/readpage.c | 4 +- fs/f2fs/data.c | 10 ++--- fs/f2fs/segment.c | 2 +- fs/gfs2/lops.c | 8 ++-- fs/gfs2/meta_io.c | 2 +- fs/gfs2/ops_fstype.c | 4 +- fs/iomap.c | 4 +- fs/jfs/jfs_logmgr.c | 2 +- fs/jfs/jfs_metapage.c | 4 +- fs/mpage.c | 3 +- fs/nfs/blocklayout/blocklayout.c | 4 +- fs/nilfs2/segbuf.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 6 +-- fs/xfs/xfs_aops.c | 7 ++-- fs/xfs/xfs_buf.c | 7 +++- include/linux/bio.h | 2 +- include/linux/blk_types.h | 5 ++- include/linux/blkdev.h | 2 +- include/linux/device-mapper.h | 2 +- kernel/power/swap.c | 14 +++---- kernel/trace/blktrace.c | 4 +- mm/page_io.c | 4 +- 106 files changed, 625 insertions(+), 603 deletions(-) (limited to 'kernel') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 5384713d48bc..17b9740e138b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -221,7 +221,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, * @bio: bio to generate/verify integrity metadata for * @proc_fn: Pointer to the relevant processing function */ -static int bio_integrity_process(struct bio *bio, +static blk_status_t bio_integrity_process(struct bio *bio, integrity_processing_fn *proc_fn) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); @@ -229,7 +229,7 @@ static int bio_integrity_process(struct bio *bio, struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); - unsigned int ret = 0; + blk_status_t ret = BLK_STS_OK; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; @@ -366,7 +366,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); + bio->bi_status = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; @@ -395,7 +395,7 @@ void bio_integrity_endio(struct bio *bio) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (bio->bi_error) { + if (bio->bi_status) { bio->bi_end_io = bip->bip_end_io; bio_endio(bio); diff --git a/block/bio.c b/block/bio.c index 888e7801c638..7a5c8ed27f42 100644 --- a/block/bio.c +++ b/block/bio.c @@ -309,8 +309,8 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_error) - parent->bi_error = bio->bi_error; + if (!parent->bi_status) + parent->bi_status = bio->bi_status; bio_put(bio); return parent; } @@ -918,7 +918,7 @@ static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->error = blk_status_to_errno(bio->bi_status); complete(&ret->event); } @@ -1818,7 +1818,7 @@ again: if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), - bio, bio->bi_error); + bio, bio->bi_status); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-core.c b/block/blk-core.c index e942a9f814c7..3d84820ace9e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -144,6 +144,9 @@ static const struct { [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + /* device mapper special case, should not leak out: */ + [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -188,7 +191,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { if (error) - bio->bi_error = blk_status_to_errno(error); + bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); @@ -1717,7 +1720,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) blk_queue_split(q, &bio, q->bio_split); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -1775,7 +1778,10 @@ get_rq: req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { __wbt_done(q->rq_wb, wb_acct); - bio->bi_error = PTR_ERR(req); + if (PTR_ERR(req) == -ENOMEM) + bio->bi_status = BLK_STS_RESOURCE; + else + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); goto out_unlock; } @@ -1930,7 +1936,7 @@ generic_make_request_checks(struct bio *bio) { struct request_queue *q; int nr_sectors = bio_sectors(bio); - int err = -EIO; + blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; struct hd_struct *part; @@ -1973,7 +1979,7 @@ generic_make_request_checks(struct bio *bio) !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { - err = 0; + status = BLK_STS_OK; goto end_io; } } @@ -2025,9 +2031,9 @@ generic_make_request_checks(struct bio *bio) return true; not_supported: - err = -EOPNOTSUPP; + status = BLK_STS_NOTSUPP; end_io: - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); return false; } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 0f891a9aff4d..feb30570eaf5 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -384,9 +384,9 @@ static struct kobj_type integrity_ktype = { .sysfs_ops = &integrity_ops, }; -static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) +static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - return 0; + return BLK_STS_OK; } static const struct blk_integrity_profile nop_profile = { diff --git a/block/bounce.c b/block/bounce.c index 1cb5dd3a5da1..e4703181d97f 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -143,7 +143,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) mempool_free(bvec->bv_page, pool); } - bio_orig->bi_error = bio->bi_error; + bio_orig->bi_status = bio->bi_status; bio_endio(bio_orig); bio_put(bio); } @@ -163,7 +163,7 @@ static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); bounce_end_io(bio, pool); diff --git a/block/t10-pi.c b/block/t10-pi.c index 680c6d636298..350b3cbcf9e5 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -46,8 +46,8 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref * tag. */ -static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -67,11 +67,11 @@ static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, - unsigned int type) +static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, + csum_fn *fn, unsigned int type) { unsigned int i; @@ -108,7 +108,7 @@ static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, "(rcvd %04x, want %04x)\n", iter->disk_name, (unsigned long long)iter->seed, be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); - return -EILSEQ; + return BLK_STS_PROTECTION; } next: @@ -117,45 +117,45 @@ next: iter->seed++; } - return 0; + return BLK_STS_OK; } -static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 1); } -static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 1); } -static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 1); } -static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) { return t10_pi_generate(iter, t10_pi_ip_fn, 3); } -static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_crc_fn, 3); } -static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) { return t10_pi_verify(iter, t10_pi_ip_fn, 3); } diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 5bf0c9d21fc1..dc43254e05a4 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1070,7 +1070,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) d->ip.rq = NULL; do { bio = rq->bio; - bok = !fastfail && !bio->bi_error; + bok = !fastfail && !bio->bi_status; } while (__blk_end_request(rq, bok ? BLK_STS_OK : BLK_STS_IOERR, bio->bi_iter.bi_size)); /* cf. http://lkml.org/lkml/2006/10/31/28 */ @@ -1131,7 +1131,7 @@ ktiocomplete(struct frame *f) ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); noskb: if (buf) - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1144,7 +1144,7 @@ noskb: if (buf) "aoe: runt data size in read from", (long) d->aoemajor, d->aoeminor, skb->len, n); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } if (n > f->iter.bi_size) { @@ -1152,7 +1152,7 @@ noskb: if (buf) "aoe: too-large data size in read from", (long) d->aoemajor, d->aoeminor, n, f->iter.bi_size); - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; break; } bvcpy(skb, f->buf->bio, f->iter, n); @@ -1654,7 +1654,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf) if (buf == NULL) return; buf->iter.bi_size = 0; - buf->bio->bi_error = -EIO; + buf->bio->bi_status = BLK_STS_IOERR; if (buf->nframesout == 0) aoe_end_buf(d, buf); } diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index ffd1947500c6..b28fefb90391 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d) if (rq == NULL) return; while ((bio = d->ip.nxbio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; d->ip.nxbio = bio->bi_next; n = (unsigned long) rq->special; rq->special = (void *) --n; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 8d7bcfa49c12..e02c45cd3c5a 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -178,7 +178,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, else submit_bio(bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); - if (!bio->bi_error) + if (!bio->bi_status) err = device->md_io.error; out: diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index a804a4107fbc..809fd245c3dc 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -959,16 +959,16 @@ static void drbd_bm_endio(struct bio *bio) !bm_test_page_unchanged(b->bm_pages[idx])) drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); - if (bio->bi_error) { + if (bio->bi_status) { /* ctx error will hold the completed-last non-zero error code, * in case error codes differ. */ - ctx->error = bio->bi_error; + ctx->error = blk_status_to_errno(bio->bi_status); bm_set_page_io_err(b->bm_pages[idx]); /* Not identical to on disk version of it. * Is BM_PAGE_IO_ERROR enough? */ if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", - bio->bi_error, idx); + bio->bi_status, idx); } else { bm_clear_page_io_err(b->bm_pages[idx]); dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index d5da45bb03a6..76761b4ca13e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1627,7 +1627,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device, __release(local); if (!bio->bi_bdev) { drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); - bio->bi_error = -ENODEV; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 1b0a2be24f39..c7e95e6380fb 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1229,9 +1229,9 @@ void one_flush_endio(struct bio *bio) struct drbd_device *device = octx->device; struct issue_flush_context *ctx = octx->ctx; - if (bio->bi_error) { - ctx->error = bio->bi_error; - drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error); + if (bio->bi_status) { + ctx->error = blk_status_to_errno(bio->bi_status); + drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status); } kfree(octx); bio_put(bio); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 656624314f0d..fca6b9914948 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -203,7 +203,7 @@ void start_new_tl_epoch(struct drbd_connection *connection) void complete_master_bio(struct drbd_device *device, struct bio_and_error *m) { - m->bio->bi_error = m->error; + m->bio->bi_status = errno_to_blk_status(m->error); bio_endio(m->bio); dec_ap_bio(device); } @@ -1157,7 +1157,7 @@ static void drbd_process_discard_req(struct drbd_request *req) if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9, GFP_NOIO, 0)) - req->private_bio->bi_error = -EIO; + req->private_bio->bi_status = BLK_STS_IOERR; bio_endio(req->private_bio); } @@ -1225,7 +1225,7 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* only pass the error to the upper layers. * if user cannot handle io errors, that's not our business. */ drbd_err(device, "could not kmalloc() req\n"); - bio->bi_error = -ENOMEM; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return ERR_PTR(-ENOMEM); } diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 1afcb4e02d8d..1d8726a8df34 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -63,7 +63,7 @@ void drbd_md_endio(struct bio *bio) struct drbd_device *device; device = bio->bi_private; - device->md_io.error = bio->bi_error; + device->md_io.error = blk_status_to_errno(bio->bi_status); /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. @@ -177,13 +177,13 @@ void drbd_peer_request_endio(struct bio *bio) bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES || bio_op(bio) == REQ_OP_DISCARD; - if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) + if (bio->bi_status && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", is_write ? (is_discard ? "discard" : "write") - : "read", bio->bi_error, + : "read", bio->bi_status, (unsigned long long)peer_req->i.sector); - if (bio->bi_error) + if (bio->bi_status) set_bit(__EE_WAS_ERROR, &peer_req->flags); bio_put(bio); /* no need for the bio anymore */ @@ -243,16 +243,16 @@ void drbd_request_endio(struct bio *bio) if (__ratelimit(&drbd_ratelimit_state)) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); - if (!bio->bi_error) + if (!bio->bi_status) drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { switch (bio_op(bio)) { case REQ_OP_WRITE_ZEROES: case REQ_OP_DISCARD: - if (bio->bi_error == -EOPNOTSUPP) + if (bio->bi_status == BLK_STS_NOTSUPP) what = DISCARD_COMPLETED_NOTSUPP; else what = DISCARD_COMPLETED_WITH_ERROR; @@ -272,7 +272,7 @@ void drbd_request_endio(struct bio *bio) } bio_put(req->private_bio); - req->private_bio = ERR_PTR(bio->bi_error); + req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status)); /* not req_mod(), we need irqsave here! */ spin_lock_irqsave(&device->resource->req_lock, flags); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cc75a5176057..9e3cb32e365d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3780,9 +3780,9 @@ static void floppy_rb0_cb(struct bio *bio) struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; int drive = cbdata->drive; - if (bio->bi_error) { + if (bio->bi_status) { pr_info("floppy: error %d while reading block 0\n", - bio->bi_error); + bio->bi_status); set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); } complete(&cbdata->complete); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 42e3c880a8a5..e8a381161db6 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -952,9 +952,9 @@ static void pkt_end_io_read(struct bio *bio) pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_error); + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&pkt->io_errors); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); @@ -969,7 +969,7 @@ static void pkt_end_io_packet_write(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); pd->stats.pkt_ended++; @@ -1305,16 +1305,16 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_queue_bio(pd, pkt->w_bio); } -static void pkt_finish_packet(struct packet_data *pkt, int error) +static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) { struct bio *bio; - if (error) + if (status) pkt->cache_valid = 0; /* Finish all bios corresponding to this packet */ while ((bio = bio_list_pop(&pkt->orig_bios))) { - bio->bi_error = error; + bio->bi_status = status; bio_endio(bio); } } @@ -1349,7 +1349,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (atomic_read(&pkt->io_wait) > 0) return; - if (!pkt->w_bio->bi_error) { + if (!pkt->w_bio->bi_status) { pkt_set_state(pkt, PACKET_FINISHED_STATE); } else { pkt_set_state(pkt, PACKET_RECOVERY_STATE); @@ -1366,7 +1366,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data break; case PACKET_FINISHED_STATE: - pkt_finish_packet(pkt, pkt->w_bio->bi_error); + pkt_finish_packet(pkt, pkt->w_bio->bi_status); return; default: @@ -2301,7 +2301,7 @@ static void pkt_end_io_read_cloned(struct bio *bio) struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; - psd->bio->bi_error = bio->bi_error; + psd->bio->bi_status = bio->bi_status; bio_put(bio); bio_endio(psd->bio); mempool_free(psd, psd_pool); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 456b4fe21559..6fa2b8197013 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -428,7 +428,7 @@ static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev) kfree(priv->cache.tags); } -static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, +static blk_status_t ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -438,7 +438,7 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, (unsigned int)from, len); if (from >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - from) len = priv->size - from; @@ -472,14 +472,14 @@ static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, return 0; } -static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, +static blk_status_t ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); unsigned int cached, count; if (to >= priv->size) - return -EIO; + return BLK_STS_IOERR; if (len > priv->size - to) len = priv->size - to; @@ -554,7 +554,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, int write = bio_data_dir(bio) == WRITE; const char *op = write ? "write" : "read"; loff_t offset = bio->bi_iter.bi_sector << 9; - int error = 0; + blk_status_t error = 0; struct bio_vec bvec; struct bvec_iter iter; struct bio *next; @@ -578,7 +578,7 @@ static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, if (retlen != len) { dev_err(&dev->core, "Short %s\n", op); - error = -EIO; + error = BLK_STS_IOERR; goto out; } @@ -593,7 +593,7 @@ out: next = bio_list_peek(&priv->list); spin_unlock_irq(&priv->lock); - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); return next; } diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 9c566364ac9c..0b0a0a902355 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -149,7 +149,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) { struct rsxx_cardinfo *card = q->queuedata; struct rsxx_bio_meta *bio_meta; - int st = -EINVAL; + blk_status_t st = BLK_STS_IOERR; blk_queue_split(q, &bio, q->bio_split); @@ -161,15 +161,11 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) if (bio_end_sector(bio) > get_capacity(card->gendisk)) goto req_err; - if (unlikely(card->halt)) { - st = -EFAULT; + if (unlikely(card->halt)) goto req_err; - } - if (unlikely(card->dma_fault)) { - st = (-EFAULT); + if (unlikely(card->dma_fault)) goto req_err; - } if (bio->bi_iter.bi_size == 0) { dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); @@ -178,7 +174,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); if (!bio_meta) { - st = -ENOMEM; + st = BLK_STS_RESOURCE; goto req_err; } @@ -205,7 +201,7 @@ queue_err: kmem_cache_free(bio_meta_pool, bio_meta); req_err: if (st) - bio->bi_error = st; + bio->bi_status = st; bio_endio(bio); return BLK_QC_T_NONE; } diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index 5a20385f87d0..6a1b2177951c 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -611,7 +611,7 @@ static void rsxx_schedule_done(struct work_struct *work) mutex_unlock(&ctrl->work_lock); } -static int rsxx_queue_discard(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_discard(struct rsxx_cardinfo *card, struct list_head *q, unsigned int laddr, rsxx_dma_cb cb, @@ -621,7 +621,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = HW_CMD_BLK_DISCARD; dma->laddr = laddr; @@ -640,7 +640,7 @@ static int rsxx_queue_discard(struct rsxx_cardinfo *card, return 0; } -static int rsxx_queue_dma(struct rsxx_cardinfo *card, +static blk_status_t rsxx_queue_dma(struct rsxx_cardinfo *card, struct list_head *q, int dir, unsigned int dma_off, @@ -655,7 +655,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); if (!dma) - return -ENOMEM; + return BLK_STS_RESOURCE; dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; dma->laddr = laddr; @@ -677,7 +677,7 @@ static int rsxx_queue_dma(struct rsxx_cardinfo *card, return 0; } -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, @@ -694,7 +694,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, unsigned int dma_len; int dma_cnt[RSXX_MAX_TARGETS]; int tgt; - int st; + blk_status_t st; int i; addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ @@ -769,7 +769,6 @@ bvec_err: for (i = 0; i < card->n_targets; i++) rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], FREE_DMA); - return st; } diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 6bbc64d0f690..277f27e673a2 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -391,7 +391,7 @@ int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); void rsxx_dma_cleanup(void); void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); int rsxx_dma_configure(struct rsxx_cardinfo *card); -int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, +blk_status_t rsxx_dma_queue_bio(struct rsxx_cardinfo *card, struct bio *bio, atomic_t *n_dmas, rsxx_dma_cb cb, diff --git a/drivers/block/umem.c b/drivers/block/umem.c index c141cc3be22b..4b3c947697b1 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -454,7 +454,7 @@ static void process_page(unsigned long data) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (control & DMASCR_HARD_ERROR) { /* error */ - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; dev_printk(KERN_WARNING, &card->dev->dev, "I/O error on sector %d/%d\n", le32_to_cpu(desc->local_addr)>>9, diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 726c32e35db9..746bd8c8c09a 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1069,20 +1069,17 @@ static void xen_blk_drain_io(struct xen_blkif_ring *ring) atomic_set(&blkif->drain, 0); } -/* - * Completion callback on the bio's. Called as bh->b_end_io() - */ - -static void __end_block_io_op(struct pending_req *pending_req, int error) +static void __end_block_io_op(struct pending_req *pending_req, + blk_status_t error) { /* An error fails the entire request. */ - if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && - (error == -EOPNOTSUPP)) { + if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE && + error == BLK_STS_NOTSUPP) { pr_debug("flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; - } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && - (error == -EOPNOTSUPP)) { + } else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER && + error == BLK_STS_NOTSUPP) { pr_debug("write barrier op failed, not supported\n"); xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; @@ -1106,7 +1103,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) */ static void end_block_io_op(struct bio *bio) { - __end_block_io_op(bio->bi_private, bio->bi_error); + __end_block_io_op(bio->bi_private, bio->bi_status); bio_put(bio); } @@ -1423,7 +1420,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, for (i = 0; i < nbio; i++) bio_put(biolist[i]); atomic_set(&pending_req->pendcnt, 1); - __end_block_io_op(pending_req, -EINVAL); + __end_block_io_op(pending_req, BLK_STS_RESOURCE); msleep(1); /* back off a bit */ return -EIO; } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2f468cf86dcf..e3be666c2776 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2006,7 +2006,7 @@ static void split_bio_end(struct bio *bio) if (atomic_dec_and_test(&split_bio->pending)) { split_bio->bio->bi_phys_segments = 0; - split_bio->bio->bi_error = bio->bi_error; + split_bio->bio->bi_status = bio->bi_status; bio_endio(split_bio->bio); kfree(split_bio); } diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 5e44768ccffa..4e0de995cd90 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -296,8 +296,8 @@ void pblk_flush_writer(struct pblk *pblk) pr_err("pblk: tear down bio failed\n"); } - if (bio->bi_error) - pr_err("pblk: flush sync write failed (%u)\n", bio->bi_error); + if (bio->bi_status) + pr_err("pblk: flush sync write failed (%u)\n", bio->bi_status); bio_put(bio); } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4a12f14d78c6..762c0b73cb67 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -114,7 +114,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) pblk_log_read_err(pblk, rqd); #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n"); + WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif if (rqd->nr_ppas > 1) @@ -123,7 +123,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) bio_put(bio); if (r_ctx->orig_bio) { #ifdef CONFIG_NVM_DEBUG - WARN_ONCE(r_ctx->orig_bio->bi_error, + WARN_ONCE(r_ctx->orig_bio->bi_status, "pblk: corrupted read bio\n"); #endif bio_endio(r_ctx->orig_bio); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index aef6fd7c4a0c..79b90d8dbcb3 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -186,7 +186,7 @@ static void pblk_end_io_write(struct nvm_rq *rqd) } #ifdef CONFIG_NVM_DEBUG else - WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n"); + WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); #endif pblk_complete_write(pblk, rqd, c_ctx); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf0e28a0ff61..8d3b53bb3307 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -279,8 +279,8 @@ static void rrpc_end_sync_bio(struct bio *bio) { struct completion *waiting = bio->bi_private; - if (bio->bi_error) - pr_err("nvm: gc request failed (%u).\n", bio->bi_error); + if (bio->bi_status) + pr_err("nvm: gc request failed (%u).\n", bio->bi_status); complete(waiting); } @@ -359,7 +359,7 @@ try: goto finished; } wait_for_completion_io(&wait); - if (bio->bi_error) { + if (bio->bi_status) { rrpc_inflight_laddr_release(rrpc, rqd); goto finished; } @@ -385,7 +385,7 @@ try: wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); - if (bio->bi_error) + if (bio->bi_status) goto finished; bio_reset(bio); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c3ea03c9a1a8..dee542fff68e 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -849,10 +849,11 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_count_io_errors(struct cache *, int, const char *); +void bch_count_io_errors(struct cache *, blk_status_t, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - int, const char *); -void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); + blk_status_t, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, + const char *); void bch_bbio_free(struct bio *, struct cache_set *); struct bio *bch_bbio_alloc(struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 450d0e848ae4..866dcf78ff8e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -307,7 +307,7 @@ static void bch_btree_node_read(struct btree *b) bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); bch_bbio_free(bio, b->c); @@ -374,10 +374,10 @@ static void btree_node_write_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io); - if (bio->bi_error) + if (bio->bi_status) set_btree_node_io_error(b); - bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree"); + bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); closure_put(cl); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index db45a88c0ce9..6a9b85095e7b 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -50,7 +50,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ -void bch_count_io_errors(struct cache *ca, int error, const char *m) +void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) { /* * The halflife of an error is: @@ -103,7 +103,7 @@ void bch_count_io_errors(struct cache *ca, int error, const char *m) } void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct bbio *b = container_of(bio, struct bbio, bio); struct cache *ca = PTR_CACHE(c, &b->key, 0); @@ -132,7 +132,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, } void bch_bbio_endio(struct cache_set *c, struct bio *bio, - int error, const char *m) + blk_status_t error, const char *m) { struct closure *cl = bio->bi_private; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1198e53d5670..0352d05e495c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -549,7 +549,7 @@ static void journal_write_endio(struct bio *bio) { struct journal_write *w = bio->bi_private; - cache_set_err_on(bio->bi_error, w->c, "journal io error"); + cache_set_err_on(bio->bi_status, w->c, "journal io error"); closure_put(&w->c->journal.io); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 13b8a907006d..f633b30c962e 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -63,14 +63,14 @@ static void read_moving_endio(struct bio *bio) struct moving_io *io = container_of(bio->bi_private, struct moving_io, cl); - if (bio->bi_error) - io->op.error = bio->bi_error; + if (bio->bi_status) + io->op.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(io->op.c, &b->key, 0)) { - io->op.error = -EINTR; + io->op.status = BLK_STS_IOERR; } - bch_bbio_endio(io->op.c, bio, bio->bi_error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, bio->bi_status, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -92,7 +92,7 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(cl, struct moving_io, cl); struct data_insert_op *op = &io->op; - if (!op->error) { + if (!op->status) { moving_init(io); io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 709c9cc34369..019b3df9f1c6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -81,7 +81,7 @@ static void bch_data_insert_keys(struct closure *cl) if (ret == -ESRCH) { op->replace_collision = true; } else if (ret) { - op->error = -ENOMEM; + op->status = BLK_STS_RESOURCE; op->insert_data_done = true; } @@ -178,17 +178,17 @@ static void bch_data_insert_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - if (bio->bi_error) { + if (bio->bi_status) { /* TODO: We could try to recover from this. */ if (op->writeback) - op->error = bio->bi_error; + op->status = bio->bi_status; else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, op->wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, bio->bi_error, "writing data to cache"); + bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) @@ -488,15 +488,15 @@ static void bch_cache_read_endio(struct bio *bio) * from the backing device. */ - if (bio->bi_error) - s->iop.error = bio->bi_error; + if (bio->bi_status) + s->iop.status = bio->bi_status; else if (!KEY_DIRTY(&b->key) && ptr_stale(s->iop.c, &b->key, 0)) { atomic_long_inc(&s->iop.c->cache_read_races); - s->iop.error = -EINTR; + s->iop.status = BLK_STS_IOERR; } - bch_bbio_endio(s->iop.c, bio, bio->bi_error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache"); } /* @@ -593,9 +593,9 @@ static void request_endio(struct bio *bio) { struct closure *cl = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); - s->iop.error = bio->bi_error; + s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -611,7 +611,7 @@ static void bio_complete(struct search *s) &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); - s->orig_bio->bi_error = s->iop.error; + s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); s->orig_bio = NULL; } @@ -664,7 +664,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.inode = d->id; s->iop.write_point = hash_long((unsigned long) current, 16); s->iop.write_prio = 0; - s->iop.error = 0; + s->iop.status = 0; s->iop.flags = 0; s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; @@ -707,7 +707,7 @@ static void cached_dev_read_error(struct closure *cl) /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->iop.error = 0; + s->iop.status = 0; do_bio_hook(s, s->orig_bio); /* XXX: invalidate cache */ @@ -767,7 +767,7 @@ static void cached_dev_read_done_bh(struct closure *cl) !s->cache_miss, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->iop.error) + if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->iop.bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 1ff36875c2b3..7689176951ce 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -10,7 +10,7 @@ struct data_insert_op { unsigned inode; uint16_t write_point; uint16_t write_prio; - short error; + blk_status_t status; union { uint16_t flags; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e57353e39168..fbc4f5412dec 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -271,7 +271,7 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_error, "writing superblock"); + bch_count_io_errors(ca, bio->bi_status, "writing superblock"); closure_put(&ca->set->sb_write); } @@ -321,7 +321,7 @@ static void uuid_endio(struct bio *bio) struct closure *cl = bio->bi_private; struct cache_set *c = container_of(cl, struct cache_set, uuid_write); - cache_set_err_on(bio->bi_error, c, "accessing uuids"); + cache_set_err_on(bio->bi_status, c, "accessing uuids"); bch_bbio_free(bio, c); closure_put(cl); } @@ -494,7 +494,7 @@ static void prio_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - cache_set_err_on(bio->bi_error, ca->set, "accessing priorities"); + cache_set_err_on(bio->bi_status, ca->set, "accessing priorities"); bch_bbio_free(bio, ca->set); closure_put(&ca->prio); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ac2e48b9235..42c66e76f05e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -167,7 +167,7 @@ static void dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; - if (bio->bi_error) + if (bio->bi_status) SET_KEY_DIRTY(&w->key, false); closure_put(&io->cl); @@ -195,7 +195,7 @@ static void read_dirty_endio(struct bio *bio) struct dirty_io *io = w->private; bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - bio->bi_error, "reading dirty data from cache"); + bio->bi_status, "reading dirty data from cache"); dirty_endio(bio); } diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index ae7da2c30a57..82d27384d31f 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -229,7 +229,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error) + struct dm_bio_prison_cell *cell, blk_status_t error) { struct bio_list bios; struct bio *bio; @@ -238,7 +238,7 @@ void dm_cell_error(struct dm_bio_prison *prison, dm_cell_release(prison, cell, &bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h index cddd4ac07e2c..cec52ac5e1ae 100644 --- a/drivers/md/dm-bio-prison-v1.h +++ b/drivers/md/dm-bio-prison-v1.h @@ -91,7 +91,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell, int error); + struct dm_bio_prison_cell *cell, blk_status_t error); /* * Visits the cell and then releases. Guarantees no new inmates are diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cd8139593ccd..0902d2fd1743 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -145,8 +145,8 @@ struct dm_buffer { enum data_mode data_mode; unsigned char list_mode; /* LIST_* */ unsigned hold_count; - int read_error; - int write_error; + blk_status_t read_error; + blk_status_t write_error; unsigned long state; unsigned long last_accessed; struct dm_bufio_client *c; @@ -555,7 +555,7 @@ static void dmio_complete(unsigned long error, void *context) { struct dm_buffer *b = context; - b->bio.bi_error = error ? -EIO : 0; + b->bio.bi_status = error ? BLK_STS_IOERR : 0; b->bio.bi_end_io(&b->bio); } @@ -588,7 +588,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, r = dm_io(&io_req, 1, ®ion, NULL); if (r) { - b->bio.bi_error = r; + b->bio.bi_status = errno_to_blk_status(r); end_io(&b->bio); } } @@ -596,7 +596,7 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t sector, static void inline_endio(struct bio *bio) { bio_end_io_t *end_fn = bio->bi_private; - int error = bio->bi_error; + blk_status_t status = bio->bi_status; /* * Reset the bio to free any attached resources @@ -604,7 +604,7 @@ static void inline_endio(struct bio *bio) */ bio_reset(bio); - bio->bi_error = error; + bio->bi_status = status; end_fn(bio); } @@ -685,11 +685,12 @@ static void write_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->write_error = bio->bi_error; - if (unlikely(bio->bi_error)) { + b->write_error = bio->bi_status; + if (unlikely(bio->bi_status)) { struct dm_bufio_client *c = b->c; - int error = bio->bi_error; - (void)cmpxchg(&c->async_write_error, 0, error); + + (void)cmpxchg(&c->async_write_error, 0, + blk_status_to_errno(bio->bi_status)); } BUG_ON(!test_bit(B_WRITING, &b->state)); @@ -1063,7 +1064,7 @@ static void read_endio(struct bio *bio) { struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); - b->read_error = bio->bi_error; + b->read_error = bio->bi_status; BUG_ON(!test_bit(B_READING, &b->state)); @@ -1107,7 +1108,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); if (b->read_error) { - int error = b->read_error; + int error = blk_status_to_errno(b->read_error); dm_bufio_release(b); @@ -1257,7 +1258,8 @@ EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); */ int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c) { - int a, f; + blk_status_t a; + int f; unsigned long buffers_processed = 0; struct dm_buffer *b, *tmp; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index c48612e6d525..c5ea03fc7ee1 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -119,7 +119,7 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) */ struct continuation { struct work_struct ws; - int input; + blk_status_t input; }; static inline void init_continuation(struct continuation *k, @@ -145,7 +145,7 @@ struct batcher { /* * The operation that everyone is waiting for. */ - int (*commit_op)(void *context); + blk_status_t (*commit_op)(void *context); void *commit_context; /* @@ -171,8 +171,7 @@ struct batcher { static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); - - int r; + blk_status_t r; unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; @@ -205,7 +204,7 @@ static void __commit(struct work_struct *_ws) while ((bio = bio_list_pop(&bios))) { if (r) { - bio->bi_error = r; + bio->bi_status = r; bio_endio(bio); } else b->issue_op(bio, b->issue_context); @@ -213,7 +212,7 @@ static void __commit(struct work_struct *_ws) } static void batcher_init(struct batcher *b, - int (*commit_op)(void *), + blk_status_t (*commit_op)(void *), void *commit_context, void (*issue_op)(struct bio *bio, void *), void *issue_context, @@ -955,7 +954,7 @@ static void writethrough_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) { + if (bio->bi_status) { bio_endio(bio); return; } @@ -1220,7 +1219,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); if (read_err || write_err) - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; queue_continuation(mg->cache->wq, &mg->k); } @@ -1266,8 +1265,8 @@ static void overwrite_endio(struct bio *bio) dm_unhook_bio(&pb->hook_info, bio); - if (bio->bi_error) - mg->k.input = bio->bi_error; + if (bio->bi_status) + mg->k.input = bio->bi_status; queue_continuation(mg->cache->wq, &mg->k); } @@ -1323,8 +1322,10 @@ static void mg_complete(struct dm_cache_migration *mg, bool success) if (mg->overwrite_bio) { if (success) force_set_dirty(cache, cblock); + else if (mg->k.input) + mg->overwrite_bio->bi_status = mg->k.input; else - mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + mg->overwrite_bio->bi_status = BLK_STS_IOERR; bio_endio(mg->overwrite_bio); } else { if (success) @@ -1504,7 +1505,7 @@ static void mg_copy(struct work_struct *ws) r = copy(mg, is_policy_promote); if (r) { DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); - mg->k.input = -EIO; + mg->k.input = BLK_STS_IOERR; mg_complete(mg, false); } } @@ -1907,12 +1908,12 @@ static int commit(struct cache *cache, bool clean_shutdown) /* * Used by the batcher. */ -static int commit_op(void *context) +static blk_status_t commit_op(void *context) { struct cache *cache = context; if (dm_cache_changed_this_transaction(cache->cmd)) - return commit(cache, false); + return errno_to_blk_status(commit(cache, false)); return 0; } @@ -2018,7 +2019,7 @@ static void requeue_deferred_bios(struct cache *cache) bio_list_init(&cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); } } @@ -2820,7 +2821,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } -static int cache_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int cache_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct cache *cache = ti->private; unsigned long flags; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f4b51809db21..586cef085c6a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -71,7 +71,7 @@ struct dm_crypt_io { struct convert_context ctx; atomic_t io_pending; - int error; + blk_status_t error; sector_t sector; struct rb_node rb_node; @@ -1292,7 +1292,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ -static int crypt_convert(struct crypt_config *cc, +static blk_status_t crypt_convert(struct crypt_config *cc, struct convert_context *ctx) { unsigned int tag_offset = 0; @@ -1343,13 +1343,13 @@ static int crypt_convert(struct crypt_config *cc, */ case -EBADMSG: atomic_dec(&ctx->cc_pending); - return -EILSEQ; + return BLK_STS_PROTECTION; /* * There was an error while processing the request. */ default: atomic_dec(&ctx->cc_pending); - return -EIO; + return BLK_STS_IOERR; } } @@ -1463,7 +1463,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; struct bio *base_bio = io->base_bio; - int error = io->error; + blk_status_t error = io->error; if (!atomic_dec_and_test(&io->io_pending)) return; @@ -1476,7 +1476,7 @@ static void crypt_dec_pending(struct dm_crypt_io *io) else kfree(io->integrity_metadata); - base_bio->bi_error = error; + base_bio->bi_status = error; bio_endio(base_bio); } @@ -1502,7 +1502,7 @@ static void crypt_endio(struct bio *clone) struct dm_crypt_io *io = clone->bi_private; struct crypt_config *cc = io->cc; unsigned rw = bio_data_dir(clone); - int error; + blk_status_t error; /* * free the processed pages @@ -1510,7 +1510,7 @@ static void crypt_endio(struct bio *clone) if (rw == WRITE) crypt_free_buffer_pages(cc, clone); - error = clone->bi_error; + error = clone->bi_status; bio_put(clone); if (rw == READ && !error) { @@ -1570,7 +1570,7 @@ static void kcryptd_io_read_work(struct work_struct *work) crypt_inc_pending(io); if (kcryptd_io_read(io, GFP_NOIO)) - io->error = -ENOMEM; + io->error = BLK_STS_RESOURCE; crypt_dec_pending(io); } @@ -1656,7 +1656,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) sector_t sector; struct rb_node **rbp, *parent; - if (unlikely(io->error < 0)) { + if (unlikely(io->error)) { crypt_free_buffer_pages(cc, clone); bio_put(clone); crypt_dec_pending(io); @@ -1697,7 +1697,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) struct bio *clone; int crypt_finished; sector_t sector = io->sector; - int r; + blk_status_t r; /* * Prevent io from disappearing until this function completes. @@ -1707,7 +1707,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { - io->error = -EIO; + io->error = BLK_STS_IOERR; goto dec; } @@ -1718,7 +1718,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); @@ -1740,7 +1740,7 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io) static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; - int r = 0; + blk_status_t r; crypt_inc_pending(io); @@ -1748,7 +1748,7 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) io->sector); r = crypt_convert(cc, &io->ctx); - if (r < 0) + if (r) io->error = r; if (atomic_dec_and_test(&io->ctx.cc_pending)) @@ -1781,9 +1781,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error == -EBADMSG) { DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); - io->error = -EILSEQ; + io->error = BLK_STS_PROTECTION; } else if (error < 0) - io->error = -EIO; + io->error = BLK_STS_IOERR; crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index c9539917a59b..3d04d5ce19d9 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -358,7 +358,8 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int flakey_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int flakey_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); @@ -377,7 +378,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int *error) * Error read during the down_interval if drop_writes * and error_writes were not configured. */ - *error = -EIO; + *error = BLK_STS_IOERR; } } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee78fb471229..ccc6ef4d00b9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -246,7 +246,7 @@ struct dm_integrity_io { unsigned metadata_offset; atomic_t in_flight; - int bi_error; + blk_status_t bi_status; struct completion *completion; @@ -1114,8 +1114,8 @@ static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io * static void do_endio(struct dm_integrity_c *ic, struct bio *bio) { int r = dm_integrity_failed(ic); - if (unlikely(r) && !bio->bi_error) - bio->bi_error = r; + if (unlikely(r) && !bio->bi_status) + bio->bi_status = errno_to_blk_status(r); bio_endio(bio); } @@ -1123,7 +1123,7 @@ static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *di { struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + if (unlikely(dio->fua) && likely(!bio->bi_status) && likely(!dm_integrity_failed(ic))) submit_flush_bio(ic, dio); else do_endio(ic, bio); @@ -1142,9 +1142,9 @@ static void dec_in_flight(struct dm_integrity_io *dio) bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); - if (unlikely(dio->bi_error) && !bio->bi_error) - bio->bi_error = dio->bi_error; - if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + if (unlikely(dio->bi_status) && !bio->bi_status) + bio->bi_status = dio->bi_status; + if (likely(!bio->bi_status) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { dio->range.logical_sector += dio->range.n_sectors; bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); INIT_WORK(&dio->work, integrity_bio_wait); @@ -1318,7 +1318,7 @@ skip_io: dec_in_flight(dio); return; error: - dio->bi_error = r; + dio->bi_status = errno_to_blk_status(r); dec_in_flight(dio); } @@ -1331,7 +1331,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) sector_t area, offset; dio->ic = ic; - dio->bi_error = 0; + dio->bi_status = 0; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { submit_flush_bio(ic, dio); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3702e502466d..c8f8f3004085 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -124,7 +124,7 @@ static void complete_io(struct io *io) fn(error_bits, context); } -static void dec_count(struct io *io, unsigned int region, int error) +static void dec_count(struct io *io, unsigned int region, blk_status_t error) { if (error) set_bit(region, &io->error_bits); @@ -137,9 +137,9 @@ static void endio(struct bio *bio) { struct io *io; unsigned region; - int error; + blk_status_t error; - if (bio->bi_error && bio_data_dir(bio) == READ) + if (bio->bi_status && bio_data_dir(bio) == READ) zero_fill_bio(bio); /* @@ -147,7 +147,7 @@ static void endio(struct bio *bio) */ retrieve_io_and_region_from_bio(bio, &io, ®ion); - error = bio->bi_error; + error = bio->bi_status; bio_put(bio); dec_count(io, region, error); @@ -319,7 +319,7 @@ static void do_region(int op, int op_flags, unsigned region, if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { - dec_count(io, region, -EOPNOTSUPP); + dec_count(io, region, BLK_STS_NOTSUPP); return; } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index cc57c7fa1268..a1da0eb58a93 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -150,10 +150,10 @@ static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { unsigned long flags; - DMERR("Error writing log block, error=%d", bio->bi_error); + DMERR("Error writing log block, error=%d", bio->bi_status); spin_lock_irqsave(&lc->blocks_lock, flags); lc->logging_enabled = false; spin_unlock_irqrestore(&lc->blocks_lock, flags); @@ -664,7 +664,8 @@ map_bio: return DM_MAPIO_REMAPPED; } -static int normal_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int normal_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct log_writes_c *lc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 39262e344ae1..a7d2e0840cc5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -565,7 +565,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; - bio->bi_error = 0; + bio->bi_status = 0; bio->bi_bdev = pgpath->path.dev->bdev; bio->bi_opf |= REQ_FAILFAST_TRANSPORT; @@ -623,10 +623,10 @@ static void process_queued_bios(struct work_struct *work) r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); switch (r) { case DM_MAPIO_KILL: - r = -EIO; - /*FALLTHRU*/ + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); case DM_MAPIO_REQUEUE: - bio->bi_error = r; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); break; case DM_MAPIO_REMAPPED: @@ -1510,7 +1510,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, return r; } -static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *error) +static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, + blk_status_t *error) { struct multipath *m = ti->private; struct dm_mpath_io *mpio = get_mpio_from_bio(clone); @@ -1518,7 +1519,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *er unsigned long flags; int r = DM_ENDIO_DONE; - if (!*error || noretry_error(errno_to_blk_status(*error))) + if (!*error || noretry_error(*error)) goto done; if (pgpath) @@ -1527,7 +1528,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, int *er if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { dm_report_EIO(m); - *error = -EIO; + *error = BLK_STS_IOERR; goto done; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 77bcf50ce75f..0822e4a6f67d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -490,9 +490,9 @@ static void hold_bio(struct mirror_set *ms, struct bio *bio) * If device is suspended, complete the bio. */ if (dm_noflush_suspending(ms->ti)) - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; else - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; @@ -626,7 +626,7 @@ static void write_callback(unsigned long error, void *context) * degrade the array. */ if (bio_op(bio) == REQ_OP_DISCARD) { - bio->bi_error = -EOPNOTSUPP; + bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); return; } @@ -1236,7 +1236,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } -static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int mirror_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { int rw = bio_data_dir(bio); struct mirror_set *ms = (struct mirror_set *) ti->private; @@ -1255,7 +1256,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error) return DM_ENDIO_DONE; } - if (*error == -EOPNOTSUPP) + if (*error == BLK_STS_NOTSUPP) return DM_ENDIO_DONE; if (bio->bi_opf & REQ_RAHEAD) @@ -1277,7 +1278,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int *error) bd = &bio_record->details; dm_bio_restore(bd, bio); - bio->bi_error = 0; + bio->bi_status = 0; queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 63402f8a38de..fafd5326e572 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -119,7 +119,7 @@ static void end_clone_bio(struct bio *clone) struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; unsigned int nr_bytes = info->orig->bi_iter.bi_size; - blk_status_t error = errno_to_blk_status(clone->bi_error); + blk_status_t error = clone->bi_status; bio_put(clone); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 79a845798e2f..1ba41048b438 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1590,7 +1590,7 @@ static void full_bio_end_io(struct bio *bio) { void *callback_data = bio->bi_private; - dm_kcopyd_do_callback(callback_data, 0, bio->bi_error ? 1 : 0); + dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0); } static void start_full_bio(struct dm_snap_pending_exception *pe, @@ -1851,7 +1851,8 @@ out_unlock: return r; } -static int snapshot_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int snapshot_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { struct dm_snapshot *s = ti->private; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 49888bc2c909..11621a0af887 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -375,7 +375,8 @@ static void stripe_status(struct dm_target *ti, status_type_t type, } } -static int stripe_end_io(struct dm_target *ti, struct bio *bio, int *error) +static int stripe_end_io(struct dm_target *ti, struct bio *bio, + blk_status_t *error) { unsigned i; char major_minor[16]; @@ -387,7 +388,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, int *error) if (bio->bi_opf & REQ_RAHEAD) return DM_ENDIO_DONE; - if (*error == -EOPNOTSUPP) + if (*error == BLK_STS_NOTSUPP) return DM_ENDIO_DONE; memset(major_minor, 0, sizeof(major_minor)); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 22b1a64c44b7..3490b300cbff 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -383,8 +383,8 @@ static void end_discard(struct discard_op *op, int r) * Even if r is set, there could be sub discards in flight that we * need to wait for. */ - if (r && !op->parent_bio->bi_error) - op->parent_bio->bi_error = r; + if (r && !op->parent_bio->bi_status) + op->parent_bio->bi_status = errno_to_blk_status(r); bio_endio(op->parent_bio); } @@ -450,22 +450,20 @@ static void cell_release_no_holder(struct pool *pool, } static void cell_error_with_code(struct pool *pool, - struct dm_bio_prison_cell *cell, int error_code) + struct dm_bio_prison_cell *cell, blk_status_t error_code) { dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } -static int get_pool_io_error_code(struct pool *pool) +static blk_status_t get_pool_io_error_code(struct pool *pool) { - return pool->out_of_data_space ? -ENOSPC : -EIO; + return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR; } static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - int error = get_pool_io_error_code(pool); - - cell_error_with_code(pool, cell, error); + cell_error_with_code(pool, cell, get_pool_io_error_code(pool)); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -475,7 +473,7 @@ static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); + cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE); } /*----------------------------------------------------------------*/ @@ -555,17 +553,18 @@ static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) bio_list_init(master); } -static void error_bio_list(struct bio_list *bios, int error) +static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; while ((bio = bio_list_pop(bios))) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } } -static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, + blk_status_t error) { struct bio_list bios; unsigned long flags; @@ -608,11 +607,11 @@ static void requeue_io(struct thin_c *tc) __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - error_bio_list(&bios, DM_ENDIO_REQUEUE); + error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); } -static void error_retry_list_with_code(struct pool *pool, int error) +static void error_retry_list_with_code(struct pool *pool, blk_status_t error) { struct thin_c *tc; @@ -624,9 +623,7 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - int error = get_pool_io_error_code(pool); - - error_retry_list_with_code(pool, error); + error_retry_list_with_code(pool, get_pool_io_error_code(pool)); } /* @@ -774,7 +771,7 @@ struct dm_thin_new_mapping { */ atomic_t prepare_actions; - int err; + blk_status_t status; struct thin_c *tc; dm_block_t virt_begin, virt_end; dm_block_t data_block; @@ -814,7 +811,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) { struct dm_thin_new_mapping *m = context; - m->err = read_err || write_err ? -EIO : 0; + m->status = read_err || write_err ? BLK_STS_IOERR : 0; complete_mapping_preparation(m); } @@ -825,7 +822,7 @@ static void overwrite_endio(struct bio *bio) bio->bi_end_io = m->saved_bi_end_io; - m->err = bio->bi_error; + m->status = bio->bi_status; complete_mapping_preparation(m); } @@ -925,7 +922,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) struct bio *bio = m->bio; int r; - if (m->err) { + if (m->status) { cell_error(pool, m->cell); goto out; } @@ -1495,7 +1492,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static int should_error_unserviceable_bio(struct pool *pool) +static blk_status_t should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1503,27 +1500,27 @@ static int should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return -EIO; + return BLK_STS_IOERR; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space ? -ENOSPC : 0; + return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return -EIO; + return BLK_STS_IOERR; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return -EIO; + return BLK_STS_IOERR; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - int error = should_error_unserviceable_bio(pool); + blk_status_t error = should_error_unserviceable_bio(pool); if (error) { - bio->bi_error = error; + bio->bi_status = error; bio_endio(bio); } else retry_on_resume(bio); @@ -1533,7 +1530,7 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; - int error; + blk_status_t error; error = should_error_unserviceable_bio(pool); if (error) { @@ -2071,7 +2068,8 @@ static void process_thin_deferred_bios(struct thin_c *tc) unsigned count = 0; if (tc->requeue_mode) { - error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); + error_thin_bio_list(tc, &tc->deferred_bio_list, + BLK_STS_DM_REQUEUE); return; } @@ -2322,7 +2320,7 @@ static void do_no_space_timeout(struct work_struct *ws) if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { pool->pf.error_if_no_space = true; notify_of_pool_mode_change_to_oods(pool); - error_retry_list_with_code(pool, -ENOSPC); + error_retry_list_with_code(pool, BLK_STS_NOSPC); } } @@ -2624,7 +2622,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); if (tc->requeue_mode) { - bio->bi_error = DM_ENDIO_REQUEUE; + bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -4177,7 +4175,8 @@ static int thin_map(struct dm_target *ti, struct bio *bio) return thin_bio_map(ti, bio); } -static int thin_endio(struct dm_target *ti, struct bio *bio, int *err) +static int thin_endio(struct dm_target *ti, struct bio *bio, + blk_status_t *err) { unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 9ed55468b98b..2dca66eb67e1 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -538,13 +538,13 @@ static int verity_verify_io(struct dm_verity_io *io) /* * End one "io" structure with a given error. */ -static void verity_finish_io(struct dm_verity_io *io, int error) +static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) { struct dm_verity *v = io->v; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; - bio->bi_error = error; + bio->bi_status = status; verity_fec_finish_io(io); @@ -555,15 +555,15 @@ static void verity_work(struct work_struct *w) { struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); - verity_finish_io(io, verity_verify_io(io)); + verity_finish_io(io, errno_to_blk_status(verity_verify_io(io))); } static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_error && !verity_fec_is_enabled(io->v)) { - verity_finish_io(io, bio->bi_error); + if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + verity_finish_io(io, bio->bi_status); return; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7a7047211c64..f38f9dd5cbdd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -63,7 +63,7 @@ static struct workqueue_struct *deferred_remove_workqueue; */ struct dm_io { struct mapped_device *md; - int error; + blk_status_t status; atomic_t io_count; struct bio *bio; unsigned long start_time; @@ -768,23 +768,24 @@ static int __noflush_suspending(struct mapped_device *md) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -static void dec_pending(struct dm_io *io, int error) +static void dec_pending(struct dm_io *io, blk_status_t error) { unsigned long flags; - int io_error; + blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->error > 0 && __noflush_suspending(md))) - io->error = error; + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(md))) + io->status = error; spin_unlock_irqrestore(&io->endio_lock, flags); } if (atomic_dec_and_test(&io->io_count)) { - if (io->error == DM_ENDIO_REQUEUE) { + if (io->status == BLK_STS_DM_REQUEUE) { /* * Target requested pushing back the I/O. */ @@ -793,16 +794,16 @@ static void dec_pending(struct dm_io *io, int error) bio_list_add_head(&md->deferred, io->bio); else /* noflush suspend was interrupted. */ - io->error = -EIO; + io->status = BLK_STS_IOERR; spin_unlock_irqrestore(&md->deferred_lock, flags); } - io_error = io->error; + io_error = io->status; bio = io->bio; end_io_acct(io); free_io(md, io); - if (io_error == DM_ENDIO_REQUEUE) + if (io_error == BLK_STS_DM_REQUEUE) return; if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { @@ -814,7 +815,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - bio->bi_error = io_error; + bio->bi_status = io_error; bio_endio(bio); } } @@ -838,14 +839,13 @@ void disable_write_zeroes(struct mapped_device *md) static void clone_endio(struct bio *bio) { - int error = bio->bi_error; - int r = error; + blk_status_t error = bio->bi_status; struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (unlikely(error == -EREMOTEIO)) { + if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_WRITE_SAME && !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) disable_write_same(md); @@ -855,10 +855,10 @@ static void clone_endio(struct bio *bio) } if (endio) { - r = endio(tio->ti, bio, &error); + int r = endio(tio->ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - error = DM_ENDIO_REQUEUE; + error = BLK_STS_DM_REQUEUE; /*FALLTHRU*/ case DM_ENDIO_DONE: break; @@ -1094,11 +1094,11 @@ static void __map_bio(struct dm_target_io *tio) generic_make_request(clone); break; case DM_MAPIO_KILL: - r = -EIO; - /*FALLTHRU*/ + dec_pending(tio->io, BLK_STS_IOERR); + free_tio(tio); + break; case DM_MAPIO_REQUEUE: - /* error the io and bail out, or requeue it if needed */ - dec_pending(tio->io, r); + dec_pending(tio->io, BLK_STS_DM_REQUEUE); free_tio(tio); break; default: @@ -1366,7 +1366,7 @@ static void __split_and_process_bio(struct mapped_device *md, ci.map = map; ci.md = md; ci.io = alloc_io(md); - ci.io->error = 0; + ci.io->status = 0; atomic_set(&ci.io->io_count, 1); ci.io->bio = bio; ci.io->md = md; diff --git a/drivers/md/md.c b/drivers/md/md.c index 10367ffe92e3..6452e83fd650 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -273,7 +273,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) } if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) - bio->bi_error = -EROFS; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } @@ -719,8 +719,8 @@ static void super_written(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; - if (bio->bi_error) { - pr_err("md: super_written gets error=%d\n", bio->bi_error); + if (bio->bi_status) { + pr_err("md: super_written gets error=%d\n", bio->bi_status); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -801,7 +801,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, submit_bio_wait(bio); - ret = !bio->bi_error; + ret = !bio->bi_status; bio_put(bio); return ret; } diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e95d521d93e9..68d036e64041 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -73,12 +73,12 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) +static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) { struct bio *bio = mp_bh->master_bio; struct mpconf *conf = mp_bh->mddev->private; - bio->bi_error = err; + bio->bi_status = status; bio_endio(bio); mempool_free(mp_bh, conf->pool); } @@ -89,7 +89,7 @@ static void multipath_end_request(struct bio *bio) struct mpconf *conf = mp_bh->mddev->private; struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - if (!bio->bi_error) + if (!bio->bi_status) multipath_end_bh_io(mp_bh, 0); else if (!(bio->bi_opf & REQ_RAHEAD)) { /* @@ -102,7 +102,7 @@ static void multipath_end_request(struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else - multipath_end_bh_io(mp_bh, bio->bi_error); + multipath_end_bh_io(mp_bh, bio->bi_status); rdev_dec_pending(rdev, conf->mddev); } @@ -347,7 +347,7 @@ static void multipathd(struct md_thread *thread) pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", bdevname(bio->bi_bdev,b), (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, -EIO); + multipath_end_bh_io(mp_bh, BLK_STS_IOERR); } else { pr_err("multipath: %s: redirecting sector %llu to another IO path\n", bdevname(bio->bi_bdev,b), diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index af5056d56878..94b87c4d0f7b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -277,7 +277,7 @@ static void call_bio_endio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -335,7 +335,7 @@ static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) static void raid1_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = bio->bi_private; struct r1conf *conf = r1_bio->mddev->private; struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; @@ -426,12 +426,12 @@ static void raid1_end_write_request(struct bio *bio) struct md_rdev *rdev = conf->mirrors[mirror].rdev; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -802,7 +802,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1856,7 +1856,7 @@ static void end_sync_read(struct bio *bio) * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R1BIO_Uptodate, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) @@ -1865,7 +1865,7 @@ static void end_sync_read(struct bio *bio) static void end_sync_write(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2058,7 +2058,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) idx ++; } set_bit(R1BIO_Uptodate, &r1_bio->state); - bio->bi_error = 0; + bio->bi_status = 0; return 1; } @@ -2082,16 +2082,16 @@ static void process_checks(struct r1bio *r1_bio) for (i = 0; i < conf->raid_disks * 2; i++) { int j; int size; - int error; + blk_status_t status; struct bio_vec *bi; struct bio *b = r1_bio->bios[i]; struct resync_pages *rp = get_resync_pages(b); if (b->bi_end_io != end_sync_read) continue; /* fixup the bio for reuse, but preserve errno */ - error = b->bi_error; + status = b->bi_status; bio_reset(b); - b->bi_error = error; + b->bi_status = status; b->bi_vcnt = vcnt; b->bi_iter.bi_size = r1_bio->sectors << 9; b->bi_iter.bi_sector = r1_bio->sector + @@ -2113,7 +2113,7 @@ static void process_checks(struct r1bio *r1_bio) } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - !r1_bio->bios[primary]->bi_error) { + !r1_bio->bios[primary]->bi_status) { r1_bio->bios[primary]->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[primary].rdev, mddev); break; @@ -2123,7 +2123,7 @@ static void process_checks(struct r1bio *r1_bio) int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int error = sbio->bi_error; + blk_status_t status = sbio->bi_status; struct page **ppages = get_resync_pages(pbio)->pages; struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec *bi; @@ -2132,12 +2132,12 @@ static void process_checks(struct r1bio *r1_bio) if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ - sbio->bi_error = 0; + sbio->bi_status = 0; bio_for_each_segment_all(bi, sbio, j) page_len[j] = bi->bv_len; - if (!error) { + if (!status) { for (j = vcnt; j-- ; ) { if (memcmp(page_address(ppages[j]), page_address(spages[j]), @@ -2149,7 +2149,7 @@ static void process_checks(struct r1bio *r1_bio) if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && !error)) { + && !status)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); @@ -2400,11 +2400,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) continue; - if (!bio->bi_error && + if (!bio->bi_status && test_bit(R1BIO_MadeGood, &r1_bio->state)) { rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } - if (bio->bi_error && + if (bio->bi_status && test_bit(R1BIO_WriteError, &r1_bio->state)) { if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) md_error(conf->mddev, rdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4343d7ff9916..89ad1cd29037 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -336,7 +336,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct r10conf *conf = r10_bio->mddev->private; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); /* @@ -389,7 +389,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, static void raid10_end_read_request(struct bio *bio) { - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; int slot, dev; struct md_rdev *rdev; @@ -477,7 +477,7 @@ static void raid10_end_write_request(struct bio *bio) struct bio *to_put = NULL; bool discard_error; - discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; + discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -491,7 +491,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_error && !discard_error) { + if (bio->bi_status && !discard_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -913,7 +913,7 @@ static void flush_pending_writes(struct r10conf *conf) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1098,7 +1098,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio->bi_next = NULL; bio->bi_bdev = rdev->bdev; if (test_bit(Faulty, &rdev->flags)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) @@ -1888,7 +1888,7 @@ static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) { struct r10conf *conf = r10_bio->mddev->private; - if (!bio->bi_error) + if (!bio->bi_status) set_bit(R10BIO_Uptodate, &r10_bio->state); else /* The write handler will notice the lack of @@ -1972,7 +1972,7 @@ static void end_sync_write(struct bio *bio) else rdev = conf->mirrors[d].rdev; - if (bio->bi_error) { + if (bio->bi_status) { if (repl) md_error(mddev, rdev); else { @@ -2021,7 +2021,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) /* find the first device with a block */ for (i=0; icopies; i++) - if (!r10_bio->devs[i].bio->bi_error) + if (!r10_bio->devs[i].bio->bi_status) break; if (i == conf->copies) @@ -2050,7 +2050,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tpages = get_resync_pages(tbio)->pages; d = r10_bio->devs[i].devnum; rdev = conf->mirrors[d].rdev; - if (!r10_bio->devs[i].bio->bi_error) { + if (!r10_bio->devs[i].bio->bi_status) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. * All vec entries are PAGE_SIZE; @@ -2633,7 +2633,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev = conf->mirrors[dev].rdev; if (r10_bio->devs[m].bio == NULL) continue; - if (!r10_bio->devs[m].bio->bi_error) { + if (!r10_bio->devs[m].bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2649,7 +2649,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (r10_bio->devs[m].repl_bio == NULL) continue; - if (!r10_bio->devs[m].repl_bio->bi_error) { + if (!r10_bio->devs[m].repl_bio->bi_status) { rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, @@ -2675,7 +2675,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->devs[m].addr, r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); - } else if (bio != NULL && bio->bi_error) { + } else if (bio != NULL && bio->bi_status) { fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); @@ -3267,7 +3267,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; rcu_read_lock(); rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { @@ -3309,7 +3309,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; sector = r10_bio->devs[i].addr; bio->bi_next = biolist; @@ -3375,7 +3375,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct(bio->bi_bdev, nr_sectors); - bio->bi_error = 0; + bio->bi_status = 0; generic_make_request(bio); } } @@ -4394,7 +4394,7 @@ read_more: read_bio->bi_end_io = end_reshape_read; bio_set_op_attrs(read_bio, REQ_OP_READ, 0); read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); - read_bio->bi_error = 0; + read_bio->bi_status = 0; read_bio->bi_vcnt = 0; read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; @@ -4638,7 +4638,7 @@ static void end_reshape_write(struct bio *bio) rdev = conf->mirrors[d].rdev; } - if (bio->bi_error) { + if (bio->bi_status) { /* FIXME should record badblock */ md_error(mddev, rdev); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 4c00bc248287..3ed6a0d89db8 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -572,7 +572,7 @@ static void r5l_log_endio(struct bio *bio) struct r5l_log *log = io->log; unsigned long flags; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); bio_put(bio); @@ -1247,7 +1247,7 @@ static void r5l_log_flush_endio(struct bio *bio) unsigned long flags; struct r5l_io_unit *io; - if (bio->bi_error) + if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); spin_lock_irqsave(&log->io_list_lock, flags); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 5d25bebf3328..09e04be34e5f 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -397,7 +397,7 @@ static void ppl_log_endio(struct bio *bio) pr_debug("%s: seq: %llu\n", __func__, io->seq); - if (bio->bi_error) + if (bio->bi_status) md_error(ppl_conf->mddev, log->rdev); list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9c4f7659f8b1..e1bdc320f664 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2476,7 +2476,7 @@ static void raid5_end_read_request(struct bio * bi) pr_debug("end_read_request %llu/%d, count: %d, error %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2496,7 +2496,7 @@ static void raid5_end_read_request(struct bio * bi) s = sh->sector + rdev->new_data_offset; else s = sh->sector + rdev->data_offset; - if (!bi->bi_error) { + if (!bi->bi_status) { set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { /* Note that this cannot happen on a @@ -2613,7 +2613,7 @@ static void raid5_end_write_request(struct bio *bi) } pr_debug("end_write_request %llu/%d, count %d, error: %d.\n", (unsigned long long)sh->sector, i, atomic_read(&sh->count), - bi->bi_error); + bi->bi_status); if (i == disks) { bio_reset(bi); BUG(); @@ -2621,14 +2621,14 @@ static void raid5_end_write_request(struct bio *bi) } if (replacement) { - if (bi->bi_error) + if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { - if (bi->bi_error) { + if (bi->bi_status) { set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); @@ -2649,7 +2649,7 @@ static void raid5_end_write_request(struct bio *bi) } rdev_dec_pending(rdev, conf->mddev); - if (sh->batch_head && bi->bi_error && !replacement) + if (sh->batch_head && bi->bi_status && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); bio_reset(bi); @@ -3381,7 +3381,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = nextbi; @@ -3403,7 +3403,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; md_write_end(conf->mddev); bio_endio(bi); bi = bi2; @@ -3429,7 +3429,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; bio_endio(bi); bi = nextbi; } @@ -5144,7 +5144,7 @@ static void raid5_align_endio(struct bio *bi) struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; - int error = bi->bi_error; + blk_status_t error = bi->bi_status; bio_put(bi); @@ -5721,7 +5721,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) release_stripe_plug(mddev, sh); } else { /* cannot get stripe for read-ahead, just give-up */ - bi->bi_error = -EIO; + bi->bi_status = BLK_STS_IOERR; break; } } diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 822198a75e96..79eb9fb358d5 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -186,7 +186,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. */ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -205,7 +205,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) "io error in %s sector %lld, len %d,\n", (rw == READ) ? "READ" : "WRITE", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 983718b8fd9b..31b2d14e210d 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1210,7 +1210,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) * another kernel subsystem, and we just pass it through. */ if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; goto out; } @@ -1232,7 +1232,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) (op_is_write(bio_op(bio))) ? "WRITE" : "READ", (unsigned long long) iter.bi_sector, len); - bio->bi_error = err; + bio->bi_status = errno_to_blk_status(err); break; } } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index c544d466ea51..7bd383aeea14 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -49,19 +49,19 @@ static struct nd_region *to_region(struct pmem_device *pmem) return to_nd_region(to_dev(pmem)->parent); } -static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, - unsigned int len) +static blk_status_t pmem_clear_poison(struct pmem_device *pmem, + phys_addr_t offset, unsigned int len) { struct device *dev = to_dev(pmem); sector_t sector; long cleared; - int rc = 0; + blk_status_t rc = BLK_STS_OK; sector = (offset - pmem->data_offset) / 512; cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len); if (cleared < len) - rc = -EIO; + rc = BLK_STS_IOERR; if (cleared > 0 && cleared / 512) { cleared /= 512; dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__, @@ -84,7 +84,7 @@ static void write_pmem(void *pmem_addr, struct page *page, kunmap_atomic(mem); } -static int read_pmem(struct page *page, unsigned int off, +static blk_status_t read_pmem(struct page *page, unsigned int off, void *pmem_addr, unsigned int len) { int rc; @@ -93,15 +93,15 @@ static int read_pmem(struct page *page, unsigned int off, rc = memcpy_mcsafe(mem + off, pmem_addr, len); kunmap_atomic(mem); if (rc) - return -EIO; - return 0; + return BLK_STS_IOERR; + return BLK_STS_OK; } -static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, +static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector) { - int rc = 0; + blk_status_t rc = BLK_STS_OK; bool bad_pmem = false; phys_addr_t pmem_off = sector * 512 + pmem->data_offset; void *pmem_addr = pmem->virt_addr + pmem_off; @@ -111,7 +111,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (!is_write) { if (unlikely(bad_pmem)) - rc = -EIO; + rc = BLK_STS_IOERR; else { rc = read_pmem(page, off, pmem_addr, len); flush_dcache_page(page); @@ -149,7 +149,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) { - int rc = 0; + blk_status_t rc = 0; bool do_acct; unsigned long start; struct bio_vec bvec; @@ -166,7 +166,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) bvec.bv_offset, op_is_write(bio_op(bio)), iter.bi_sector); if (rc) { - bio->bi_error = rc; + bio->bi_status = rc; break; } } @@ -184,7 +184,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, struct page *page, bool is_write) { struct pmem_device *pmem = bdev->bd_queue->queuedata; - int rc; + blk_status_t rc; rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector); @@ -197,7 +197,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, if (rc == 0) page_endio(page, is_write, 0); - return rc; + return blk_status_to_errno(rc); } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index c77940d80fc8..40128793e613 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -21,7 +21,7 @@ static void nvmet_bio_done(struct bio *bio) struct nvmet_req *req = bio->bi_private; nvmet_req_complete(req, - bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); if (bio != &req->inline_bio) bio_put(bio); @@ -145,7 +145,7 @@ static void nvmet_execute_discard(struct nvmet_req *req) bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; if (status) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else { submit_bio(bio); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index bb069ebe4aa6..75373624604b 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -296,8 +296,8 @@ static void iblock_bio_done(struct bio *bio) struct se_cmd *cmd = bio->bi_private; struct iblock_req *ibr = cmd->priv; - if (bio->bi_error) { - pr_err("bio error: %p, err: %d\n", bio, bio->bi_error); + if (bio->bi_status) { + pr_err("bio error: %p, err: %d\n", bio, bio->bi_status); /* * Bump the ib_bio_err_cnt and release bio. */ @@ -354,11 +354,11 @@ static void iblock_end_io_flush(struct bio *bio) { struct se_cmd *cmd = bio->bi_private; - if (bio->bi_error) - pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_error); + if (bio->bi_status) + pr_err("IBLOCK: cache flush failed: %d\n", bio->bi_status); if (cmd) { - if (bio->bi_error) + if (bio->bi_status) target_complete_cmd(cmd, SAM_STAT_CHECK_CONDITION); else target_complete_cmd(cmd, SAM_STAT_GOOD); diff --git a/fs/block_dev.c b/fs/block_dev.c index c1dc393ad6b9..bcd8e16a34e1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -262,8 +262,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (vecs != inline_vecs) kfree(vecs); - if (unlikely(bio.bi_error)) - return bio.bi_error; + if (unlikely(bio.bi_status)) + return blk_status_to_errno(bio.bi_status); return ret; } @@ -288,16 +288,18 @@ static void blkdev_bio_end_io(struct bio *bio) bool should_dirty = dio->should_dirty; if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_error && !dio->bio.bi_error) - dio->bio.bi_error = bio->bi_error; + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; } else { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; - ssize_t ret = dio->bio.bi_error; + ssize_t ret; - if (likely(!ret)) { + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); @@ -363,7 +365,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } @@ -413,7 +415,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) __set_current_state(TASK_RUNNING); if (!ret) - ret = dio->bio.bi_error; + ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b8622e4d1744..d87ac27a5f2b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -310,7 +310,8 @@ struct btrfs_dio_private { * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ - int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); + blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, + blk_status_t); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ab14c2e635ca..4ded1c3f92b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2129,7 +2129,7 @@ static void btrfsic_bio_end_io(struct bio *bp) /* mutex is not held! This is not save if IO is not yet completed * on umount */ iodone_w_error = 0; - if (bp->bi_error) + if (bp->bi_status) iodone_w_error = 1; BUG_ON(NULL == block); @@ -2143,7 +2143,7 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", - bp->bi_error, + bp->bi_status, btrfsic_get_block_type(dev_state->state, block), block->logical_bytenr, dev_state->name, block->dev_bytenr, block->mirror_num); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 10e6b282d09d..9ac55b266e78 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -155,7 +155,7 @@ static void end_compressed_bio_read(struct bio *bio) unsigned long index; int ret; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -268,7 +268,7 @@ static void end_compressed_bio_write(struct bio *bio) struct page *page; unsigned long index; - if (bio->bi_error) + if (bio->bi_status) cb->errors = 1; /* if there are more bios still pending for this compressed @@ -287,7 +287,7 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_error ? 0 : 1); + bio->bi_status ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -320,7 +320,7 @@ out: * This also checksums the file bytes and gets things ready for * the end io hooks. */ -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, @@ -335,13 +335,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page *page; u64 first_byte = disk_start; struct block_device *bdev; - int ret; + blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) - return -ENOMEM; + return BLK_STS_RESOURCE; refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -358,7 +358,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); if (!bio) { kfree(cb); - return -ENOMEM; + return BLK_STS_RESOURCE; } bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; @@ -368,17 +368,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, /* create and submit bios for the compressed pages */ bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { + int submit = 0; + page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(page, 0, + submit = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(bio); @@ -400,7 +400,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -434,7 +434,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } @@ -569,7 +569,7 @@ next: * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -586,7 +586,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret = -ENOMEM; + blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; u32 *sums; @@ -600,7 +600,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE); read_unlock(&em_tree->lock); if (!em) - return -EIO; + return BLK_STS_IOERR; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -659,19 +659,19 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { + int submit = 0; + page = cb->compressed_pages[pg_index]; page->mapping = inode->i_mapping; page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(page, 0, + submit = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); - else - ret = 0; page->mapping = NULL; - if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < + if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_get(comp_bio); @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } @@ -726,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { - comp_bio->bi_error = ret; + comp_bio->bi_status = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 39ec43ab8df1..680d4265d601 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -48,12 +48,12 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, struct bio *bio); -int btrfs_submit_compressed_write(struct inode *inode, u64 start, +blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, unsigned long nr_pages); -int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); enum btrfs_compression_type { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 643c70d2b2e6..d2da0a52d560 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3078,8 +3078,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3094,7 +3094,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8685d67185d0..46accc75ad5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -87,7 +87,7 @@ struct btrfs_end_io_wq { bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; - int error; + blk_status_t status; enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; @@ -131,7 +131,7 @@ struct async_submit_bio { */ u64 bio_offset; struct btrfs_work work; - int error; + blk_status_t status; }; /* @@ -799,7 +799,7 @@ static void end_workqueue_bio(struct bio *bio) btrfs_work_func_t func; fs_info = end_io_wq->info; - end_io_wq->error = bio->bi_error; + end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { @@ -836,19 +836,19 @@ static void end_workqueue_bio(struct bio *bio) btrfs_queue_work(wq, &end_io_wq->work); } -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) - return -ENOMEM; + return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; - end_io_wq->error = 0; + end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; @@ -868,14 +868,14 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; - int ret; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) - async->error = ret; + async->status = ret; } static void run_one_async_done(struct btrfs_work *work) @@ -898,8 +898,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ - if (async->error) { - async->bio->bi_error = async->error; + if (async->status) { + async->bio->bi_status = async->status; bio_endio(async->bio); return; } @@ -916,18 +916,17 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, - u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done) +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return -ENOMEM; + return BLK_STS_RESOURCE; async->inode = inode; async->bio = bio; @@ -941,7 +940,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; async->bio_offset = bio_offset; - async->error = 0; + async->status = 0; atomic_inc(&fs_info->nr_async_submits); @@ -959,7 +958,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return 0; } -static int btree_csum_one_bio(struct bio *bio) +static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; @@ -972,12 +971,12 @@ static int btree_csum_one_bio(struct bio *bio) break; } - return ret; + return errno_to_blk_status(ret); } -static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -986,11 +985,11 @@ static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btree_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret; + blk_status_t ret; /* * when we're called for a write, we're already in the async @@ -998,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, */ ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1015,13 +1014,13 @@ static int check_async_write(unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(bio_flags); - int ret; + blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* @@ -1054,7 +1053,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, return 0; out_w_error: - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); return ret; } @@ -1820,7 +1819,7 @@ static void end_workqueue_fn(struct btrfs_work *work) end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; - bio->bi_error = end_io_wq->error; + bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); @@ -3495,11 +3494,11 @@ static void btrfs_end_empty_barrier(struct bio *bio) * any device where the flush fails with eopnotsupp are flagged as not-barrier * capable */ -static int write_dev_flush(struct btrfs_device *device, int wait) +static blk_status_t write_dev_flush(struct btrfs_device *device, int wait) { struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio; - int ret = 0; + blk_status_t ret = 0; if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return 0; @@ -3511,8 +3510,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio->bi_error) { - ret = bio->bi_error; + if (bio->bi_status) { + ret = bio->bi_status; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); } @@ -3531,7 +3530,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) device->flush_bio = NULL; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); if (!bio) - return -ENOMEM; + return BLK_STS_RESOURCE; bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; @@ -3556,7 +3555,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) struct btrfs_device *dev; int errors_send = 0; int errors_wait = 0; - int ret; + blk_status_t ret; /* send down all the barriers */ head = &info->fs_devices->devices; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 21f1ceb85b76..c581927555f3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -118,13 +118,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(const char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, u8 *result); -int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, +blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); -int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset, - extent_submit_bio_hook_t *submit_bio_start, - extent_submit_bio_hook_t *submit_bio_done); +blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, + struct inode *inode, struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 bio_offset, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d8da3edf2ac3..35cbb6ceb70d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2399,6 +2399,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; int read_mode = 0; + blk_status_t status; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2431,11 +2432,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, + status = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); - if (ret) { + if (status) { free_io_failure(BTRFS_I(inode), failrec); bio_put(bio); + ret = blk_status_to_errno(status); } return ret; @@ -2474,6 +2476,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) */ static void end_bio_extent_writepage(struct bio *bio) { + int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; u64 start; u64 end; @@ -2503,7 +2506,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - end_extent_writepage(page, bio->bi_error, start, end); + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -2536,7 +2539,7 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_error; + int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; u64 offset = 0; @@ -2556,7 +2559,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - (u64)bio->bi_iter.bi_sector, bio->bi_error, + (u64)bio->bi_iter.bi_sector, bio->bi_status, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2615,7 +2618,7 @@ static void end_bio_extent_readpage(struct bio *bio) ret = bio_readpage_error(bio, offset, page, start, end, mirror); if (ret == 0) { - uptodate = !bio->bi_error; + uptodate = !bio->bi_status; offset += len; continue; } @@ -2673,7 +2676,7 @@ readpage_ok: endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); if (io_bio->end_io) - io_bio->end_io(io_bio, bio->bi_error); + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); bio_put(bio); } @@ -2743,7 +2746,7 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) static int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - int ret = 0; + blk_status_t ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; @@ -2761,7 +2764,7 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, btrfsic_submit_bio(bio); bio_put(bio); - return ret; + return blk_status_to_errno(ret); } static int merge_bio(struct extent_io_tree *tree, struct page *page, @@ -3707,7 +3710,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (bio->bi_error || + if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); set_btree_ioerr(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1eafa2f0ede3..487ca0207cb6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -92,9 +92,9 @@ struct btrfs_inode; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset); +typedef blk_status_t (extent_submit_bio_hook_t)(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { /* * The following callbacks must be allways defined, the function diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 64fcb31d7163..5b1c7090e546 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -160,7 +160,7 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, +static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, path = btrfs_alloc_path(); if (!path) - return -ENOMEM; + return BLK_STS_RESOURCE; nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { @@ -191,7 +191,7 @@ static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); - return -ENOMEM; + return BLK_STS_RESOURCE; } btrfs_bio->csum = btrfs_bio->csum_allocated; btrfs_bio->end_io = btrfs_io_bio_endio_readpage; @@ -303,12 +303,12 @@ next: return 0; } -int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) +blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } @@ -433,7 +433,7 @@ fail: return ret; } -int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, +blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -452,7 +452,7 @@ int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) - return -ENOMEM; + return BLK_STS_RESOURCE; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 758b2666885e..ea7cae1003eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -842,13 +842,12 @@ retry: NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK); - ret = btrfs_submit_compressed_write(inode, + if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages); - if (ret) { + async_extent->nr_pages)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1901,11 +1900,11 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_start(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { - int ret = 0; + blk_status_t ret = 0; ret = btrfs_csum_one_bio(inode, bio, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -1920,16 +1919,16 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 bio_offset) +static blk_status_t __btrfs_submit_bio_done(struct inode *inode, + struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); if (ret) { - bio->bi_error = ret; + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -1939,14 +1938,14 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, +static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - int ret = 0; + blk_status_t ret = 0; int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -1991,8 +1990,8 @@ mapit: ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); out: - if (ret < 0) { - bio->bi_error = ret; + if (ret) { + bio->bi_status = ret; bio_endio(bio); } return ret; @@ -8037,7 +8036,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) struct bio_vec *bvec; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; ASSERT(bio->bi_vcnt == 1); @@ -8116,7 +8115,7 @@ static void btrfs_retry_endio(struct bio *bio) int ret; int i; - if (bio->bi_error) + if (bio->bi_status) goto end; uptodate = 1; @@ -8141,8 +8140,8 @@ end: bio_put(bio); } -static int __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { struct btrfs_fs_info *fs_info; struct bio_vec *bvec; @@ -8184,7 +8183,7 @@ try_again: io_bio->mirror_num, btrfs_retry_endio, &done); if (ret) { - err = ret; + err = errno_to_blk_status(ret); goto next; } @@ -8211,8 +8210,8 @@ next: return err; } -static int btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, int err) +static blk_status_t btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, blk_status_t err) { bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8232,7 +8231,7 @@ static void btrfs_endio_direct_read(struct bio *bio) struct inode *inode = dip->inode; struct bio *dio_bio; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); @@ -8243,11 +8242,11 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_error = bio->bi_error; + dio_bio->bi_status = bio->bi_status; dio_end_io(dio_bio); if (io_bio->end_io) - io_bio->end_io(io_bio, err); + io_bio->end_io(io_bio, blk_status_to_errno(err)); bio_put(bio); } @@ -8299,20 +8298,20 @@ static void btrfs_endio_direct_write(struct bio *bio) struct bio *dio_bio = dip->dio_bio; __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_error); + dip->bytes, !bio->bi_status); kfree(dip); - dio_bio->bi_error = bio->bi_error; + dio_bio->bi_status = bio->bi_status; dio_end_io(dio_bio); bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, +static blk_status_t __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { - int ret; + blk_status_t ret; ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; @@ -8321,7 +8320,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, static void btrfs_end_dio_bio(struct bio *bio) { struct btrfs_dio_private *dip = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, @@ -8351,7 +8350,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_error = 0; + dip->dio_bio->bi_status = 0; bio_endio(dip->orig_bio); } out: @@ -8368,14 +8367,14 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return bio; } -static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, +static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - int ret; + blk_status_t ret; /* * We load all the csum data we need when we submit @@ -8406,7 +8405,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - int ret; + blk_status_t ret; if (async_submit) async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); @@ -8649,7 +8648,7 @@ free_ordered: * callbacks - they require an allocated dip and a clone of dio_bio. */ if (io_bio && dip) { - io_bio->bi_error = -EIO; + io_bio->bi_status = BLK_STS_IOERR; bio_endio(io_bio); /* * The end io callbacks free our dip, do the final put on io_bio @@ -8668,7 +8667,7 @@ free_ordered: unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - dio_bio->bi_error = -EIO; + dio_bio->bi_status = BLK_STS_IOERR; /* * Releases and cleans up our dio_bio, no need to bio_put() * nor bio_endio()/bio_io_error() against dio_bio. diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8ea0eb76325..f3d30d9ea8f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -871,7 +871,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; @@ -884,7 +884,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_error = err; + cur->bi_status = err; bio_endio(cur); cur = next; } @@ -897,7 +897,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err) static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; int max_errors; if (err) @@ -914,7 +914,7 @@ static void raid_write_end_io(struct bio *bio) max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 0 : rbio->bbio->max_errors; if (atomic_read(&rbio->error) > max_errors) - err = -EIO; + err = BLK_STS_IOERR; rbio_orig_end_io(rbio, err); } @@ -1092,7 +1092,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && stripe->dev->bdev && - !last->bi_error && + !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) @@ -1448,7 +1448,7 @@ static void raid_rmw_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -1991,7 +1991,7 @@ static void raid_recover_end_io(struct bio *bio) * we only read stripe pages off the disk, set them * up to date if there were no errors */ - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); @@ -2530,7 +2530,7 @@ static void raid56_parity_scrub_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) fail_bio_stripe(rbio, bio); else set_bio_pages_uptodate(bio); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c7b45eb2403d..ba5595d19de1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -95,7 +95,7 @@ struct scrub_bio { struct scrub_ctx *sctx; struct btrfs_device *dev; struct bio *bio; - int err; + blk_status_t status; u64 logical; u64 physical; #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO @@ -1668,14 +1668,14 @@ leave_nomem: struct scrub_bio_ret { struct completion event; - int error; + blk_status_t status; }; static void scrub_bio_wait_endio(struct bio *bio) { struct scrub_bio_ret *ret = bio->bi_private; - ret->error = bio->bi_error; + ret->status = bio->bi_status; complete(&ret->event); } @@ -1693,7 +1693,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, int ret; init_completion(&done.event); - done.error = 0; + done.status = 0; bio->bi_iter.bi_sector = page->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; @@ -1705,7 +1705,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, return ret; wait_for_completion(&done.event); - if (done.error) + if (done.status) return -EIO; return 0; @@ -1937,7 +1937,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -1992,7 +1992,7 @@ static void scrub_wr_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, @@ -2007,7 +2007,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) int i; WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); - if (sbio->err) { + if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; @@ -2341,7 +2341,7 @@ again: bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - sbio->err = 0; + sbio->status = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || sbio->logical + sbio->page_count * PAGE_SIZE != @@ -2377,7 +2377,7 @@ static void scrub_missing_raid56_end_io(struct bio *bio) struct scrub_block *sblock = bio->bi_private; struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) sblock->no_io_error_seen = 0; bio_put(bio); @@ -2588,7 +2588,7 @@ static void scrub_bio_end_io(struct bio *bio) struct scrub_bio *sbio = bio->bi_private; struct btrfs_fs_info *fs_info = sbio->dev->fs_info; - sbio->err = bio->bi_error; + sbio->status = bio->bi_status; sbio->bio = bio; btrfs_queue_work(fs_info->scrub_workers, &sbio->work); @@ -2601,7 +2601,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) int i; BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); - if (sbio->err) { + if (sbio->status) { for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -3004,7 +3004,7 @@ static void scrub_parity_bio_endio(struct bio *bio) struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; - if (bio->bi_error) + if (bio->bi_status) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 017b67daa3bb..84a495967e0a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6042,9 +6042,10 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; - if (bio->bi_error) { + if (bio->bi_status) { atomic_inc(&bbio->error); - if (bio->bi_error == -EIO || bio->bi_error == -EREMOTEIO) { + if (bio->bi_status == BLK_STS_IOERR || + bio->bi_status == BLK_STS_TARGET) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; struct btrfs_device *dev; @@ -6082,13 +6083,13 @@ static void btrfs_end_bio(struct bio *bio) * beyond the tolerance of the btrfs bio */ if (atomic_read(&bbio->error) > bbio->max_errors) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } else { /* * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_error = 0; + bio->bi_status = 0; } btrfs_end_bbio(bbio, bio); @@ -6199,7 +6200,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; btrfs_end_bbio(bbio, bio); } } diff --git a/fs/buffer.c b/fs/buffer.c index 161be58c5cb0..306b720f7383 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3038,7 +3038,7 @@ static void end_bio_bh_io_sync(struct bio *bio) if (unlikely(bio_flagged(bio, BIO_QUIET))) set_bit(BH_Quiet, &bh->b_state); - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bio_put(bio); } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a409a84f1bca..6181e9526860 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -129,7 +129,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, goto errout; } err = submit_bio_wait(bio); - if ((err == 0) && bio->bi_error) + if (err == 0 && bio->bi_status) err = -EIO; bio_put(bio); if (err) diff --git a/fs/direct-io.c b/fs/direct-io.c index bb711e4b86c2..e8baaabebf13 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -294,7 +294,7 @@ static void dio_aio_complete_work(struct work_struct *work) dio_complete(dio, 0, true); } -static int dio_bio_complete(struct dio *dio, struct bio *bio); +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); /* * Asynchronous IO callback. @@ -473,11 +473,11 @@ static struct bio *dio_await_one(struct dio *dio) /* * Process one completed BIO. No locks are held. */ -static int dio_bio_complete(struct dio *dio, struct bio *bio) +static blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) { struct bio_vec *bvec; unsigned i; - int err = bio->bi_error; + blk_status_t err = bio->bi_status; if (err) dio->io_error = -EIO; @@ -536,7 +536,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) bio = dio->bio_list; dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); - ret2 = dio_bio_complete(dio, bio); + ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); if (ret == 0) ret = ret2; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1a82138ba739..930ca0fc9a0f 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -85,7 +85,7 @@ static void ext4_finish_bio(struct bio *bio) } #endif - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); mapping_set_error(page->mapping, -EIO); } @@ -104,7 +104,7 @@ static void ext4_finish_bio(struct bio *bio) continue; } clear_buffer_async_write(bh); - if (bio->bi_error) + if (bio->bi_status) buffer_io_error(bh); } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); @@ -303,24 +303,25 @@ static void ext4_end_bio(struct bio *bio) bdevname(bio->bi_bdev, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), - bio->bi_error)) { + bio->bi_status)) { ext4_finish_bio(bio); bio_put(bio); return; } bio->bi_end_io = NULL; - if (bio->bi_error) { + if (bio->bi_status) { struct inode *inode = io_end->inode; ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - bio->bi_error, inode->i_ino, + bio->bi_status, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); - mapping_set_error(inode->i_mapping, bio->bi_error); + mapping_set_error(inode->i_mapping, + blk_status_to_errno(bio->bi_status)); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index a81b829d56de..40a5497b0f60 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -73,7 +73,7 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -83,7 +83,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { SetPageUptodate(page); } else { ClearPageUptodate(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c0f6bdf817d..36fe82012a33 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,12 +58,12 @@ static void f2fs_read_end_io(struct bio *bio) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; } #endif if (f2fs_bio_encrypted(bio)) { - if (bio->bi_error) { + if (bio->bi_status) { fscrypt_release_ctx(bio->bi_private); } else { fscrypt_decrypt_bio_pages(bio->bi_private, bio); @@ -74,7 +74,7 @@ static void f2fs_read_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (!bio->bi_error) { + if (!bio->bi_status) { if (!PageUptodate(page)) SetPageUptodate(page); } else { @@ -102,14 +102,14 @@ static void f2fs_write_end_io(struct bio *bio) unlock_page(page); mempool_free(page, sbi->write_io_dummy); - if (unlikely(bio->bi_error)) + if (unlikely(bio->bi_status)) f2fs_stop_checkpoint(sbi, true); continue; } fscrypt_pullback_bio_page(&page, true); - if (unlikely(bio->bi_error)) { + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96845854e7ee..ea9f455d94ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -749,7 +749,7 @@ static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - dc->error = bio->bi_error; + dc->error = blk_status_to_errno(bio->bi_status); dc->state = D_DONE; complete(&dc->wait); bio_put(bio); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 13ebf15a4db0..885d36e7a29f 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -170,7 +170,7 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp) */ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, - int error) + blk_status_t error) { struct buffer_head *bh, *next; struct page *page = bvec->bv_page; @@ -209,13 +209,13 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_error) - fs_err(sdp, "Error %d writing to log\n", bio->bi_error); + if (bio->bi_status) + fs_err(sdp, "Error %d writing to log\n", bio->bi_status); bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_error); + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); else mempool_free(page, gfs2_page_pool); } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 663ffc135ef3..fabe1614f879 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,7 +201,7 @@ static void gfs2_meta_read_endio(struct bio *bio) do { struct buffer_head *next = bh->b_this_page; len -= bh->b_size; - bh->b_end_io(bh, !bio->bi_error); + bh->b_end_io(bh, !bio->bi_status); bh = next; } while (bh && len); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ed67548b286c..83953cdbbc6c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -176,10 +176,10 @@ static void end_bio_io_page(struct bio *bio) { struct page *page = bio->bi_private; - if (!bio->bi_error) + if (!bio->bi_status) SetPageUptodate(page); else - pr_warn("error %d reading superblock\n", bio->bi_error); + pr_warn("error %d reading superblock\n", bio->bi_status); unlock_page(page); } diff --git a/fs/iomap.c b/fs/iomap.c index 4b10892967a5..18f2f2b8ba2c 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -672,8 +672,8 @@ static void iomap_dio_bio_end_io(struct bio *bio) struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_error) - iomap_dio_set_error(dio, bio->bi_error); + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); if (atomic_dec_and_test(&dio->ref)) { if (is_sync_kiocb(dio->iocb)) { diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index bb1da1feafeb..a21f0e9eecd4 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2205,7 +2205,7 @@ static void lbmIODone(struct bio *bio) bp->l_flag |= lbmDONE; - if (bio->bi_error) { + if (bio->bi_status) { bp->l_flag |= lbmERROR; jfs_err("lbmIODone: I/O error in JFS log"); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 489aaa1403e5..ce93db3aef3c 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -280,7 +280,7 @@ static void metapage_read_end_io(struct bio *bio) { struct page *page = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_read_end_io: I/O error\n"); SetPageError(page); } @@ -337,7 +337,7 @@ static void metapage_write_end_io(struct bio *bio) BUG_ON(!PagePrivate(page)); - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ERR "metapage_write_end_io: I/O error\n"); SetPageError(page); } diff --git a/fs/mpage.c b/fs/mpage.c index baff8f820c29..9524fdde00c2 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,8 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, op_is_write(bio_op(bio)), bio->bi_error); + page_endio(page, op_is_write(bio_op(bio)), + blk_status_to_errno(bio->bi_status)); } bio_put(bio); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0ca370d23ddb..d8863a804b15 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -188,7 +188,7 @@ static void bl_end_io_read(struct bio *bio) { struct parallel_io *par = bio->bi_private; - if (bio->bi_error) { + if (bio->bi_status) { struct nfs_pgio_header *header = par->data; if (!header->pnfs_error) @@ -319,7 +319,7 @@ static void bl_end_io_write(struct bio *bio) struct parallel_io *par = bio->bi_private; struct nfs_pgio_header *header = par->data; - if (bio->bi_error) { + if (bio->bi_status) { if (!header->pnfs_error) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6f87b2ac1aeb..e73c86d9855c 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -338,7 +338,7 @@ static void nilfs_end_bio_write(struct bio *bio) { struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (bio->bi_error) + if (bio->bi_status) atomic_inc(&segbuf->sb_err); bio_put(bio); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0da0332725aa..ffe003982d95 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -516,9 +516,9 @@ static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (bio->bi_error) { - mlog(ML_ERROR, "IO Error %d\n", bio->bi_error); - wc->wc_error = bio->bi_error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 09af0f7cd55e..76b6f988e2fa 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -276,7 +276,7 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - int error = ioend->io_bio->bi_error; + int error; /* * Just clean up the in-memory strutures if the fs has been shut down. @@ -289,6 +289,7 @@ xfs_end_io( /* * Clean up any COW blocks on an I/O error. */ + error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { switch (ioend->io_type) { case XFS_IO_COW: @@ -332,7 +333,7 @@ xfs_end_bio( else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else - xfs_destroy_ioend(ioend, bio->bi_error); + xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } STATIC int @@ -500,7 +501,7 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_bio->bi_error = status; + ioend->io_bio->bi_status = errno_to_blk_status(status); bio_endio(ioend->io_bio); return status; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 62fa39276a24..15c7a484a5d2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1213,8 +1213,11 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (bio->bi_error) - cmpxchg(&bp->b_io_error, 0, bio->bi_error); + if (bio->bi_status) { + int error = blk_status_to_errno(bio->bi_status); + + cmpxchg(&bp->b_io_error, 0, error); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); diff --git a/include/linux/bio.h b/include/linux/bio.h index d1b04b0e99cf..9455aada1399 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -414,7 +414,7 @@ extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 59378939a8cd..dcd45b15a3a5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -33,6 +33,9 @@ typedef u8 __bitwise blk_status_t; #define BLK_STS_RESOURCE ((__force blk_status_t)9) #define BLK_STS_IOERR ((__force blk_status_t)10) +/* hack for device mapper, don't use elsewhere: */ +#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11) + struct blk_issue_stat { u64 stat; }; @@ -44,7 +47,7 @@ struct blk_issue_stat { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - int bi_error; + blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2a8871638453..76b6df862a12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1782,7 +1782,7 @@ struct blk_integrity_iter { const char *disk_name; }; -typedef int (integrity_processing_fn) (struct blk_integrity_iter *); +typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); struct blk_integrity_profile { integrity_processing_fn *generate_fn; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5de5c53251ec..456da5017b32 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -72,7 +72,7 @@ typedef void (*dm_release_clone_request_fn) (struct request *clone); * 2 : The target wants to push back the io */ typedef int (*dm_endio_fn) (struct dm_target *ti, - struct bio *bio, int *error); + struct bio *bio, blk_status_t *error); typedef int (*dm_request_endio_fn) (struct dm_target *ti, struct request *clone, blk_status_t error, union map_info *map_context); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33639e0..57d22571f306 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; struct hib_bio_batch { atomic_t count; wait_queue_head_t wait; - int error; + blk_status_t error; }; static void hib_init_batch(struct hib_bio_batch *hb) { atomic_set(&hb->count, 0); init_waitqueue_head(&hb->wait); - hb->error = 0; + hb->error = BLK_STS_OK; } static void hib_end_io(struct bio *bio) @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) struct hib_bio_batch *hb = bio->bi_private; struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) flush_icache_range((unsigned long)page_address(page), (unsigned long)page_address(page) + PAGE_SIZE); - if (bio->bi_error && !hb->error) - hb->error = bio->bi_error; + if (bio->bi_status && !hb->error) + hb->error = bio->bi_status; if (atomic_dec_and_test(&hb->count)) wake_up(&hb->wait); @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, return error; } -static int hib_wait_io(struct hib_bio_batch *hb) +static blk_status_t hib_wait_io(struct hib_bio_batch *hb) { wait_event(hb->wait, atomic_read(&hb->count) == 0); - return hb->error; + return blk_status_to_errno(hb->error); } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 193c5f5e3f79..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, - BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), + BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu); } } @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, + bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r); } diff --git a/mm/page_io.c b/mm/page_io.c index 23f6d0d3470f..2da71e627812 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -45,7 +45,7 @@ void end_swap_bio_write(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -118,7 +118,7 @@ static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; - if (bio->bi_error) { + if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); pr_alert("Read-error on swap-device (%u:%u:%llu)\n", -- cgit v1.3-14-g43fede From 1e1fc133483ef3b56c20bf3cd9241146c41042f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 May 2017 00:29:38 -0400 Subject: compat_{get,put}_bitmap(): use unsafe_{get,put}_user() unroll the inner loops, while we are at it Signed-off-by: Al Viro --- include/linux/compat.h | 3 +- kernel/compat.c | 81 +++++++++++++++++--------------------------------- 2 files changed, 29 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c5f3152cbb5..94ceb0348a25 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -388,8 +388,7 @@ asmlinkage long compat_sys_wait4(compat_pid_t pid, #define BITS_PER_COMPAT_LONG (8*sizeof(compat_long_t)) -#define BITS_TO_COMPAT_LONGS(bits) \ - (((bits)+BITS_PER_COMPAT_LONG-1)/BITS_PER_COMPAT_LONG) +#define BITS_TO_COMPAT_LONGS(bits) DIV_ROUND_UP(bits, BITS_PER_COMPAT_LONG) long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); diff --git a/kernel/compat.c b/kernel/compat.c index 860f674fa556..9c2a8f3788d5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -871,84 +871,59 @@ int get_compat_sigevent(struct sigevent *event, long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = 0; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - /* - * We dont want to read past the end of the userspace - * bitmap. We must however ensure the end of the - * kernel bitmap is zeroed. - */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__get_user(um, umask)) - return -EFAULT; - } else { - um = 0; - } - - umask++; - m |= (long)um << (j * BITS_PER_COMPAT_LONG); - } - *mask++ = m; + user_access_begin(); + while (nr_compat_longs > 1) { + compat_ulong_t l1, l2; + unsafe_get_user(l1, umask++, Efault); + unsafe_get_user(l2, umask++, Efault); + *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_get_user(*mask, umask++, Efault); + user_access_end(); return 0; + +Efault: + user_access_end(); + return -EFAULT; } long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size) { - int i, j; - unsigned long m; - compat_ulong_t um; unsigned long nr_compat_longs; /* align bitmap up to nearest compat_long_t boundary */ bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); + nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) return -EFAULT; - nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - - for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { - m = *mask++; - - for (j = 0; j < sizeof(m)/sizeof(um); j++) { - um = m; - - /* - * We dont want to write past the end of the userspace - * bitmap. - */ - if (nr_compat_longs) { - nr_compat_longs--; - if (__put_user(um, umask)) - return -EFAULT; - } - - umask++; - m >>= 4*sizeof(um); - m >>= 4*sizeof(um); - } + user_access_begin(); + while (nr_compat_longs > 1) { + unsigned long m = *mask++; + unsafe_put_user((compat_ulong_t)m, umask++, Efault); + unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); + nr_compat_longs -= 2; } - + if (nr_compat_longs) + unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); + user_access_end(); return 0; +Efault: + user_access_end(); + return -EFAULT; } void -- cgit v1.3-14-g43fede From ca2406ed58fef3f7c8ef6470cba807bfc3415605 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:22:44 -0400 Subject: times(2): move compat to native Signed-off-by: Al Viro --- include/linux/time.h | 3 --- kernel/compat.c | 24 ------------------------ kernel/sys.c | 28 +++++++++++++++++++++++++++- 3 files changed, 27 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index c0543f5f25de..f769ea88250d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -171,9 +171,6 @@ extern int do_getitimer(int which, struct itimerval *value); extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); -struct tms; -extern void do_sys_times(struct tms *); - /* * Similar to the struct tm in userspace , but it needs to be here so * that the kernel source is self contained. diff --git a/kernel/compat.c b/kernel/compat.c index 9c2a8f3788d5..99408252762e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,30 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -static compat_clock_t clock_t_to_compat_clock_t(clock_t x) -{ - return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); -} - -COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) -{ - if (tbuf) { - struct tms tms; - struct compat_tms tmp; - - do_sys_times(&tms); - /* Convert our struct tms to the compat version. */ - tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); - tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); - tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); - tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); - if (copy_to_user(tbuf, &tmp, sizeof(tmp))) - return -EFAULT; - } - force_successful_syscall_return(); - return compat_jiffies_to_clock_t(jiffies); -} - #ifdef __ARCH_WANT_SYS_SIGPENDING /* diff --git a/kernel/sys.c b/kernel/sys.c index 3778a8a417b6..161b5eae9c77 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -void do_sys_times(struct tms *tms) +static void do_sys_times(struct tms *tms) { u64 tgutime, tgstime, cutime, cstime; @@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) return (long) jiffies_64_to_clock_t(get_jiffies_64()); } +#ifdef CONFIG_COMPAT +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) +{ + if (tbuf) { + struct tms tms; + struct compat_tms tmp; + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + force_successful_syscall_return(); + return compat_jiffies_to_clock_t(jiffies); +} +#endif + /* * This needs some heavy checking ... * I just haven't the stomach for it. I also don't fully -- cgit v1.3-14-g43fede From d9e968cb9f849770288f5fde3d8d3a5f7e339052 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:33:51 -0400 Subject: getrlimit()/setrlimit(): move compat to native Signed-off-by: Al Viro --- kernel/compat.c | 38 -------------------------------------- kernel/sys.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 99408252762e..58b8e57398d1 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -427,44 +427,6 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - - if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || - __get_user(r.rlim_cur, &rlim->rlim_cur) || - __get_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - - if (r.rlim_cur == COMPAT_RLIM_INFINITY) - r.rlim_cur = RLIM_INFINITY; - if (r.rlim_max == COMPAT_RLIM_INFINITY) - r.rlim_max = RLIM_INFINITY; - return do_prlimit(current, resource, &r, NULL); -} - -COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, - struct compat_rlimit __user *, rlim) -{ - struct rlimit r; - int ret; - - ret = do_prlimit(current, resource, NULL, &r); - if (!ret) { - if (r.rlim_cur > COMPAT_RLIM_INFINITY) - r.rlim_cur = COMPAT_RLIM_INFINITY; - if (r.rlim_max > COMPAT_RLIM_INFINITY) - r.rlim_max = COMPAT_RLIM_INFINITY; - - if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || - __put_user(r.rlim_cur, &rlim->rlim_cur) || - __put_user(r.rlim_max, &rlim->rlim_max)) - return -EFAULT; - } - return ret; -} - int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || diff --git a/kernel/sys.c b/kernel/sys.c index 161b5eae9c77..873e6eaa314f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1332,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) return ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + struct compat_rlimit r32; + + if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) + return -EFAULT; + + if (r32.rlim_cur == COMPAT_RLIM_INFINITY) + r.rlim_cur = RLIM_INFINITY; + else + r.rlim_cur = r32.rlim_cur; + if (r32.rlim_max == COMPAT_RLIM_INFINITY) + r.rlim_max = RLIM_INFINITY; + else + r.rlim_max = r32.rlim_max; + return do_prlimit(current, resource, &r, NULL); +} + +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) +{ + struct rlimit r; + int ret; + + ret = do_prlimit(current, resource, NULL, &r); + if (!ret) { + struct rlimit r32; + if (r.rlim_cur > COMPAT_RLIM_INFINITY) + r32.rlim_cur = COMPAT_RLIM_INFINITY; + else + r32.rlim_cur = r.rlim_cur; + if (r.rlim_max > COMPAT_RLIM_INFINITY) + r32.rlim_max = COMPAT_RLIM_INFINITY; + else + r32.rlim_max = r.rlim_max; + + if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) + return -EFAULT; + } + return ret; +} + +#endif + #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT /* -- cgit v1.3-14-g43fede From 8f13621abcedb278cfecf9703583743f9c474c97 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:42:07 -0400 Subject: sigpending(): move compat to native ... and kill set_fs() use Signed-off-by: Al Viro --- kernel/compat.c | 23 ----------------------- kernel/signal.c | 12 ++++++++++++ 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 58b8e57398d1..195e23469854 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -350,29 +350,6 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, return 0; } -#ifdef __ARCH_WANT_SYS_SIGPENDING - -/* - * Assumption: old_sigset_t and compat_old_sigset_t are both - * types that can be passed to put_user()/get_user(). - */ - -COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) -{ - old_sigset_t s; - long ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sigpending((old_sigset_t __user *) &s); - set_fs(old_fs); - if (ret == 0) - ret = put_user(s, set); - return ret; -} - -#endif - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/signal.c b/kernel/signal.c index d1eed0d7ca64..6237f492adfc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3254,6 +3254,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) +{ + sigset_t set; + int err = do_sigpending(&set, sizeof(old_sigset_t)); + if (err == 0) + if (copy_to_user(set32, &set, sizeof(old_sigset_t))) + err = -EFAULT; + return err; +} +#endif + #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -- cgit v1.3-14-g43fede From 7668b679c3b7931f6436d1eef50904831209e749 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 06:39:31 -0400 Subject: put_compat_rusage(): switch to copy_to_user() Signed-off-by: Al Viro --- kernel/compat.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 195e23469854..64e772aabdb5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -406,25 +406,27 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) { - if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || - __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || - __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || - __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || - __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || - __put_user(r->ru_maxrss, &ru->ru_maxrss) || - __put_user(r->ru_ixrss, &ru->ru_ixrss) || - __put_user(r->ru_idrss, &ru->ru_idrss) || - __put_user(r->ru_isrss, &ru->ru_isrss) || - __put_user(r->ru_minflt, &ru->ru_minflt) || - __put_user(r->ru_majflt, &ru->ru_majflt) || - __put_user(r->ru_nswap, &ru->ru_nswap) || - __put_user(r->ru_inblock, &ru->ru_inblock) || - __put_user(r->ru_oublock, &ru->ru_oublock) || - __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || - __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || - __put_user(r->ru_nsignals, &ru->ru_nsignals) || - __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || - __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) + struct compat_rusage r32; + memset(&r32, 0, sizeof(r32)); + r32.ru_utime.tv_sec = r->ru_utime.tv_sec; + r32.ru_utime.tv_usec = r->ru_utime.tv_usec; + r32.ru_stime.tv_sec = r->ru_stime.tv_sec; + r32.ru_stime.tv_usec = r->ru_stime.tv_usec; + r32.ru_maxrss = r->ru_maxrss; + r32.ru_ixrss = r->ru_ixrss; + r32.ru_idrss = r->ru_idrss; + r32.ru_isrss = r->ru_isrss; + r32.ru_minflt = r->ru_minflt; + r32.ru_majflt = r->ru_majflt; + r32.ru_nswap = r->ru_nswap; + r32.ru_inblock = r->ru_inblock; + r32.ru_oublock = r->ru_oublock; + r32.ru_msgsnd = r->ru_msgsnd; + r32.ru_msgrcv = r->ru_msgrcv; + r32.ru_nsignals = r->ru_nsignals; + r32.ru_nvcsw = r->ru_nvcsw; + r32.ru_nivcsw = r->ru_nivcsw; + if (copy_to_user(ru, &r32, sizeof(r32))) return -EFAULT; return 0; } -- cgit v1.3-14-g43fede From 1b3c872c8342803d0fcd8042e4e007d173191db6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 31 May 2017 04:46:17 -0400 Subject: rt_sigtimedwait(): move compat to native Signed-off-by: Al Viro --- include/linux/signal.h | 2 -- kernel/compat.c | 32 -------------------------------- kernel/signal.c | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 1f5a16620693..231603ac20a3 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -246,8 +246,6 @@ extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); -extern int do_sigtimedwait(const sigset_t *, siginfo_t *, - const struct timespec *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); diff --git a/kernel/compat.c b/kernel/compat.c index 64e772aabdb5..36e6e7c405e3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -866,38 +866,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) } } -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, - struct compat_siginfo __user *, uinfo, - struct compat_timespec __user *, uts, compat_size_t, sigsetsize) -{ - compat_sigset_t s32; - sigset_t s; - struct timespec t; - siginfo_t info; - long ret; - - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) - return -EFAULT; - sigset_from_compat(&s, &s32); - - if (uts) { - if (compat_get_timespec(&t, uts)) - return -EFAULT; - } - - ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - - if (ret > 0 && uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - - return ret; -} - #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 6237f492adfc..fe5b3608c31c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2768,7 +2768,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ -int do_sigtimedwait(const sigset_t *which, siginfo_t *info, +static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, const struct timespec *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; @@ -2857,6 +2857,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, + struct compat_siginfo __user *, uinfo, + struct compat_timespec __user *, uts, compat_size_t, sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + struct timespec t; + siginfo_t info; + long ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + + if (uts) { + if (compat_get_timespec(&t, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} +#endif + /** * sys_kill - send a signal to a process * @pid: the PID of the process -- cgit v1.3-14-g43fede From 20b9d7ac48526ce9a14106241e76e8382d126a60 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:40 +0200 Subject: bpf: avoid excessive stack usage for perf_sample_data perf_sample_data consumes 386 bytes on stack, reduce excessive stack usage and move it to per cpu buffer. It's allowed due to preemption being disabled for tracing, xdp and tc programs, thus at all times only one program can run on a specific CPU and programs cannot run from interrupt. We similarly also handle bpf_pt_regs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08eb072430b9..051d7fca0c09 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -266,14 +266,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw) { struct bpf_array *array = container_of(map, struct bpf_array, map); + struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; - struct perf_sample_data sample_data; struct bpf_event_entry *ee; struct perf_event *event; @@ -294,9 +296,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(&sample_data, 0, 0); - sample_data.raw = raw; - perf_event_output(event, &sample_data, regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = raw; + perf_event_output(event, sd, regs); return 0; } -- cgit v1.3-14-g43fede From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:41 +0200 Subject: bpf: don't check spilled reg state for non-STACK_SPILLed type slots spilled_regs[] state is only used for stack slots of type STACK_SPILL, never for STACK_MISC. Right now, in states_equal(), even if we have old and current stack state of type STACK_MISC, we compare spilled_regs[] for that particular offset. Just skip these like we do everywhere else. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ccb0759fa4..d031b3b0752e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env, return false; if (i % BPF_REG_SIZE) continue; + if (old->stack_slot_type[i] != STACK_SPILL) + continue; if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], &cur->spilled_regs[i / BPF_REG_SIZE], sizeof(old->spilled_regs[0]))) -- cgit v1.3-14-g43fede From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:42 +0200 Subject: bpf: reset id on CONST_IMM transition Whenever we set the register to the type CONST_IMM, we currently don't reset the id to 0. id member is not used in CONST_IMM case, so don't let it become stale, where pruning won't be able to match later on. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d031b3b0752e..d195d825515a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = insn->imm; + regs[insn->dst_reg].id = 0; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); @@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].type = CONST_IMM; regs[insn->dst_reg].imm = imm; + regs[insn->dst_reg].id = 0; return 0; } -- cgit v1.3-14-g43fede From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Jun 2017 00:50:43 +0200 Subject: bpf: reset id on spilled regs in clear_all_pkt_pointers Right now, we don't reset the id of spilled registers in case of clear_all_pkt_pointers(). Given pkt_pointers are highly likely to contain an id, do so by reusing __mark_reg_unknown_value(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d195d825515a..519a6144d3d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; - reg->type = UNKNOWN_VALUE; - reg->imm = 0; + __mark_reg_unknown_value(state->spilled_regs, + i / BPF_REG_SIZE); } } -- cgit v1.3-14-g43fede From c6503be587e9c5c0aac4e2b45de982352f676a5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 17:21:26 +0200 Subject: posix-timers: Fix inverted SIGEV_NONE logic in common_timer_get() The refactoring of the posix-timer core to allow better code sharing introduced inverted logic vs. SIGEV_NONE timers in common_timer_get(). That causes hrtimer_forward() to be called on active timers, which rightfully triggers the warning hrtimer_forward(). Make sig_none what it says: signal mode == SIGEV_NONE. Fixes: 91d57bae0868 ("posix-timers: Make use of forward/remaining callbacks") Reported-by: Ye Xiaolong Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170609104457.GA39907@inn.lkp.intel.com --- kernel/time/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 6e7a70b1bf37..b53a0b562516 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -644,7 +644,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE; + sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ -- cgit v1.3-14-g43fede From 57de72125d34f83bfd39615fcc3cc25ca3b9c0ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 8 Jun 2017 10:55:33 +0200 Subject: cpu/hotplug: Remove unused check_for_tasks() function clang -Wunused-function found one remaining function that was apparently meant to be removed in a recent code cleanup: kernel/cpu.c:565:20: warning: unused function 'check_for_tasks' [-Wunused-function] Sebastian explained: The function became unused unintentionally, but there is already a failure check, when a task cannot be removed from the outgoing cpu in the scheduler code, so bringing it back is not really giving any extra value. Fixes: 530e9b76ae8f ("cpu/hotplug: Remove obsolete cpu hotplug register/unregister functions") Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Boris Ostrovsky Cc: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20170608085544.2257132-1-arnd@arndb.de Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 7435ffc6163b..d0f5f54aa087 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -562,30 +562,6 @@ void clear_tasks_mm_cpumask(int cpu) rcu_read_unlock(); } -static inline void check_for_tasks(int dead_cpu) -{ - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (!p->on_rq) - continue; - /* - * We do the check with unlocked task_rq(p)->lock. - * Order the reading to do not warn about a task, - * which was running on this cpu in the past, and - * it's just been woken on another cpu. - */ - rmb(); - if (task_cpu(p) != dead_cpu) - continue; - - pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", - p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } - read_unlock(&tasklist_lock); -} - /* Take this CPU down. */ static int take_cpu_down(void *_param) { -- cgit v1.3-14-g43fede From 5c7a3a3d20a4e175304c0e23809e3d70be8fed8a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 19:44:09 +0200 Subject: posix-timers: Zero out oldval itimerspec The recent posix timer rework moved the clearing of the itimerspec to the real syscall implementation, but forgot that the kclock->timer_get() is used by timer_settime() as well. That results in an uninitialized variable and bogus values returned to user space. Add the missing memset to timer_settime(). Fixes: eabdec043853 ("posix-timers: Zero settings value in common code") Reported-by: Andrei Vagin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com --- kernel/time/posix-timers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b53a0b562516..88517dcfe0ca 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -828,6 +828,8 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (!timespec64_valid(&new_spec64.it_interval) || !timespec64_valid(&new_spec64.it_value)) return -EINVAL; + if (rtn) + memset(rtn, 0, sizeof(*rtn)); retry: timr = lock_timer(timer_id, &flag); if (!timr) -- cgit v1.3-14-g43fede From 67edab48caeb75d412706f4b9d3107afd1e07623 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 12 Jun 2017 19:39:49 +0200 Subject: posix-timers: Handle relative posix-timers correctly The recent rework of the posix timer internals broke the magic posix mechanism, which requires that relative timers are not affected by modifications of the underlying clock. That means relative CLOCK_REALTIME timers cannot use CLOCK_REALTIME, because that can be set and adjusted. The underlying hrtimer switches the clock for these timers to CLOCK_MONOTONIC. That still works, but reading the remaining time of such a timer has been broken in the rework. The old code used the hrtimer internals directly and avoided the posix clock callbacks. Now common_timer_get() uses the underlying kclock->timer_get() callback, which is still CLOCK_REALTIME based. So the remaining time of such a timer is calculated against the wrong time base. Handle it by switching the k_itimer->kclock pointer according to the resulting hrtimer mode. k_itimer->it_clock still contains CLOCK_REALTIME because the timer might be set with ABSTIME later and then it needs to switch back to the realtime posix clock implementation. Fixes: eae1c4ae275f ("posix-timers: Make use of cancel/arm callbacks") Reported-by: Andrei Vagin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/20170609201156.GB21491@outlook.office365.com --- kernel/time/posix-timers.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 88517dcfe0ca..58c0f60b132f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -72,6 +72,7 @@ static DEFINE_SPINLOCK(hash_lock); static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); +static const struct k_clock clock_realtime, clock_monotonic; /* * we assume that the new SIGEV_THREAD_ID shares no bits with the other @@ -750,6 +751,18 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, enum hrtimer_mode mode; mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + /* + * Posix magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they become CLOCK_MONOTONIC based under the + * hood. See hrtimer_init(). Update timr->kclock, so the generic + * functions which use timr->kclock->clock_get() work. + * + * Note: it_clock stays unmodified, because the next timer_set() might + * use ABSTIME, so it needs to switch back. + */ + if (timr->it_clock == CLOCK_REALTIME) + timr->kclock = absolute ? &clock_realtime : &clock_monotonic; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; -- cgit v1.3-14-g43fede From e4c1a0d15b62a3ed2f8cd560e9fa589dbe880c13 Mon Sep 17 00:00:00 2001 From: Derek Robson Date: Mon, 12 Jun 2017 14:33:03 +1200 Subject: audit: style fix Fixed checkpatch.pl warnings of "function definition argument FOO should also have an identifier name" Signed-off-by: Derek Robson Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index bb3a4e14b7e5..b331d9b83f63 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -248,13 +248,13 @@ struct audit_netlink_list { struct sk_buff_head q; }; -int audit_send_list(void *); +int audit_send_list(void *_dest); extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; -extern int audit_del_rule(struct audit_entry *); -extern void audit_free_rule_rcu(struct rcu_head *); +extern int audit_del_rule(struct audit_entry *entry); +extern void audit_free_rule_rcu(struct rcu_head *head); extern struct list_head audit_filter_list[]; extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); @@ -302,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE -extern struct audit_chunk *audit_tree_lookup(const struct inode *); -extern void audit_put_chunk(struct audit_chunk *); -extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); -extern int audit_make_tree(struct audit_krule *, char *, u32); -extern int audit_add_tree_rule(struct audit_krule *); -extern int audit_remove_tree_rule(struct audit_krule *); +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); extern void audit_trim_trees(void); extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *); -extern void audit_put_tree(struct audit_tree *); -extern void audit_kill_trees(struct list_head *); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct list_head *list); #else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL @@ -324,7 +324,7 @@ extern void audit_kill_trees(struct list_head *); #define audit_kill_trees(list) BUG() #endif -extern char *audit_unpack_string(void **, size_t *, size_t); +extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); extern pid_t audit_sig_pid; extern kuid_t audit_sig_uid; @@ -334,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype); #ifdef CONFIG_AUDITSYSCALL extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); extern struct list_head *audit_killed_trees(void); #else #define audit_signal_info(s,t) AUDIT_DISABLED -- cgit v1.3-14-g43fede From d4af6d933ccffd24286528f04d5c39e702c8580f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 13 Jun 2017 06:04:14 +0200 Subject: nohz: Fix spurious warning when hrtimer and clockevent get out of sync The sanity check ensuring that the tick expiry cache (ts->next_tick) is actually in sync with the hardware clock (dev->next_event) makes the wrong assumption that the clock can't be programmed later than the hrtimer deadline. In fact the clock hardware can be programmed later on some conditions such as: * The hrtimer deadline is already in the past. * The hrtimer deadline is earlier than the minimum delay supported by the hardware. Such conditions can be met when we program the tick, for example if the last jiffies update hasn't been seen by the current CPU yet, we may program the hrtimer to a deadline that is earlier than ktime_get() because last_jiffies_update is our timestamp base to compute the next tick. As a result, we can randomly observe such warning: WARNING: CPU: 5 PID: 0 at kernel/time/tick-sched.c:794 tick_nohz_stop_sched_tick kernel/time/tick-sched.c:791 [inline] Call Trace: tick_nohz_irq_exit tick_irq_exit irq_exit exiting_irq smp_call_function_interrupt smp_call_function_single_interrupt call_function_single_interrupt Therefore, let's rather make sure that the tick expiry cache is sync'ed with the tick hrtimer deadline, against which it is not supposed to drift away. The clock hardware instead has its own will and can't be used as a reliable comparison point. Reported-and-tested-by: Sasha Levin Reported-and-tested-by: Abdul Haleem Signed-off-by: Frederic Weisbecker Cc: James Hartsock Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Tim Wright Link: http://lkml.kernel.org/r/1497326654-14122-1-git-send-email-fweisbec@gmail.com [ Minor readability edit. ] Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9d31f1e0067b..204600986e0d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -768,7 +768,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ - if (likely(dev->next_event <= ts->next_tick)) + if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) goto out; WARN_ON_ONCE(1); @@ -806,8 +806,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, goto out; } + hrtimer_set_expires(&ts->sched_timer, tick); + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); else tick_program_event(tick, 1); out: -- cgit v1.3-14-g43fede From 6a5ae63a0cc5d3ac73a96cb412fde66f3a71f98e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 8 Jun 2017 19:53:24 -0700 Subject: tracing: Remove unused declaration of trace_stop_cmdline_recording trace_stop_cmdline_recording declaration isn't in use, remove it. Link: http://lkml.kernel.org/r/20170609025327.9508-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..63deff9cdf2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1910,8 +1910,6 @@ static void tracing_stop_tr(struct trace_array *tr) raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -void trace_stop_cmdline_recording(void); - static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; -- cgit v1.3-14-g43fede From f4e981cba2dec675d40ac4f270b7e8ac164c9004 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Wed, 24 May 2017 07:49:50 +0200 Subject: printk: add __printf attributes to internal functions When compiling with -Wsuggest-attribute=format, gcc complains that some functions in kernel/printk/printk_safe.c transmit their argument to printf-like functions without having a printf attribute. Silence these warnings by adding relevant __printf attributes. Link: http://lkml.kernel.org/r/20170524054950.6722-1-nicolas.iooss_linux@m4x.org Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Nicolas Iooss Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk_safe.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 03a42a539b20..3cdaeaef9ce1 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) * happen, printk_safe_log_store() will notice the buffer->len mismatch * and repeat the write. */ -static int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) +static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, + const char *fmt, va_list args) { int add; size_t len; @@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void) * one writer running. But the buffer might get flushed from another * CPU, so we need to be careful. */ -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); @@ -330,7 +330,7 @@ void printk_nmi_exit(void) #else -static int vprintk_nmi(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) { return 0; } @@ -342,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args) * into itself. It uses a per-CPU buffer to store the message, just like * NMI. */ -static int vprintk_safe(const char *fmt, va_list args) +static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) { struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); -- cgit v1.3-14-g43fede From c81be52a3ac0267aa830a2c4cb769030ea3483c9 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 09:35:24 -0400 Subject: audit: fix a race condition with the auditd tracking code Originally reported by Adam and Dusty, it appears we have a small race window in kauditd_thread(), as documented in the Fedora BZ: * https://bugzilla.redhat.com/show_bug.cgi?id=1459326#c35 "This issue is partly due to the read-copy nature of RCU, and partly due to how we sync the auditd_connection state across kauditd_thread and the audit control channel. The kauditd_thread thread is always running so it can service the record queues and emit the multicast messages, if it happens to be just past the "main_queue" label, but before the "if (sk == NULL || ...)" if-statement which calls auditd_reset() when the new auditd connection is registered it could end up resetting the auditd connection, regardless of if it is valid or not. This is a rather small window and the variable nature of multi-core scheduling explains why this is proving rather difficult to reproduce." The fix is to have functions only call auditd_reset() when they believe that the kernel/auditd connection is still valid, e.g. non-NULL, and to have these callers pass their local copy of the auditd_connection pointer to auditd_reset() where it can be compared with the current connection state before resetting. If the caller has a stale state tracking pointer then the reset is ignored. We also make a small change to kauditd_thread() so that if the kernel/auditd connection is dead we skip the retry queue and send the records straight to the hold queue. This is necessary as we used to rely on auditd_reset() to occasionally purge the retry queue but we are going to be calling the reset function much less now and we want to make sure the retry queue doesn't grow unbounded. Reported-by: Adam Williamson Reported-by: Dusty Mabe Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b2e877100242..e1e2b3abfb93 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb) /** * auditd_reset - Disconnect the auditd connection + * @ac: auditd connection state * * Description: * Break the auditd/kauditd connection and move all the queued records into the - * hold queue in case auditd reconnects. + * hold queue in case auditd reconnects. It is important to note that the @ac + * pointer should never be dereferenced inside this function as it may be NULL + * or invalid, you can only compare the memory address! If @ac is NULL then + * the connection will always be reset. */ -static void auditd_reset(void) +static void auditd_reset(const struct auditd_connection *ac) { unsigned long flags; struct sk_buff *skb; @@ -590,6 +594,11 @@ static void auditd_reset(void) spin_lock_irqsave(&auditd_conn_lock, flags); ac_old = rcu_dereference_protected(auditd_conn, lockdep_is_held(&auditd_conn_lock)); + if (ac && ac != ac_old) { + /* someone already registered a new auditd connection */ + spin_unlock_irqrestore(&auditd_conn_lock, flags); + return; + } rcu_assign_pointer(auditd_conn, NULL); spin_unlock_irqrestore(&auditd_conn_lock, flags); @@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) return rc; err: - if (rc == -ECONNREFUSED) - auditd_reset(); + if (ac && rc == -ECONNREFUSED) + auditd_reset(ac); return rc; } @@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (rc < 0) { + if (ac && rc < 0) { sk = NULL; - auditd_reset(); + auditd_reset(ac); goto main_queue; } @@ -815,12 +824,13 @@ main_queue: /* process the main queue - do the multicast send and attempt * unicast, dump failed record sends to the retry queue; if * sk == NULL due to previous failures we will just do the - * multicast send and move the record to the retry queue */ + * multicast send and move the record to the hold queue */ rc = kauditd_send_queue(sk, portid, &audit_queue, 1, kauditd_send_multicast_skb, - kauditd_retry_skb); - if (sk == NULL || rc < 0) - auditd_reset(); + (sk ? + kauditd_retry_skb : kauditd_hold_skb)); + if (ac && rc < 0) + auditd_reset(ac); sk = NULL; /* drop our netns reference, no auditd sends past this line */ @@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) auditd_pid, 1); /* unregister the auditd connection */ - auditd_reset(); + auditd_reset(NULL); } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { -- cgit v1.3-14-g43fede From 02fd7f68f5342bc7e8054cb05ea4a07f26d41d12 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:42 -0500 Subject: trace: rename kernel enum section to eval The kernel and its modules have sections containing the enum string to value conversions. Rename this section because we intend to store more than enums in it. Link: http://lkml.kernel.org/r/20170531215653.3240-2-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 6 +++--- include/trace/trace_events.h | 2 +- kernel/module.c | 2 +- kernel/trace/trace.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 314a0b9219c6..800f9f9677a6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -125,9 +125,9 @@ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ KEEP(*(_ftrace_events)) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ - VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ - KEEP(*(_ftrace_enum_map)) \ - VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; + VMLINUX_SYMBOL(__start_ftrace_eval_maps) = .; \ + KEEP(*(_ftrace_eval_map)) \ + VMLINUX_SYMBOL(__stop_ftrace_eval_maps) = .; #else #define FTRACE_EVENTS() #endif diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 00f643164ca2..4bdd84023f5b 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -43,7 +43,7 @@ TRACE_MAKE_SYSTEM_STR(); .enum_value = a \ }; \ static struct trace_enum_map __used \ - __attribute__((section("_ftrace_enum_map"))) \ + __attribute__((section("_ftrace_eval_map"))) \ *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a /* diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..9ec4713c5eee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,7 +3077,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_enum_map", + mod->trace_enums = section_objs(info, "_ftrace_eval_map", sizeof(*mod->trace_enums), &mod->num_trace_enums); #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 63deff9cdf2c..acd3eb4d56a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7732,15 +7732,15 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_enum_maps[]; -extern struct trace_enum_map *__stop_ftrace_enum_maps[]; +extern struct trace_enum_map *__start_ftrace_eval_maps[]; +extern struct trace_enum_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { int len; - len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; - trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -- cgit v1.3-14-g43fede From 00f4b652b6f1dbfd4e1d5419d7f1cc23b1374da8 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:43 -0500 Subject: trace: rename trace_enum_map to trace_eval_map Each enum is loaded into the trace_enum_map, as we are now using this for more than enums rename it. Link: http://lkml.kernel.org/r/20170531215653.3240-3-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/linux/module.h | 2 +- include/linux/tracepoint.h | 6 +++--- include/trace/trace_events.h | 8 ++++---- kernel/trace/trace.c | 24 ++++++++++++------------ kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events.c | 14 +++++++------- 6 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..46b48043d741 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -442,7 +442,7 @@ struct module { #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; - struct trace_enum_map **trace_enums; + struct trace_eval_map **trace_enums; unsigned int num_trace_enums; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index cc48cb2ce209..f7b0f5525e46 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -25,10 +25,10 @@ struct module; struct tracepoint; struct notifier_block; -struct trace_enum_map { +struct trace_eval_map { const char *system; - const char *enum_string; - unsigned long enum_value; + const char *eval_string; + unsigned long eval_value; }; #define TRACEPOINT_DEFAULT_PRIO 10 diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 4bdd84023f5b..49cce5fb54ee 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -35,14 +35,14 @@ TRACE_MAKE_SYSTEM_STR(); #undef TRACE_DEFINE_ENUM #define TRACE_DEFINE_ENUM(a) \ - static struct trace_enum_map __used __initdata \ + static struct trace_eval_map __used __initdata \ __##TRACE_SYSTEM##_##a = \ { \ .system = TRACE_SYSTEM_STRING, \ - .enum_string = #a, \ - .enum_value = a \ + .eval_string = #a, \ + .eval_value = a \ }; \ - static struct trace_enum_map __used \ + static struct trace_eval_map __used \ __attribute__((section("_ftrace_eval_map"))) \ *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index acd3eb4d56a0..46fac3f63af1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ union trace_enum_map_item; struct trace_enum_map_tail { /* * "end" is first and points to NULL as it must be different - * than "mod" or "enum_string" + * than "mod" or "eval_string" */ union trace_enum_map_item *next; const char *end; /* points to NULL */ @@ -148,7 +148,7 @@ static DEFINE_MUTEX(trace_enum_mutex); * pointer to the next array of saved enum_map items. */ union trace_enum_map_item { - struct trace_enum_map map; + struct trace_eval_map map; struct trace_enum_map_head head; struct trace_enum_map_tail tail; }; @@ -4748,7 +4748,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { static union trace_enum_map_item * update_enum_map(union trace_enum_map_item *ptr) { - if (!ptr->map.enum_string) { + if (!ptr->map.eval_string) { if (ptr->tail.next) { ptr = ptr->tail.next; /* Set ptr to the next real item (skip head) */ @@ -4808,7 +4808,7 @@ static int enum_map_show(struct seq_file *m, void *v) union trace_enum_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", - ptr->map.enum_string, ptr->map.enum_value, + ptr->map.eval_string, ptr->map.eval_value, ptr->map.system); return 0; @@ -4844,11 +4844,11 @@ trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, +trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, int len) { - struct trace_enum_map **stop; - struct trace_enum_map **map; + struct trace_eval_map **stop; + struct trace_eval_map **map; union trace_enum_map_item *map_array; union trace_enum_map_item *ptr; @@ -4902,13 +4902,13 @@ static void trace_create_enum_file(struct dentry *d_tracer) #else /* CONFIG_TRACE_ENUM_MAP_FILE */ static inline void trace_create_enum_file(struct dentry *d_tracer) { } static inline void trace_insert_enum_map_file(struct module *mod, - struct trace_enum_map **start, int len) { } + struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ static void trace_insert_enum_map(struct module *mod, - struct trace_enum_map **start, int len) + struct trace_eval_map **start, int len) { - struct trace_enum_map **map; + struct trace_eval_map **map; if (len <= 0) return; @@ -7732,8 +7732,8 @@ struct dentry *tracing_init_dentry(void) return NULL; } -extern struct trace_enum_map *__start_ftrace_eval_maps[]; -extern struct trace_enum_map *__stop_ftrace_eval_maps[]; +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; static void __init trace_enum_init(void) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39fd77330aab..a9667297ae49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_enum_map **map, int len); +void trace_event_enum_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } +static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7973e10398c..cf5b9aa4d732 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,18 +2067,18 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) +static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; /* Find the length of the enum value as a string */ - elen = snprintf(ptr, 0, "%ld", map->enum_value); + elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) return NULL; - snprintf(ptr, elen + 1, "%ld", map->enum_value); + snprintf(ptr, elen + 1, "%ld", map->eval_value); /* Get the rest of the string of ptr */ rlen = strlen(ptr + len); @@ -2090,11 +2090,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) } static void update_event_printk(struct trace_event_call *call, - struct trace_enum_map *map) + struct trace_eval_map *map) { char *ptr; int quote = 0; - int len = strlen(map->enum_string); + int len = strlen(map->eval_string); for (ptr = call->print_fmt; *ptr; ptr++) { if (*ptr == '\\') { @@ -2125,7 +2125,7 @@ static void update_event_printk(struct trace_event_call *call, continue; } if (isalpha(*ptr) || *ptr == '_') { - if (strncmp(map->enum_string, ptr, len) == 0 && + if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { ptr = enum_replace(ptr, map, len); /* Hmm, enum string smaller than value */ @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_enum_map **map, int len) +void trace_event_enum_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit v1.3-14-g43fede From 99be647c5841d570a23b5dfa65bfecada8b6e6b5 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:44 -0500 Subject: trace: rename struct module entry for trace enums Each module has a list of enum's its contributing to the enum map, rename that entry to reflect its use by more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-4-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- include/linux/module.h | 4 ++-- kernel/module.c | 6 +++--- kernel/trace/trace.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 46b48043d741..8eb9a1e693e5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -442,8 +442,8 @@ struct module { #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; - struct trace_eval_map **trace_enums; - unsigned int num_trace_enums; + struct trace_eval_map **trace_evals; + unsigned int num_trace_evals; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; diff --git a/kernel/module.c b/kernel/module.c index 9ec4713c5eee..df1c4a9e7abb 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3077,9 +3077,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), &mod->num_trace_events); - mod->trace_enums = section_objs(info, "_ftrace_eval_map", - sizeof(*mod->trace_enums), - &mod->num_trace_enums); + mod->trace_evals = section_objs(info, "_ftrace_eval_map", + sizeof(*mod->trace_evals), + &mod->num_trace_evals); #endif #ifdef CONFIG_TRACING mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46fac3f63af1..061abd8ba101 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7746,7 +7746,7 @@ static void __init trace_enum_init(void) #ifdef CONFIG_MODULES static void trace_module_add_enums(struct module *mod) { - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; /* @@ -7756,7 +7756,7 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); + trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE @@ -7765,7 +7765,7 @@ static void trace_module_remove_enums(struct module *mod) union trace_enum_map_item *map; union trace_enum_map_item **last = &trace_enum_maps; - if (!mod->num_trace_enums) + if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); -- cgit v1.3-14-g43fede From 23bf8cb8dc86c0368d2471ebb4622e7edd38190b Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:45 -0500 Subject: trace: rename trace enum data structures in trace.c The enum map entries can be exported to userspace via a sys enum_map file. Rename those functions and structures to reflect the fact that we are using them for more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-5-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 061abd8ba101..13c81f4f2bd7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -122,38 +122,38 @@ int __disable_trace_on_warning; #ifdef CONFIG_TRACE_ENUM_MAP_FILE /* Map of enums to their values, for "enum_map" file */ -struct trace_enum_map_head { +struct trace_eval_map_head { struct module *mod; unsigned long length; }; -union trace_enum_map_item; +union trace_eval_map_item; -struct trace_enum_map_tail { +struct trace_eval_map_tail { /* * "end" is first and points to NULL as it must be different * than "mod" or "eval_string" */ - union trace_enum_map_item *next; + union trace_eval_map_item *next; const char *end; /* points to NULL */ }; static DEFINE_MUTEX(trace_enum_mutex); /* - * The trace_enum_maps are saved in an array with two extra elements, + * The trace_eval_maps are saved in an array with two extra elements, * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a * pointer to the next array of saved enum_map items. */ -union trace_enum_map_item { +union trace_eval_map_item { struct trace_eval_map map; - struct trace_enum_map_head head; - struct trace_enum_map_tail tail; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; }; -static union trace_enum_map_item *trace_enum_maps; +static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4745,8 +4745,8 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { }; #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static union trace_enum_map_item * -update_enum_map(union trace_enum_map_item *ptr) +static union trace_eval_map_item * +update_enum_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4761,7 +4761,7 @@ update_enum_map(union trace_enum_map_item *ptr) static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; /* * Paranoid! If ptr points to end, we don't want to increment past it. @@ -4782,12 +4782,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) static void *enum_map_start(struct seq_file *m, loff_t *pos) { - union trace_enum_map_item *v; + union trace_eval_map_item *v; loff_t l = 0; mutex_lock(&trace_enum_mutex); - v = trace_enum_maps; + v = trace_eval_maps; if (v) v++; @@ -4805,7 +4805,7 @@ static void enum_map_stop(struct seq_file *m, void *v) static int enum_map_show(struct seq_file *m, void *v) { - union trace_enum_map_item *ptr = v; + union trace_eval_map_item *ptr = v; seq_printf(m, "%s %ld (%s)\n", ptr->map.eval_string, ptr->map.eval_value, @@ -4836,8 +4836,8 @@ static const struct file_operations tracing_enum_map_fops = { .release = seq_release, }; -static inline union trace_enum_map_item * -trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) +static inline union trace_eval_map_item * +trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4849,13 +4849,13 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, { struct trace_eval_map **stop; struct trace_eval_map **map; - union trace_enum_map_item *map_array; - union trace_enum_map_item *ptr; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; stop = start + len; /* - * The trace_enum_maps contains the map plus a head and tail item, + * The trace_eval_maps contains the map plus a head and tail item, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ @@ -4867,10 +4867,10 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, mutex_lock(&trace_enum_mutex); - if (!trace_enum_maps) - trace_enum_maps = map_array; + if (!trace_eval_maps) + trace_eval_maps = map_array; else { - ptr = trace_enum_maps; + ptr = trace_eval_maps; for (;;) { ptr = trace_enum_jmp_to_tail(ptr); if (!ptr->tail.next) @@ -7762,15 +7762,15 @@ static void trace_module_add_enums(struct module *mod) #ifdef CONFIG_TRACE_ENUM_MAP_FILE static void trace_module_remove_enums(struct module *mod) { - union trace_enum_map_item *map; - union trace_enum_map_item **last = &trace_enum_maps; + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; if (!mod->num_trace_evals) return; mutex_lock(&trace_enum_mutex); - map = trace_enum_maps; + map = trace_eval_maps; while (map) { if (map->head.mod == mod) -- cgit v1.3-14-g43fede From 1793ed939b2a0e18b06467e10d15b66925d75d5f Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:46 -0500 Subject: trace: rename trace_enum_mutex to trace_eval_mutex There is a lock protecting the trace_enum_map, rename it to reflect the use by more than enums. Link: http://lkml.kernel.org/r/20170531215653.3240-6-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 13c81f4f2bd7..e0279f5dc83f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -138,7 +138,7 @@ struct trace_eval_map_tail { const char *end; /* points to NULL */ }; -static DEFINE_MUTEX(trace_enum_mutex); +static DEFINE_MUTEX(trace_eval_mutex); /* * The trace_eval_maps are saved in an array with two extra elements, @@ -4785,7 +4785,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) union trace_eval_map_item *v; loff_t l = 0; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); v = trace_eval_maps; if (v) @@ -4800,7 +4800,7 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) static void enum_map_stop(struct seq_file *m, void *v) { - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static int enum_map_show(struct seq_file *m, void *v) @@ -4865,7 +4865,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -4890,7 +4890,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, } memset(map_array, 0, sizeof(*map_array)); - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } static void trace_create_enum_file(struct dentry *d_tracer) @@ -7768,7 +7768,7 @@ static void trace_module_remove_enums(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_enum_mutex); + mutex_lock(&trace_eval_mutex); map = trace_eval_maps; @@ -7785,7 +7785,7 @@ static void trace_module_remove_enums(struct module *mod) *last = trace_enum_jmp_to_tail(map)->tail.next; kfree(map); out: - mutex_unlock(&trace_enum_mutex); + mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_enums(struct module *mod) { } -- cgit v1.3-14-g43fede From 5f60b351a7e37bf243725bcc131708be2c8ea497 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:47 -0500 Subject: trace: rename trace.c enum functions Rename the init and trace_enum_jmp_to_tail() routines to reflect their use by more than enumerated types. Link: http://lkml.kernel.org/r/20170531215653.3240-7-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e0279f5dc83f..d703f429bbd9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4837,7 +4837,7 @@ static const struct file_operations tracing_enum_map_fops = { }; static inline union trace_eval_map_item * -trace_enum_jmp_to_tail(union trace_eval_map_item *ptr) +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) { /* Return tail of array given the head */ return ptr + ptr->head.length + 1; @@ -4872,7 +4872,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, else { ptr = trace_eval_maps; for (;;) { - ptr = trace_enum_jmp_to_tail(ptr); + ptr = trace_eval_jmp_to_tail(ptr); if (!ptr->tail.next) break; ptr = ptr->tail.next; @@ -7735,7 +7735,7 @@ struct dentry *tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_enum_init(void) +static void __init trace_eval_init(void) { int len; @@ -7775,14 +7775,14 @@ static void trace_module_remove_enums(struct module *mod) while (map) { if (map->head.mod == mod) break; - map = trace_enum_jmp_to_tail(map); + map = trace_eval_jmp_to_tail(map); last = &map->tail.next; map = map->tail.next; } if (!map) goto out; - *last = trace_enum_jmp_to_tail(map)->tail.next; + *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); out: mutex_unlock(&trace_eval_mutex); @@ -7839,7 +7839,7 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); - trace_enum_init(); + trace_eval_init(); trace_create_enum_file(d_tracer); -- cgit v1.3-14-g43fede From f57a41434fc51732dd5e35e0e1aa9e607f1a05d6 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:48 -0500 Subject: trace: rename enum_map functions Rename the core trace enum routines to use eval, to reflect their use by more than just enum to value mapping. Link: http://lkml.kernel.org/r/20170531215653.3240-8-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 74 ++++++++++++++++++++++----------------------- kernel/trace/trace.h | 4 +-- kernel/trace/trace_events.c | 2 +- 3 files changed, 40 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d703f429bbd9..d830be7f0ba6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_map items. + * pointer to the next array of saved enum_eval/enum_map items. */ union trace_eval_map_item { struct trace_eval_map map; @@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) /* * TRACE_FLAGS is defined as a tuple matching bit masks with strings. - * It uses C(a, b) where 'a' is the enum name and 'b' is the string that + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list - * of strings in the order that the enums were defined. + * of strings in the order that the evals (enum) were defined. */ #undef C #define C(a, b) b @@ -4746,7 +4746,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { #ifdef CONFIG_TRACE_ENUM_MAP_FILE static union trace_eval_map_item * -update_enum_map(union trace_eval_map_item *ptr) +update_eval_map(union trace_eval_map_item *ptr) { if (!ptr->map.eval_string) { if (ptr->tail.next) { @@ -4759,7 +4759,7 @@ update_enum_map(union trace_eval_map_item *ptr) return ptr; } -static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) { union trace_eval_map_item *ptr = v; @@ -4767,7 +4767,7 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) * Paranoid! If ptr points to end, we don't want to increment past it. * This really should never happen. */ - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); if (WARN_ON_ONCE(!ptr)) return NULL; @@ -4775,12 +4775,12 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - ptr = update_enum_map(ptr); + ptr = update_eval_map(ptr); return ptr; } -static void *enum_map_start(struct seq_file *m, loff_t *pos) +static void *eval_map_start(struct seq_file *m, loff_t *pos) { union trace_eval_map_item *v; loff_t l = 0; @@ -4792,18 +4792,18 @@ static void *enum_map_start(struct seq_file *m, loff_t *pos) v++; while (v && l < *pos) { - v = enum_map_next(m, v, &l); + v = eval_map_next(m, v, &l); } return v; } -static void enum_map_stop(struct seq_file *m, void *v) +static void eval_map_stop(struct seq_file *m, void *v) { mutex_unlock(&trace_eval_mutex); } -static int enum_map_show(struct seq_file *m, void *v) +static int eval_map_show(struct seq_file *m, void *v) { union trace_eval_map_item *ptr = v; @@ -4814,23 +4814,23 @@ static int enum_map_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations tracing_enum_map_seq_ops = { - .start = enum_map_start, - .next = enum_map_next, - .stop = enum_map_stop, - .show = enum_map_show, +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, }; -static int tracing_enum_map_open(struct inode *inode, struct file *filp) +static int tracing_eval_map_open(struct inode *inode, struct file *filp) { if (tracing_disabled) return -ENODEV; - return seq_open(filp, &tracing_enum_map_seq_ops); + return seq_open(filp, &tracing_eval_map_seq_ops); } -static const struct file_operations tracing_enum_map_fops = { - .open = tracing_enum_map_open, +static const struct file_operations tracing_eval_map_fops = { + .open = tracing_eval_map_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -4844,7 +4844,7 @@ trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) } static void -trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **stop; @@ -4861,7 +4861,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warn("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace eval mapping\n"); return; } @@ -4893,19 +4893,19 @@ trace_insert_enum_map_file(struct module *mod, struct trace_eval_map **start, mutex_unlock(&trace_eval_mutex); } -static void trace_create_enum_file(struct dentry *d_tracer) +static void trace_create_eval_file(struct dentry *d_tracer) { trace_create_file("enum_map", 0444, d_tracer, - NULL, &tracing_enum_map_fops); + NULL, &tracing_eval_map_fops); } #else /* CONFIG_TRACE_ENUM_MAP_FILE */ -static inline void trace_create_enum_file(struct dentry *d_tracer) { } -static inline void trace_insert_enum_map_file(struct module *mod, +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ -static void trace_insert_enum_map(struct module *mod, +static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) { struct trace_eval_map **map; @@ -4915,9 +4915,9 @@ static void trace_insert_enum_map(struct module *mod, map = start; - trace_event_enum_update(map, len); + trace_event_eval_update(map, len); - trace_insert_enum_map_file(mod, start, len); + trace_insert_eval_map_file(mod, start, len); } static ssize_t @@ -7740,11 +7740,11 @@ static void __init trace_eval_init(void) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_enum_map(NULL, __start_ftrace_eval_maps, len); + trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } #ifdef CONFIG_MODULES -static void trace_module_add_enums(struct module *mod) +static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) return; @@ -7756,11 +7756,11 @@ static void trace_module_add_enums(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_enum_map(mod, mod->trace_evals, mod->num_trace_evals); + trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_ENUM_MAP_FILE -static void trace_module_remove_enums(struct module *mod) +static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; union trace_eval_map_item **last = &trace_eval_maps; @@ -7788,7 +7788,7 @@ static void trace_module_remove_enums(struct module *mod) mutex_unlock(&trace_eval_mutex); } #else -static inline void trace_module_remove_enums(struct module *mod) { } +static inline void trace_module_remove_evals(struct module *mod) { } #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, @@ -7798,10 +7798,10 @@ static int trace_module_notify(struct notifier_block *self, switch (val) { case MODULE_STATE_COMING: - trace_module_add_enums(mod); + trace_module_add_evals(mod); break; case MODULE_STATE_GOING: - trace_module_remove_enums(mod); + trace_module_remove_evals(mod); break; } @@ -7841,7 +7841,7 @@ static __init int tracer_init_tracefs(void) trace_eval_init(); - trace_create_enum_file(d_tracer); + trace_create_eval_file(d_tracer); #ifdef CONFIG_MODULES register_module_notifier(&trace_module_nb); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a9667297ae49..69a3ab3ee4f5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1773,10 +1773,10 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_enum_update(struct trace_eval_map **map, int len); +void trace_event_eval_update(struct trace_eval_map **map, int len); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_enum_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif extern struct trace_iterator *tracepoint_print_iter; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cf5b9aa4d732..e6897b005947 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2165,7 +2165,7 @@ static void update_event_printk(struct trace_event_call *call, } } -void trace_event_enum_update(struct trace_eval_map **map, int len) +void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; -- cgit v1.3-14-g43fede From 67ec0d85955630924b971e04c0954370a74b8706 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:49 -0500 Subject: tracing: Rename enum_replace to eval_replace The enum_replace stanza works as is for sizeof() calls as well as enums. Rename it as well. Link: http://lkml.kernel.org/r/20170531215653.3240-9-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e6897b005947..83dfd0dbbbfe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2067,12 +2067,12 @@ __register_event(struct trace_event_call *call, struct module *mod) return 0; } -static char *enum_replace(char *ptr, struct trace_eval_map *map, int len) +static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) { int rlen; int elen; - /* Find the length of the enum value as a string */ + /* Find the length of the eval value as a string */ elen = snprintf(ptr, 0, "%ld", map->eval_value); /* Make sure there's enough room to replace the string with the value */ if (len < elen) @@ -2127,14 +2127,14 @@ static void update_event_printk(struct trace_event_call *call, if (isalpha(*ptr) || *ptr == '_') { if (strncmp(map->eval_string, ptr, len) == 0 && !isalnum(ptr[len]) && ptr[len] != '_') { - ptr = enum_replace(ptr, map, len); - /* Hmm, enum string smaller than value */ + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ if (WARN_ON_ONCE(!ptr)) return; /* - * No need to decrement here, as enum_replace() + * No need to decrement here, as eval_replace() * returns the pointer to the character passed - * the enum, and two enums can not be placed + * the eval, and two evals can not be placed * back to back without something in between. * We can skip that something in between. */ -- cgit v1.3-14-g43fede From 681bec0367c2606b6154060310a2ffa543175980 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 31 May 2017 16:56:53 -0500 Subject: tracing: Rename update the enum_map file The enum_map file is used to display a list of symbol to name conversions. As its now used to resolve sizeof lets update the name and description. Link: http://lkml.kernel.org/r/20170531215653.3240-13-jeremy.linton@arm.com Signed-off-by: Jeremy Linton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 22 +++++++++++----------- kernel/trace/trace.c | 20 ++++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e06f04e98fe..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N -config TRACE_ENUM_MAP_FILE - bool "Show enum mappings for trace events" +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" depends on TRACING help - The "print fmt" of the trace events will show the enum names instead - of their values. This can cause problems for user space tools that - use this string to parse the raw data as user space does not know + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. This can cause problems for user space tools + that use this string to parse the raw data as user space does not know how to convert the string to its value. To fix this, there's a special macro in the kernel that can be used - to convert the enum into its value. If this macro is used, then the - print fmt strings will have the enums converted to their values. + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. If something does not get converted properly, this option can be - used to show what enums the kernel tried to convert. + used to show what enums/sizeof the kernel tried to convert. - This option is for debugging the enum conversions. A file is created - in the tracing directory called "enum_map" that will show the enum + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the names matched with their values and what trace event system they belong too. Normally, the mapping of the strings to values will be freed after boot up or module load. With this option, they will not be freed, as - they are needed for the "enum_map" file. Enabling this option will + they are needed for the "eval_map" file. Enabling this option will increase the memory footprint of the running kernel. If unsure, say N diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d830be7f0ba6..19ac2088d10a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -120,8 +120,8 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE -/* Map of enums to their values, for "enum_map" file */ +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ struct trace_eval_map_head { struct module *mod; unsigned long length; @@ -145,7 +145,7 @@ static DEFINE_MUTEX(trace_eval_mutex); * one at the beginning, and one at the end. The beginning item contains * the count of the saved maps (head.length), and the module they * belong to if not built in (head.mod). The ending item contains a - * pointer to the next array of saved enum_eval/enum_map items. + * pointer to the next array of saved eval_map items. */ union trace_eval_map_item { struct trace_eval_map map; @@ -154,7 +154,7 @@ union trace_eval_map_item { }; static union trace_eval_map_item *trace_eval_maps; -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); @@ -4744,7 +4744,7 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { .write = tracing_saved_cmdlines_size_write, }; -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) { @@ -4895,15 +4895,15 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("enum_map", 0444, d_tracer, + trace_create_file("eval_map", 0444, d_tracer, NULL, &tracing_eval_map_fops); } -#else /* CONFIG_TRACE_ENUM_MAP_FILE */ +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ static inline void trace_create_eval_file(struct dentry *d_tracer) { } static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } -#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ static void trace_insert_eval_map(struct module *mod, struct trace_eval_map **start, int len) @@ -7759,7 +7759,7 @@ static void trace_module_add_evals(struct module *mod) trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } -#ifdef CONFIG_TRACE_ENUM_MAP_FILE +#ifdef CONFIG_TRACE_EVAL_MAP_FILE static void trace_module_remove_evals(struct module *mod) { union trace_eval_map_item *map; @@ -7789,7 +7789,7 @@ static void trace_module_remove_evals(struct module *mod) } #else static inline void trace_module_remove_evals(struct module *mod) { } -#endif /* CONFIG_TRACE_ENUM_MAP_FILE */ +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) -- cgit v1.3-14-g43fede From 86a9c446c13ecd8793ea8599761322aed125d542 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:26 +0100 Subject: posix-cpu-timers: Move copyout of timespec into do_cpu_nanosleep() The posix-cpu-timer nanosleep() implementation can be simplified by moving the copy out of the remaining time to do_cpu_nanosleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated has to be stored in the restart block anyway. Instead of storing it only in the restart case, store it before calling do_cpu_nanosleep() and copy the remaining time in the signal exit path. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-1-viro@ZenIV.linux.org.uk --- kernel/time/posix-cpu-timers.c | 63 +++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb4a4eb44279..239fff980fd0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1226,9 +1226,10 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct itimerspec64 *it) + struct timespec64 *rqtp) { struct k_itimer timer; + struct itimerspec64 it; int error; /* @@ -1242,12 +1243,14 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec64 zero_it; + struct restart_block *restart = ¤t->restart_block; + struct timespec __user *rmtp; - memset(it, 0, sizeof *it); - it->it_value = *rqtp; + memset(&it, 0, sizeof it); + it.it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); + error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1277,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ *rqtp = ns_to_timespec64(timer.it.cpu.expires); - error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* * Timer is now unarmed, deletion can not fail. @@ -1297,7 +1300,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_unlock_irq(&timer.it_lock); } - if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { /* * It actually did fire already. */ @@ -1305,6 +1308,18 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, } error = -ERESTART_RESTARTBLOCK; + /* + * Report back to the user the time still remaining. + */ + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + struct timespec ts; + + ts = timespec64_to_timespec(it.it_value); + if (copy_to_user(rmtp, &ts, sizeof(*rmtp))) + return -EFAULT; + } + restart->nanosleep.expires = timespec64_to_ns(rqtp); } return error; @@ -1316,10 +1331,13 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec64 it; - struct timespec ts; int error; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + + restart_block->nanosleep.rmtp = rmtp; + /* * Diagnose required errors first. */ @@ -1328,23 +1346,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) return -EINVAL; - error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + error = do_cpu_nanosleep(which_clock, flags, rqtp); if (error == -ERESTART_RESTARTBLOCK) { if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; - /* - * Report back to the user the time still remaining. - */ - ts = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) - return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; - restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1352,28 +1362,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct itimerspec64 it; struct timespec64 t; - struct timespec tmp; - int error; t = ns_to_timespec64(restart_block->nanosleep.expires); - error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); - - if (error == -ERESTART_RESTARTBLOCK) { - struct timespec __user *rmtp = restart_block->nanosleep.rmtp; - /* - * Report back to the user the time still remaining. - */ - tmp = timespec64_to_timespec(it.it_value); - if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) - return -EFAULT; - - restart_block->nanosleep.expires = timespec64_to_ns(&t); - } - return error; - + return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -- cgit v1.3-14-g43fede From 15f27ce24cb613e6e01ce27c4094c55e55dde5d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:27 +0100 Subject: alarmtimer: Move copyout and freeze handling into alarmtimer_do_nsleep() The alarmtimer nanosleep() implementation can be simplified by moving the copy out of the remaining time to alarmtimer_do_nsleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated has to be stored in the restart block anyway. Instead of storing it only in the restart case, store it before calling alarmtimer_do_nsleep() and copy the remaining time in the signal exit path. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-2-viro@ZenIV.linux.org.uk --- kernel/time/alarmtimer.c | 102 +++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index d8a7a7e214de..ac6e9bc6cc59 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -688,8 +688,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, * * Sets the alarm timer and sleeps until it is fired or interrupted. */ -static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) +static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, + enum alarmtimer_type type) { + struct timespec __user *rmtp; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); @@ -702,36 +704,26 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) __set_current_state(TASK_RUNNING); - return (alarm->data == NULL); -} - - -/** - * update_rmtp - Update remaining timespec value - * @exp: expiration time - * @type: timer type - * @rmtp: user pointer to remaining timepsec value - * - * Helper function that fills in rmtp value with time between - * now and the exp value - */ -static int update_rmtp(ktime_t exp, enum alarmtimer_type type, - struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = ktime_sub(exp, alarm_bases[type].gettime()); - - if (rem <= 0) + if (!alarm->data) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; + if (freezing(current)) + alarmtimer_freezerset(absexp, type); + rmtp = current->restart_block.nanosleep.rmtp; + if (rmtp) { + struct timespec rmt; + ktime_t rem; - return 1; + rem = ktime_sub(absexp, alarm_bases[type].gettime()); + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + } + return -ERESTART_RESTARTBLOCK; } /** @@ -743,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { enum alarmtimer_type type = restart->nanosleep.clockid; - ktime_t exp; - struct timespec __user *rmtp; + ktime_t exp = restart->nanosleep.expires; struct alarm alarm; - int ret = 0; - exp = restart->nanosleep.expires; alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } - - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: - return ret; + return alarmtimer_do_nsleep(&alarm, exp, type); } /** @@ -785,11 +757,16 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); - struct restart_block *restart; + struct restart_block *restart = ¤t->restart_block; struct alarm alarm; ktime_t exp; int ret = 0; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + + restart->nanosleep.rmtp = rmtp; + if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -808,32 +785,17 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = ktime_add(now, exp); } - if (alarmtimer_do_nsleep(&alarm, exp)) - goto out; - - if (freezing(current)) - alarmtimer_freezerset(exp, type); + ret = alarmtimer_do_nsleep(&alarm, exp, type); + if (ret != -ERESTART_RESTARTBLOCK) + return ret; /* abs timers don't set remaining time or restart */ - if (flags == TIMER_ABSTIME) { - ret = -ERESTARTNOHAND; - goto out; - } - - if (rmtp) { - ret = update_rmtp(exp, type, rmtp); - if (ret <= 0) - goto out; - } + if (flags == TIMER_ABSTIME) + return -ERESTARTNOHAND; - restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp; - restart->nanosleep.rmtp = rmtp; - ret = -ERESTART_RESTARTBLOCK; - -out: return ret; } -- cgit v1.3-14-g43fede From 192a82f9003fe8fabd6088aa646e829225a94c55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:28 +0100 Subject: hrtimer_nanosleep(): Pass rmtp in restart_block Store the pointer to the timespec which gets updated with the remaining time in the restart block and remove the function argument. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-3-viro@ZenIV.linux.org.uk --- include/linux/hrtimer.h | 1 - kernel/compat.c | 6 +++--- kernel/time/hrtimer.c | 11 ++++++----- kernel/time/posix-stubs.c | 5 ++++- kernel/time/posix-timers.c | 5 ++++- 5 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8c5b10eb7265..b80c34f6fd4b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -453,7 +453,6 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ extern long hrtimer_nanosleep(struct timespec64 *rqtp, - struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid); extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..cc9ba9d29b47 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu64, - rmtp ? (struct timespec __user *)&rmt : NULL, - HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->restart_block.nanosleep.rmtp = + rmtp ? (struct timespec __user *)&rmt : NULL; + ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); /* diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ac053bb5296e..4ae777f159de 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1503,10 +1503,11 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { - struct restart_block *restart; + struct restart_block *restart = ¤t->restart_block; + struct timespec __user *rmtp; struct hrtimer_sleeper t; int ret = 0; u64 slack; @@ -1526,16 +1527,15 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, goto out; } + rmtp = restart->nanosleep.rmtp; if (rmtp) { ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) goto out; } - restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); ret = -ERESTART_RESTARTBLOCK; @@ -1557,7 +1557,8 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index c0cd53eb018a..156a5e6f3bd2 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -115,7 +115,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, t64 = timespec_to_timespec64(t); if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 58c0f60b132f..1a9f59f8afc2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1043,7 +1043,10 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, struct timespec64 *tsave, struct timespec __user *rmtp) { - return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } -- cgit v1.3-14-g43fede From a7602681fc63f1a3ddd3da336296c9634c2ff974 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:29 +0100 Subject: hrtimer: Move copyout of remaining time to do_nanosleep() The hrtimer nanosleep() implementation can be simplified by moving the copy out of the remaining time to do_nanosleep() which is shared between the real nanosleep function and the restart function. The pointer to the timespec64 which is updated is already stored in the restart block at the call site, so the seperate handling of nanosleep and restart function can be avoided. [ tglx: Added changelog ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-4-viro@ZenIV.linux.org.uk --- kernel/time/hrtimer.c | 62 +++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ae777f159de..baa7b846b6e3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1441,6 +1441,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { + struct timespec __user *rmtp; hrtimer_init_sleeper(t, current); do { @@ -1457,48 +1458,33 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod __set_current_state(TASK_RUNNING); - return t->task == NULL; -} - -static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) -{ - struct timespec rmt; - ktime_t rem; - - rem = hrtimer_expires_remaining(timer); - if (rem <= 0) + if (!t->task) return 0; - rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - - return 1; + rmtp = current->restart_block.nanosleep.rmtp; + if (rmtp) { + struct timespec rmt; + ktime_t rem = hrtimer_expires_remaining(&t->timer); + if (rem <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + } + return -ERESTART_RESTARTBLOCK; } long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec __user *rmtp; - int ret = 0; + int ret; hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); - if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - goto out; - - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - - /* The other values in restart are already filled in */ - ret = -ERESTART_RESTARTBLOCK; -out: + ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } @@ -1506,8 +1492,7 @@ out: long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { - struct restart_block *restart = ¤t->restart_block; - struct timespec __user *rmtp; + struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; @@ -1518,7 +1503,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, hrtimer_init_on_stack(&t.timer, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); - if (do_nanosleep(&t, mode)) + ret = do_nanosleep(&t, mode); + if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ @@ -1527,18 +1513,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, goto out; } - rmtp = restart->nanosleep.rmtp; - if (rmtp) { - ret = update_rmtp(&t.timer, rmtp); - if (ret <= 0) - goto out; - } - + restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); - - ret = -ERESTART_RESTARTBLOCK; out: destroy_hrtimer_on_stack(&t.timer); return ret; -- cgit v1.3-14-g43fede From 99e6c0e6ec349575886ca7daffc9cb7ec583176f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:30 +0100 Subject: posix-timers: Store rmtp into restart_block in sys_clock_nanosleep() ... instead of doing that in every ->nsleep() instance Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-5-viro@ZenIV.linux.org.uk --- kernel/time/alarmtimer.c | 8 +------- kernel/time/posix-cpu-timers.c | 12 +++--------- kernel/time/posix-timers.c | 10 +++++----- kernel/time/posix-timers.h | 2 +- 4 files changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ac6e9bc6cc59..d859a3601ddd 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -753,8 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsreq, - struct timespec __user *rmtp) + struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); struct restart_block *restart = ¤t->restart_block; @@ -762,11 +761,6 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, ktime_t exp; int ret = 0; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - - restart->nanosleep.rmtp = rmtp; - if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 239fff980fd0..ec6258c9cde5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1328,16 +1328,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp) { struct restart_block *restart_block = ¤t->restart_block; int error; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - - restart_block->nanosleep.rmtp = rmtp; - /* * Diagnose required errors first. */ @@ -1388,10 +1383,9 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp, - struct timespec __user *rmtp) + struct timespec64 *rqtp) { - return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } static long process_cpu_nsleep_restart(struct restart_block *restart_block) { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1a9f59f8afc2..a3e5c01b430e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1041,11 +1041,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave) { - if (flags & TIMER_ABSTIME) - rmtp = NULL; - current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); @@ -1070,8 +1067,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, t64 = timespec_to_timespec64(t); if (!timespec64_valid(&t64)) return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64, rmtp); + return kc->nsleep(which_clock, flags, &t64); } /* diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index b086f5ba2f5b..bfd9e15c6ce0 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -10,7 +10,7 @@ struct k_clock { int (*clock_adj)(const clockid_t which_clock, struct timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, - struct timespec64 *, struct timespec __user *); + struct timespec64 *); long (*nsleep_restart)(struct restart_block *restart_block); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, -- cgit v1.3-14-g43fede From edbeda46322fbcb15af2d2d0f2daffb0cd349a5a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:31 +0100 Subject: time/posix-timers: Move the compat copyouts to the nanosleep implementations Turn restart_block.nanosleep.{rmtp,compat_rmtp} into a tagged union (kind = 1 -> native, kind = 2 -> compat, kind = 0 -> nothing) and make the places doing actual copyout handle compat as well as native (that will become a helper in the next commit). Result: compat wrappers, messing with reassignments, etc. are gone. [ tglx: Folded in a variant of Peter Zijlstras enum patch ] Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-6-viro@ZenIV.linux.org.uk --- include/linux/posix-timers.h | 2 - include/linux/restart_block.h | 15 ++++- kernel/compat.c | 131 ----------------------------------------- kernel/time/alarmtimer.c | 16 +++-- kernel/time/hrtimer.c | 42 +++++++++++-- kernel/time/posix-cpu-timers.c | 20 +++++-- kernel/time/posix-stubs.c | 55 +++++++++++++---- kernel/time/posix-timers.c | 32 +++++++--- 8 files changed, 142 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 667095dbcd37..29f1b7f09ced 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -110,8 +110,6 @@ void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); -long clock_nanosleep_restart(struct restart_block *restart_block); - void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); void posixtimer_rearm(struct siginfo *info); diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 0d905d8ec553..19df8422606c 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -11,6 +11,14 @@ struct timespec; struct compat_timespec; struct pollfd; +enum timespec_type { + TT_NONE = 0, + TT_NATIVE = 1, +#ifdef CONFIG_COMPAT + TT_COMPAT = 2, +#endif +}; + /* * System call restart block. */ @@ -29,10 +37,13 @@ struct restart_block { /* For nanosleep */ struct { clockid_t clockid; - struct timespec __user *rmtp; + enum timespec_type type; + union { + struct timespec __user *rmtp; #ifdef CONFIG_COMPAT - struct compat_timespec __user *compat_rmtp; + struct compat_timespec __user *compat_rmtp; #endif + }; u64 expires; } nanosleep; /* For poll */ diff --git a/kernel/compat.c b/kernel/compat.c index cc9ba9d29b47..23afa26f574b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -213,82 +213,6 @@ int compat_convert_timespec(struct timespec __user **kts, return 0; } -static long compat_nanosleep_restart(struct restart_block *restart) -{ - struct compat_timespec __user *rmtp; - struct timespec rmt; - mm_segment_t oldfs; - long ret; - - restart->nanosleep.rmtp = (struct timespec __user *) &rmt; - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = hrtimer_nanosleep_restart(restart); - set_fs(oldfs); - - if (ret == -ERESTART_RESTARTBLOCK) { - rmtp = restart->nanosleep.compat_rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - - return ret; -} - -COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) -{ - struct timespec tu, rmt; - struct timespec64 tu64; - mm_segment_t oldfs; - long ret; - - if (compat_get_timespec(&tu, rqtp)) - return -EFAULT; - - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) - return -EINVAL; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - current->restart_block.nanosleep.rmtp = - rmtp ? (struct timespec __user *)&rmt : NULL; - ret = hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); - set_fs(oldfs); - - /* - * hrtimer_nanosleep() can only return 0 or - * -ERESTART_RESTARTBLOCK here because: - * - * - we call it with HRTIMER_MODE_REL and therefor exclude the - * -ERESTARTNOHAND return path. - * - * - we supply the rmtp argument from the task stack (due to - * the necessary compat conversion. So the update cannot - * fail, which excludes the -EFAULT return path as well. If - * it fails nevertheless we have a bigger problem and wont - * reach this place anymore. - * - * - if the return value is 0, we do not have to update rmtp - * because there is no remaining time. - * - * We check for -ERESTART_RESTARTBLOCK nevertheless if the - * core implementation decides to return random nonsense. - */ - if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart = ¤t->restart_block; - - restart->fn = compat_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - - if (rmtp && compat_put_timespec(&rmt, rmtp)) - return -EFAULT; - } - return ret; -} - static inline long get_compat_itimerval(struct itimerval *o, struct compat_itimerval __user *i) { @@ -821,61 +745,6 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, return err; } -static long compat_clock_nanosleep_restart(struct restart_block *restart) -{ - long err; - mm_segment_t oldfs; - struct timespec tu; - struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; - - restart->nanosleep.rmtp = (struct timespec __user *) &tu; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = clock_nanosleep_restart(restart); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&tu, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct compat_timespec __user *, rqtp, - struct compat_timespec __user *, rmtp) -{ - long err; - mm_segment_t oldfs; - struct timespec in, out; - struct restart_block *restart; - - if (compat_get_timespec(&in, rqtp)) - return -EFAULT; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_nanosleep(which_clock, flags, - (struct timespec __user *) &in, - (struct timespec __user *) &out); - set_fs(oldfs); - - if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - compat_put_timespec(&out, rmtp)) - return -EFAULT; - - if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t->restart_block; - restart->fn = compat_clock_nanosleep_restart; - restart->nanosleep.compat_rmtp = rmtp; - } - return err; -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index d859a3601ddd..57bcf94ee132 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -691,7 +692,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, enum alarmtimer_type type) { - struct timespec __user *rmtp; + struct restart_block *restart; alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); @@ -709,8 +710,8 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, if (freezing(current)) alarmtimer_freezerset(absexp, type); - rmtp = current->restart_block.nanosleep.rmtp; - if (rmtp) { + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { struct timespec rmt; ktime_t rem; @@ -720,7 +721,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return 0; rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&rmt, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) return -EFAULT; } return -ERESTART_RESTARTBLOCK; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index baa7b846b6e3..5370da8fc0a4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -1441,7 +1442,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { - struct timespec __user *rmtp; + struct restart_block *restart; + hrtimer_init_sleeper(t, current); do { @@ -1461,15 +1463,23 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod if (!t->task) return 0; - rmtp = current->restart_block.nanosleep.rmtp; - if (rmtp) { - struct timespec rmt; + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); + struct timespec rmt; + if (rem <= 0) return 0; rmt = ktime_to_timespec(rem); - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&rmt, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) return -EFAULT; } return -ERESTART_RESTARTBLOCK; @@ -1535,10 +1545,32 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, if (!timespec64_valid(&tu64)) return -EINVAL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) +{ + struct timespec64 tu64; + struct timespec tu; + + if (compat_get_timespec(&tu, rqtp)) + return -EFAULT; + + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) + return -EINVAL; + + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} +#endif + /* * Functions related to boot-time initialization: */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ec6258c9cde5..1563ca22cf1f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "posix-timers.h" @@ -1243,10 +1244,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, timer.it_process = current; if (!error) { static struct itimerspec64 zero_it; - struct restart_block *restart = ¤t->restart_block; - struct timespec __user *rmtp; + struct restart_block *restart; - memset(&it, 0, sizeof it); + memset(&it, 0, sizeof(it)); it.it_value = *rqtp; spin_lock_irq(&timer.it_lock); @@ -1311,12 +1311,20 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - rmtp = restart->nanosleep.rmtp; - if (rmtp) { + restart = ¤t->restart_block; + if (restart->nanosleep.type != TT_NONE) { struct timespec ts; ts = timespec64_to_timespec(it.it_value); - if (copy_to_user(rmtp, &ts, sizeof(*rmtp))) +#ifdef CONFIG_COMPAT + if (restart->nanosleep.type == TT_COMPAT) { + if (compat_put_timespec(&ts, + restart->nanosleep.compat_rmtp)) + return -EFAULT; + } else +#endif + if (copy_to_user(restart->nanosleep.rmtp, &ts, + sizeof(ts))) return -EFAULT; } restart->nanosleep.expires = timespec64_to_ns(rqtp); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 156a5e6f3bd2..749b76f2d757 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,6 +17,7 @@ #include #include #include +#include asmlinkage long sys_ni_posix_timers(void) { @@ -110,25 +111,53 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) - return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) - return -EINVAL; - if (flags & TIMER_ABSTIME) - rmtp = NULL; - current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); + break; default: return -EINVAL; } + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; + current->restart_block.nanosleep.rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #ifdef CONFIG_COMPAT -long clock_nanosleep_restart(struct restart_block *restart_block) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { - return hrtimer_nanosleep_restart(restart_block); + struct timespec64 t64; + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + break; + default: + return -EINVAL; + } + + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } #endif diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a3e5c01b430e..bec86b6b9814 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "timekeeping.h" #include "posix-timers.h" @@ -1069,25 +1070,40 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return kc->nsleep(which_clock, flags, &t64); } -/* - * This will restart clock_nanosleep. This is required only by - * compat_clock_nanosleep_restart for now. - */ -long clock_nanosleep_restart(struct restart_block *restart_block) +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { - clockid_t which_clock = restart_block->nanosleep.clockid; const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; + struct timespec t; - if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; + + if (compat_get_timespec(&t, rqtp)) + return -EFAULT; - return kc->nsleep_restart(restart_block); + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) + return -EINVAL; + if (flags & TIMER_ABSTIME) + rmtp = NULL; + current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; + current->restart_block.nanosleep.compat_rmtp = rmtp; + + return kc->nsleep(which_clock, flags, &t64); } +#endif static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, -- cgit v1.3-14-g43fede From ce41aaf47af3d28c4c958e07675a3e0a51f09bd3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:32 +0100 Subject: hrtimers/posix-timers: Merge nanosleep timespec copyout logics into a new helper Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-7-viro@ZenIV.linux.org.uk --- include/linux/hrtimer.h | 2 ++ kernel/time/alarmtimer.c | 10 +--------- kernel/time/hrtimer.c | 29 ++++++++++++++++++++--------- kernel/time/posix-cpu-timers.c | 13 ++----------- 4 files changed, 25 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index b80c34f6fd4b..38b968f3df4e 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -452,6 +452,8 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, } /* Precise sleep: */ + +extern int nanosleep_copyout(struct restart_block *, struct timespec *); extern long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 57bcf94ee132..7bed4e44f9bd 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -721,15 +721,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, return 0; rmt = ktime_to_timespec(rem); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&rmt, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) - return -EFAULT; + return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5370da8fc0a4..db2f5f7b4ba5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1440,6 +1440,25 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); +int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +{ + switch(restart->nanosleep.type) { +#ifdef CONFIG_COMPAT + case TT_COMPAT: + if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + return -EFAULT; + break; +#endif + case TT_NATIVE: + if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + return -EFAULT; + break; + default: + BUG(); + } + return -ERESTART_RESTARTBLOCK; +} + static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; @@ -1472,15 +1491,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return 0; rmt = ktime_to_timespec(rem); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&rmt, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &rmt, sizeof(rmt))) - return -EFAULT; + return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1563ca22cf1f..993a924d1399 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1312,22 +1312,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; + restart->nanosleep.expires = timespec64_to_ns(rqtp); if (restart->nanosleep.type != TT_NONE) { struct timespec ts; ts = timespec64_to_timespec(it.it_value); -#ifdef CONFIG_COMPAT - if (restart->nanosleep.type == TT_COMPAT) { - if (compat_put_timespec(&ts, - restart->nanosleep.compat_rmtp)) - return -EFAULT; - } else -#endif - if (copy_to_user(restart->nanosleep.rmtp, &ts, - sizeof(ts))) - return -EFAULT; + error = nanosleep_copyout(restart, &ts); } - restart->nanosleep.expires = timespec64_to_ns(rqtp); } return error; -- cgit v1.3-14-g43fede From fb923c4a3c2ee735755d4a93522150fc35d0ecbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:33 +0100 Subject: posix-timers: Kill ->nsleep_restart() No more users. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-8-viro@ZenIV.linux.org.uk --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 6 ------ kernel/time/posix-timers.c | 4 ---- kernel/time/posix-timers.h | 1 - 5 files changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 38b968f3df4e..d83b7ed1cb0e 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -457,7 +457,6 @@ extern int nanosleep_copyout(struct restart_block *, struct timespec *); extern long hrtimer_nanosleep(struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); -extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index db2f5f7b4ba5..45f83cc7c0c7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1496,7 +1496,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return -ERESTART_RESTARTBLOCK; } -long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 993a924d1399..515148d4eeb1 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1386,10 +1386,6 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } -static long process_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} static int thread_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { @@ -1412,7 +1408,6 @@ const struct k_clock clock_posix_cpu = { .clock_get = posix_cpu_clock_get, .timer_create = posix_cpu_timer_create, .nsleep = posix_cpu_nsleep, - .nsleep_restart = posix_cpu_nsleep_restart, .timer_set = posix_cpu_timer_set, .timer_del = posix_cpu_timer_del, .timer_get = posix_cpu_timer_get, @@ -1424,7 +1419,6 @@ const struct k_clock clock_process = { .clock_get = process_cpu_clock_get, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, }; const struct k_clock clock_thread = { diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index bec86b6b9814..ea4a463436bf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1111,7 +1111,6 @@ static const struct k_clock clock_realtime = { .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1127,7 +1126,6 @@ static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1158,7 +1156,6 @@ static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1174,7 +1171,6 @@ static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, - .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index bfd9e15c6ce0..5e69bb85629f 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -11,7 +11,6 @@ struct k_clock { int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, struct timespec64 *); - long (*nsleep_restart)(struct restart_block *restart_block); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); -- cgit v1.3-14-g43fede From 3a4d44b6162555070194e486ff6b3799a8d323a2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:34 +0100 Subject: ntp: Move adjtimex related compat syscalls to native counterparts Get rid of set_fs() mess and sanitize compat_{get,put}_timex(), while we are at it. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-9-viro@ZenIV.linux.org.uk --- include/linux/compat.h | 4 ++ kernel/compat.c | 141 +++++++++++++++++---------------------------- kernel/time/posix-stubs.c | 2 + kernel/time/posix-timers.c | 27 +++++++++ kernel/time/time.c | 24 +++++++- 5 files changed, 108 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c5f3152cbb5..ecb8dd261d36 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -128,6 +128,10 @@ struct compat_timex { compat_int_t:32; compat_int_t:32; compat_int_t:32; }; +struct timex; +int compat_get_timex(struct timex *, const struct compat_timex __user *); +int compat_put_timex(struct compat_timex __user *, const struct timex *); + #define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) typedef struct { diff --git a/kernel/compat.c b/kernel/compat.c index 23afa26f574b..97087b333543 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,60 +30,64 @@ #include -static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) { - memset(txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc->modes, &utp->modes) || - __get_user(txc->offset, &utp->offset) || - __get_user(txc->freq, &utp->freq) || - __get_user(txc->maxerror, &utp->maxerror) || - __get_user(txc->esterror, &utp->esterror) || - __get_user(txc->status, &utp->status) || - __get_user(txc->constant, &utp->constant) || - __get_user(txc->precision, &utp->precision) || - __get_user(txc->tolerance, &utp->tolerance) || - __get_user(txc->time.tv_sec, &utp->time.tv_sec) || - __get_user(txc->time.tv_usec, &utp->time.tv_usec) || - __get_user(txc->tick, &utp->tick) || - __get_user(txc->ppsfreq, &utp->ppsfreq) || - __get_user(txc->jitter, &utp->jitter) || - __get_user(txc->shift, &utp->shift) || - __get_user(txc->stabil, &utp->stabil) || - __get_user(txc->jitcnt, &utp->jitcnt) || - __get_user(txc->calcnt, &utp->calcnt) || - __get_user(txc->errcnt, &utp->errcnt) || - __get_user(txc->stbcnt, &utp->stbcnt)) + struct compat_timex tx32; + + if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) return -EFAULT; + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + return 0; } -static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) -{ - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc->modes, &utp->modes) || - __put_user(txc->offset, &utp->offset) || - __put_user(txc->freq, &utp->freq) || - __put_user(txc->maxerror, &utp->maxerror) || - __put_user(txc->esterror, &utp->esterror) || - __put_user(txc->status, &utp->status) || - __put_user(txc->constant, &utp->constant) || - __put_user(txc->precision, &utp->precision) || - __put_user(txc->tolerance, &utp->tolerance) || - __put_user(txc->time.tv_sec, &utp->time.tv_sec) || - __put_user(txc->time.tv_usec, &utp->time.tv_usec) || - __put_user(txc->tick, &utp->tick) || - __put_user(txc->ppsfreq, &utp->ppsfreq) || - __put_user(txc->jitter, &utp->jitter) || - __put_user(txc->shift, &utp->shift) || - __put_user(txc->stabil, &utp->stabil) || - __put_user(txc->jitcnt, &utp->jitcnt) || - __put_user(txc->calcnt, &utp->calcnt) || - __put_user(txc->errcnt, &utp->errcnt) || - __put_user(txc->stbcnt, &utp->stbcnt) || - __put_user(txc->tai, &utp->tai)) +int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) +{ + struct compat_timex tx32; + + memset(&tx32, 0, sizeof(struct compat_timex)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) return -EFAULT; return 0; } @@ -705,29 +709,6 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) -{ - struct timex txc; - mm_segment_t oldfs; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); - set_fs(oldfs); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { @@ -944,24 +925,6 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) -{ - struct timex txc; - int err, ret; - - err = compat_get_timex(&txc, utp); - if (err) - return err; - - ret = do_adjtimex(&txc); - - err = compat_put_timex(utp, &txc); - if (err) - return err; - - return ret; -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 749b76f2d757..954d1d8ff9e6 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -28,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void) } #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) +#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) SYS_NI(timer_create); SYS_NI(timer_gettime); @@ -40,6 +41,7 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif +COMPAT_SYS_NI(clock_adjtime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ea4a463436bf..b1b6d52d6425 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1018,6 +1018,33 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + err = compat_get_timex(&ktx, utp); + if (err) + return err; + + err = kc->clock_adj(which_clock, &ktx); + + if (err >= 0) + err = compat_put_timex(utp, &ktx); + + return err; +} +#endif + SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { diff --git a/kernel/time/time.c b/kernel/time/time.c index 49c73c6ed648..400662f16c5a 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -224,12 +225,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) * structure. But bear in mind that the structures * may change */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct timex))) return -EFAULT; ret = do_adjtimex(&txc); return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } +#ifdef CONFIG_COMPAT + +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +{ + struct timex txc; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + ret = do_adjtimex(&txc); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} +#endif + /* * Convert jiffies to milliseconds and back. * -- cgit v1.3-14-g43fede From 1acbe7708b0313b33287bb4ffcbf26462ea3c588 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:35 +0100 Subject: posix-timers: Take compat timer_settime(2) to native one ... and get rid of set_fs() in there Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-10-viro@ZenIV.linux.org.uk --- kernel/compat.c | 23 ------------ kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 87 +++++++++++++++++++++++++++++++++------------- 3 files changed, 64 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 97087b333543..df39e2e00c47 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -635,29 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct compat_itimerspec __user *, new, - struct compat_itimerspec __user *, old) -{ - long err; - mm_segment_t oldfs; - struct itimerspec newts, oldts; - - if (!new) - return -EINVAL; - if (get_compat_itimerspec(&newts, new)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_settime(timer_id, flags, - (struct itimerspec __user *) &newts, - (struct itimerspec __user *) &oldts); - set_fs(oldfs); - if (!err && old && put_compat_itimerspec(old, &oldts)) - return -EFAULT; - return err; -} - COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 954d1d8ff9e6..ad263df132d6 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -42,6 +42,7 @@ SYS_NI(setitimer); SYS_NI(alarm); #endif COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b1b6d52d6425..a73feac191f9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -819,31 +819,21 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -/* Set a POSIX.1b interval timer */ -SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - const struct itimerspec __user *, new_setting, - struct itimerspec __user *, old_setting) +static int do_timer_settime(timer_t timer_id, int flags, + struct itimerspec64 *new_spec64, + struct itimerspec64 *old_spec64) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec, old_spec; + const struct k_clock *kc; struct k_itimer *timr; unsigned long flag; - const struct k_clock *kc; int error = 0; - if (!new_setting) + if (!timespec64_valid(&new_spec64->it_interval) || + !timespec64_valid(&new_spec64->it_value)) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) - return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - - if (!timespec64_valid(&new_spec64.it_interval) || - !timespec64_valid(&new_spec64.it_value)) - return -EINVAL; - if (rtn) - memset(rtn, 0, sizeof(*rtn)); + if (old_spec64) + memset(old_spec64, 0, sizeof(*old_spec64)); retry: timr = lock_timer(timer_id, &flag); if (!timr) @@ -853,22 +843,71 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec64, rtn); + error = kc->timer_set(timr, flags, new_spec64, old_spec64); unlock_timer(timr, flag); if (error == TIMER_RETRY) { - rtn = NULL; // We already got the old time... + old_spec64 = NULL; // We already got the old time... goto retry; } - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (old_setting && !error && - copy_to_user(old_setting, &old_spec, sizeof (old_spec))) - error = -EFAULT; + return error; +} + +/* Set a POSIX.1b interval timer */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new_setting) + return -EINVAL; + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old_setting) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + } return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) +{ + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old ? &old_spec64 : NULL; + struct itimerspec new_spec; + int error = 0; + + if (!new) + return -EINVAL; + if (get_compat_itimerspec(&new_spec, new)) + return -EFAULT; + + new_spec64 = itimerspec_to_itimerspec64(&new_spec); + error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + if (!error && old) { + struct itimerspec old_spec; + old_spec = itimerspec64_to_itimerspec(&old_spec64); + if (put_compat_itimerspec(old, &old_spec)) + error = -EFAULT; + } + return error; +} +#endif + int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; -- cgit v1.3-14-g43fede From b0dc12426ec404de99d7e75a12a22d9201d90914 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:36 +0100 Subject: posix-timers: Take compat timer_gettime(2) to native one ... and get rid of set_fs() in there Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-11-viro@ZenIV.linux.org.uk --- kernel/compat.c | 17 ----------------- kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 43 ++++++++++++++++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index df39e2e00c47..1fb8cf7e691e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -635,23 +635,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct compat_itimerspec __user *, setting) -{ - long err; - mm_segment_t oldfs; - struct itimerspec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_timer_gettime(timer_id, - (struct itimerspec __user *) &ts); - set_fs(oldfs); - if (!err && put_compat_itimerspec(setting, &ts)) - return -EFAULT; - return err; -} - COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) { diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index ad263df132d6..f4a1962d1729 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -43,6 +43,7 @@ SYS_NI(alarm); #endif COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index a73feac191f9..e82bb1fd614e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -690,11 +690,8 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) } /* Get the time remaining on a POSIX.1b interval timer. */ -SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct itimerspec __user *, setting) +static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - struct itimerspec64 cur_setting64; - struct itimerspec cur_setting; struct k_itimer *timr; const struct k_clock *kc; unsigned long flags; @@ -704,21 +701,49 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (!timr) return -EINVAL; - memset(&cur_setting64, 0, sizeof(cur_setting64)); + memset(setting, 0, sizeof(*setting)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting64); + kc->timer_get(timr, setting); unlock_timer(timr, flags); + return ret; +} - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) - return -EFAULT; +/* Get the time remaining on a POSIX.1b interval timer. */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + ret = -EFAULT; + } + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) +{ + struct itimerspec64 cur_setting64; + + int ret = do_timer_gettime(timer_id, &cur_setting64); + if (!ret) { + struct itimerspec cur_setting; + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); + if (put_compat_itimerspec(setting, &cur_setting)) + ret = -EFAULT; + } return ret; } +#endif /* * Get the number of overruns of a POSIX.1b interval timer. This is to -- cgit v1.3-14-g43fede From 54ad9c46c262ce4a603dc7887e37956896a0211d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:37 +0100 Subject: itimers: Move compat itimer syscalls to native ones get rid of set_fs(), sanitize compat copyin/copyout. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-12-viro@ZenIV.linux.org.uk --- include/linux/compat.h | 4 +++ kernel/compat.c | 69 +++++++++++------------------------------------ kernel/time/itimer.c | 38 ++++++++++++++++++++++++++ kernel/time/posix-stubs.c | 2 ++ 4 files changed, 60 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index ecb8dd261d36..425563c7647b 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -94,6 +94,10 @@ struct compat_itimerval { struct compat_timeval it_value; }; +struct itimerval; +int get_compat_itimerval(struct itimerval *, const struct compat_itimerval __user *); +int put_compat_itimerval(struct compat_itimerval __user *, const struct itimerval *); + struct compat_tms { compat_clock_t tms_utime; compat_clock_t tms_stime; diff --git a/kernel/compat.c b/kernel/compat.c index 1fb8cf7e691e..c349417d2c40 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -217,65 +217,28 @@ int compat_convert_timespec(struct timespec __user **kts, return 0; } -static inline long get_compat_itimerval(struct itimerval *o, - struct compat_itimerval __user *i) +int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) { - return (!access_ok(VERIFY_READ, i, sizeof(*i)) || - (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | - __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | - __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | - __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); -} - -static inline long put_compat_itimerval(struct compat_itimerval __user *o, - struct itimerval *i) -{ - return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || - (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | - __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | - __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | - __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); -} - -asmlinkage long sys_ni_posix_timers(void); - -COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) -{ - struct itimerval kit; - int error; - - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); + struct compat_itimerval v32; - error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) - error = -EFAULT; - return error; + if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; } -COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) +int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) { - struct itimerval kin, kout; - int error; + struct compat_itimerval v32; - if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) - return sys_ni_posix_timers(); - - if (in) { - if (get_compat_itimerval(&kin, in)) - return -EFAULT; - } else - memset(&kin, 0, sizeof(kin)); - - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) - return error; - if (put_compat_itimerval(out, &kout)) - return -EFAULT; - return 0; + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; } static compat_clock_t clock_t_to_compat_clock_t(clock_t x) diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 087d6a1279b8..9dd7ff5e445a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(getitimer, int, which, + struct compat_itimerval __user *, it) +{ + struct itimerval kit; + int error = do_getitimer(which, &kit); + + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} +#endif + /* * The timer is automagically restarted, when interval != 0 @@ -294,3 +308,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return -EFAULT; return 0; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(setitimer, int, which, + struct compat_itimerval __user *, in, + struct compat_itimerval __user *, out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else { + memset(&kin, 0, sizeof(kin)); + } + + error = do_setitimer(which, &kin, out ? &kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} +#endif diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index f4a1962d1729..7f88517461e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -44,6 +44,8 @@ SYS_NI(alarm); COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC -- cgit v1.3-14-g43fede From d822cdcce43f9d4dcddbf9c68f9537d542ccc3c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:38 +0100 Subject: posix-timers: Move compat versions of clock_gettime/settime/getres Move them to the native implementations and get rid of the set_fs() hackery. Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-13-viro@ZenIV.linux.org.uk --- kernel/compat.c | 51 ----------------------------------- kernel/time/posix-stubs.c | 53 +++++++++++++++++++++++++++++++++++++ kernel/time/posix-timers.c | 66 +++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 115 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c349417d2c40..582c38bfdf60 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -598,57 +598,6 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - if (compat_get_timespec(&ts, tp)) - return -EFAULT; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_settime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_gettime(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct compat_timespec __user *, tp) -{ - long err; - mm_segment_t oldfs; - struct timespec ts; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = sys_clock_getres(which_clock, - (struct timespec __user *) &ts); - set_fs(oldfs); - if (!err && tp && compat_put_timespec(&ts, tp)) - return -EFAULT; - return err; -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 7f88517461e8..a375c31cb352 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -137,6 +137,59 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, } #ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec64 new_tp64; + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct compat_timespec __user *,tp) +{ + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; + default: return -EINVAL; + } + + kernel_tp = timespec64_to_timespec(kernel_tp64); + if (compat_put_timespec(&kernel_tp, tp)) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (compat_put_timespec(&rtn_tp, tp)) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index e82bb1fd614e..61a5fb91a3c7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1082,8 +1082,66 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, return err; } +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + error = -EFAULT; + + return error; +} + #ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; + struct timespec new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (compat_get_timespec(&new_tp, tp)) + return -EFAULT; + + new_tp64 = timespec_to_timespec64(new_tp); + + return kc->clock_set(which_clock, &new_tp64); +} + +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) +{ + const struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); + + if (!error && compat_put_timespec(&kernel_tp, tp)) + error = -EFAULT; + + return error; +} + COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, struct compat_timex __user *, utp) { @@ -1107,10 +1165,9 @@ COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, return err; } -#endif -SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, - struct timespec __user *, tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp64; @@ -1123,11 +1180,12 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, error = kc->clock_getres(which_clock, &rtn_tp64); rtn_tp = timespec64_to_timespec(rtn_tp64); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && compat_put_timespec(&rtn_tp, tp)) error = -EFAULT; return error; } +#endif /* * nanosleep for monotonic and realtime clocks -- cgit v1.3-14-g43fede From 2482097c6c0f01ad74c9b2cff120a519ac59846e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:39 +0100 Subject: posix-timers: Move compat_timer_create() to native, get rid of set_fs() Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-14-viro@ZenIV.linux.org.uk --- kernel/compat.c | 18 -------------- kernel/time/posix-stubs.c | 1 + kernel/time/posix-timers.c | 59 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 582c38bfdf60..4544eb63edfa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -580,24 +580,6 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } -COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, - struct compat_sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) -{ - struct sigevent __user *event = NULL; - - if (timer_event_spec) { - struct sigevent kevent; - - event = compat_alloc_user_space(sizeof(*event)); - if (get_compat_sigevent(&kevent, timer_event_spec) || - copy_to_user(event, &kevent, sizeof(*event))) - return -EFAULT; - } - - return sys_timer_create(which_clock, event, created_timer_id); -} - /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a375c31cb352..38f3b20efa29 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -41,6 +41,7 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif +COMPAT_SYS_NI(timer_create); COMPAT_SYS_NI(clock_adjtime); COMPAT_SYS_NI(timer_settime); COMPAT_SYS_NI(timer_gettime); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 61a5fb91a3c7..c9f45a84fb8b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -490,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer) } /* Create a POSIX.1b interval timer. */ - -SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, - struct sigevent __user *, timer_event_spec, - timer_t __user *, created_timer_id) +static int do_timer_create(clockid_t which_clock, struct sigevent *event, + timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; - sigevent_t event; int it_id_set = IT_ID_NOT_SET; if (!kc) @@ -523,29 +520,25 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->kclock = kc; new_timer->it_overrun = -1; - if (timer_event_spec) { - if (copy_from_user(&event, timer_event_spec, sizeof (event))) { - error = -EFAULT; - goto out; - } + if (event) { rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(&event)); + new_timer->it_pid = get_pid(good_sigevent(event)); rcu_read_unlock(); if (!new_timer->it_pid) { error = -EINVAL; goto out; } + new_timer->it_sigev_notify = event->sigev_notify; + new_timer->sigq->info.si_signo = event->sigev_signo; + new_timer->sigq->info.si_value = event->sigev_value; } else { - memset(&event.sigev_value, 0, sizeof(event.sigev_value)); - event.sigev_notify = SIGEV_SIGNAL; - event.sigev_signo = SIGALRM; - event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_sigev_notify = SIGEV_SIGNAL; + new_timer->sigq->info.si_signo = SIGALRM; + memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); + new_timer->sigq->info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->sigq->info.si_signo = event.sigev_signo; - new_timer->sigq->info.si_value = event.sigev_value; new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; @@ -576,6 +569,36 @@ out: return error; } +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (copy_from_user(&event, timer_event_spec, sizeof (event))) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + if (timer_event_spec) { + sigevent_t event; + + if (get_compat_sigevent(&event, timer_event_spec)) + return -EFAULT; + return do_timer_create(which_clock, &event, created_timer_id); + } + return do_timer_create(which_clock, NULL, created_timer_id); +} +#endif + /* * Locking issues: We need to protect the result of the id look up until * we get the timer locked down so it is not deleted under us. The -- cgit v1.3-14-g43fede From b180db2c8ca6692a10b79631cadc18d03303d494 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:40 +0100 Subject: time: Move compat_time()/stime() to native Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-15-viro@ZenIV.linux.org.uk --- kernel/compat.c | 40 ---------------------------------------- kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 4544eb63edfa..aa7b9a27f9e7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -739,46 +739,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_TIME - -/* compat_time_t is a 32 bit "long" and needs to get converted. */ - -COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) -{ - compat_time_t i; - struct timeval tv; - - do_gettimeofday(&tv); - i = tv.tv_sec; - - if (tloc) { - if (put_user(i,tloc)) - return -EFAULT; - } - force_successful_syscall_return(); - return i; -} - -COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) -{ - struct timespec tv; - int err; - - if (get_user(tv.tv_sec, tptr)) - return -EFAULT; - - tv.tv_nsec = 0; - - err = security_settime(&tv, NULL); - if (err) - return err; - - do_settimeofday(&tv); - return 0; -} - -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/time/time.c b/kernel/time/time.c index 400662f16c5a..e5d44999ff78 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -100,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ +#ifdef CONFIG_COMPAT +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. */ +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) +{ + struct timeval tv; + compat_time_t i; + + do_gettimeofday(&tv); + i = tv.tv_sec; + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif + SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { -- cgit v1.3-14-g43fede From 2b2d02856b3176701c91d723356f766d6ee27853 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Jun 2017 09:42:41 +0100 Subject: time: Move compat_gettimeofday()/settimeofday() to native Signed-off-by: Al Viro Signed-off-by: Thomas Gleixner Cc: John Stultz Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170607084241.28657-16-viro@ZenIV.linux.org.uk --- kernel/compat.c | 38 -------------------------------------- kernel/time/time.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index aa7b9a27f9e7..ebd8bdc3fd68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -92,44 +92,6 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) return 0; } -COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (compat_put_timeval(&ktv, tv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - - return 0; -} - -COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, - struct timezone __user *, tz) -{ - struct timespec64 new_ts; - struct timeval user_tv; - struct timezone new_tz; - - if (tv) { - if (compat_get_timeval(&user_tv, tv)) - return -EFAULT; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); -} - static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || diff --git a/kernel/time/time.c b/kernel/time/time.c index e5d44999ff78..7c89e437c4d7 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -257,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + if (tv) { + struct timeval ktv; + + do_gettimeofday(&ktv); + if (compat_put_timeval(&ktv, tv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timespec64 new_ts; + struct timeval user_tv; + struct timezone new_tz; + + if (tv) { + if (compat_get_timeval(&user_tv, tv)) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); +} +#endif + SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) { struct timex txc; /* Local copy of parameter */ -- cgit v1.3-14-g43fede From 343d8fc208e43e50525ae6e0fc4845b9966b7318 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jun 2017 23:29:14 +0200 Subject: posix-cpu-timers: Avoid timespec conversion in do_cpu_nanosleep() No point in converting the expiry time back and forth. No point either to update the value in the caller supplied variable. mark the rqtp argument const. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: John Stultz Cc: Peter Zijlstra --- kernel/time/posix-cpu-timers.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 515148d4eeb1..3adfa42ca24c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1227,10 +1227,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { - struct k_itimer timer; struct itimerspec64 it; + struct k_itimer timer; + u64 expires; int error; /* @@ -1279,7 +1280,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. */ - *rqtp = ns_to_timespec64(timer.it.cpu.expires); + expires = timer.it.cpu.expires; error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); if (!error) { /* @@ -1312,7 +1313,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. */ restart = ¤t->restart_block; - restart->nanosleep.expires = timespec64_to_ns(rqtp); + restart->nanosleep.expires = expires; if (restart->nanosleep.type != TT_NONE) { struct timespec ts; -- cgit v1.3-14-g43fede From 938e7cf2d569833a5acf689a8926faf507826253 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Jun 2017 23:34:33 +0200 Subject: posix-timers: Make nanosleep timespec argument const No nanosleep implementation modifies the rqtp argument. Mark is const. Signed-off-by: Thomas Gleixner Cc: Al Viro Cc: John Stultz Cc: Peter Zijlstra --- include/linux/hrtimer.h | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-cpu-timers.c | 4 ++-- kernel/time/posix-timers.c | 4 ++-- kernel/time/posix-timers.h | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index d83b7ed1cb0e..255edd5e7a74 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -454,7 +454,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec *); -extern long hrtimer_nanosleep(struct timespec64 *rqtp, +extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7bed4e44f9bd..c991cf212c6d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -753,7 +753,7 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsreq) + const struct timespec64 *tsreq) { enum alarmtimer_type type = clock2alarm(which_clock); struct restart_block *restart = ¤t->restart_block; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 45f83cc7c0c7..81da124f1115 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1510,7 +1510,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(struct timespec64 *rqtp, +long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3adfa42ca24c..9df618ee64cf 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1328,7 +1328,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { struct restart_block *restart_block = ¤t->restart_block; int error; @@ -1383,7 +1383,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *rqtp) + const struct timespec64 *rqtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index c9f45a84fb8b..82d67be7d9d1 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1214,9 +1214,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec64 *tsave) + const struct timespec64 *rqtp) { - return hrtimer_nanosleep(tsave, flags & TIMER_ABSTIME ? + return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 5e69bb85629f..fb303c3be4d3 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -10,7 +10,7 @@ struct k_clock { int (*clock_adj)(const clockid_t which_clock, struct timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, - struct timespec64 *); + const struct timespec64 *); int (*timer_set)(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); -- cgit v1.3-14-g43fede From 31fd85816dbe3a714bcc3f67c17c3dd87011f79e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 13 Jun 2017 15:52:13 -0700 Subject: bpf: permits narrower load from bpf program context fields Currently, verifier will reject a program if it contains an narrower load from the bpf context structure. For example, __u8 h = __sk_buff->hash, or __u16 p = __sk_buff->protocol __u32 sample_period = bpf_perf_event_data->sample_period which are narrower loads of 4-byte or 8-byte field. This patch solves the issue by: . Introduce a new parameter ctx_field_size to carry the field size of narrower load from prog type specific *__is_valid_access validator back to verifier. . The non-zero ctx_field_size for a memory access indicates (1). underlying prog type specific convert_ctx_accesses supporting non-whole-field access (2). the current insn is a narrower or whole field access. . In verifier, for such loads where load memory size is less than ctx_field_size, verifier transforms it to a full field load followed by proper masking. . Currently, __sk_buff and bpf_perf_event_data->sample_period are supporting narrowing loads. . Narrower stores are still not allowed as typical ctx stores are just normal stores. Because of this change, some tests in verifier will fail and these tests are removed. As a bonus, rename some out of bound __sk_buff->cb access to proper field name and remove two redundant "skb cb oob" tests. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 71 ++++++++++++++++------ kernel/trace/bpf_trace.c | 21 +++++-- net/core/filter.c | 56 +++++++++++++----- tools/testing/selftests/bpf/test_verifier.c | 92 ++++------------------------- 6 files changed, 124 insertions(+), 119 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c32bace66d3d..1bcbf0a71f75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -157,7 +157,7 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type); + enum bpf_reg_type *reg_type, int *ctx_field_size); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d5093b52b485..189741c0da85 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -73,6 +73,7 @@ struct bpf_insn_aux_data { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; + int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 519a6144d3d3..44b97d958fb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -758,15 +758,26 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, } /* check access to 'struct bpf_context' fields */ -static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { + int ctx_field_size = 0; + /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { + env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { + /* a non zero ctx_field_size indicates: + * . For this field, the prog type specific ctx conversion algorithm + * only supports whole field access. + * . This ctx access is a candiate for later verifier transformation + * to load the whole field and then apply a mask to get correct result. + */ + if (ctx_field_size) + env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -868,7 +879,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, int bpf_size, enum bpf_access_type t, int value_regno) { @@ -911,7 +922,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t, ®_type); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value_and_range(state->regs, value_regno); @@ -972,7 +983,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs; int err; @@ -994,13 +1005,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; /* check whether atomic_add can read the memory */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); if (err) return err; /* check whether atomic_add can write into the same memory */ - return check_mem_access(env, insn->dst_reg, insn->off, + return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); } @@ -1416,7 +1427,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); if (err) return err; } @@ -2993,18 +3004,12 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn->src_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) return err; - if (BPF_SIZE(insn->code) != BPF_W && - BPF_SIZE(insn->code) != BPF_DW) { - insn_idx++; - continue; - } - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { @@ -3032,7 +3037,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn); + err = check_xadd(env, insn_idx, insn); if (err) return err; insn_idx++; @@ -3051,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) @@ -3080,7 +3085,7 @@ static int do_check(struct bpf_verifier_env *env) return err; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn->dst_reg, insn->off, + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) @@ -3383,7 +3388,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, delta = 0; + int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3423,11 +3428,39 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; + off = insn->off; + size = bpf_size_to_bytes(BPF_SIZE(insn->code)); + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; + is_narrower_load = (type == BPF_READ && size < ctx_field_size); + + /* If the read access is a narrower load of the field, + * convert to a 4/8-byte load, to minimum program type specific + * convert_ctx_access changes. If conversion is successful, + * we will apply proper mask to the result. + */ + if (is_narrower_load) { + int size_code = BPF_H; + + if (ctx_field_size == 4) + size_code = BPF_W; + else if (ctx_field_size == 8) + size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); + insn->code = BPF_LDX | BPF_MEM | size_code; + } cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } + if (is_narrower_load) { + if (ctx_field_size <= 4) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + else + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 051d7fca0c09..9d3ec8253131 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,17 +581,26 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, int *ctx_field_size) { + int sample_period_off; + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; if (type != BPF_READ) return false; if (off % size != 0) return false; - if (off == offsetof(struct bpf_perf_event_data, sample_period)) { - if (size != sizeof(u64)) - return false; + + /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ + sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); + if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { + *ctx_field_size = 8; +#ifdef __LITTLE_ENDIAN + return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; +#else + return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; +#endif } else { if (size != sizeof(long)) return false; diff --git a/net/core/filter.c b/net/core/filter.c index a65a3b25e104..60ed6f343a63 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2856,7 +2856,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static bool __is_valid_access(int off, int size) +static bool __is_valid_access(int off, int size, enum bpf_access_type type, + int *ctx_field_size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2872,9 +2873,27 @@ static bool __is_valid_access(int off, int size) offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) return false; break; - default: + case offsetof(struct __sk_buff, data) ... + offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data_end) ... + offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: if (size != sizeof(__u32)) return false; + break; + default: + /* permit narrower load for not cb/data/data_end fields */ + *ctx_field_size = 4; + if (type == BPF_WRITE) { + if (size != sizeof(__u32)) + return false; + } else { + if (size != sizeof(__u32)) +#ifdef __LITTLE_ENDIAN + return (off & 0x3) == 0 && (size == 1 || size == 2); +#else + return (off & 0x3) + size == 4 && (size == 1 || size == 2); +#endif + } } return true; @@ -2882,12 +2901,16 @@ static bool __is_valid_access(int off, int size) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): - case offsetof(struct __sk_buff, data): - case offsetof(struct __sk_buff, data_end): + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data) ... + offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, data_end) ... + offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: return false; } @@ -2901,15 +2924,17 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: return false; } @@ -2934,12 +2959,13 @@ static bool lwt_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) { switch (off) { @@ -3002,7 +3028,8 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) { switch (off) { @@ -3027,7 +3054,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; } - return __is_valid_access(off, size); + return __is_valid_access(off, size, type, ctx_field_size); } static bool __is_valid_xdp_access(int off, int size) @@ -3044,7 +3071,8 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + enum bpf_reg_type *reg_type, + int *ctx_field_size) { if (type == BPF_WRITE) return false; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4ee4708b0d60..13341700930c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -1073,44 +1073,22 @@ static struct bpf_test tests[] = { .result = ACCEPT, }, { - "check cb access: byte, oob 1", + "__sk_buff->hash, offset 0, byte store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 4), + offsetof(struct __sk_buff, hash)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: byte, oob 2", + "__sk_buff->tc_index, offset 3, byte store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 1), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: byte, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 4), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: byte, oob 4", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[0]) - 1), + offsetof(struct __sk_buff, tc_index) + 3), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", @@ -1188,44 +1166,22 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check cb access: half, oob 1", + "check __sk_buff->hash, offset 0, half store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 4), + offsetof(struct __sk_buff, hash)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: half, oob 2", + "check __sk_buff->tc_index, offset 2, half store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 2), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: half, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 4), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: half, oob 4", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[0]) - 2), + offsetof(struct __sk_buff, tc_index) + 2), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", @@ -1366,28 +1322,6 @@ static struct bpf_test tests[] = { }, { "check cb access: double, oob 2", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[4]) + 8), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: double, oob 3", - .insns = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, - offsetof(struct __sk_buff, cb[0]) - 8), - BPF_EXIT_INSN(), - }, - .errstr = "invalid bpf_context access", - .result = REJECT, - }, - { - "check cb access: double, oob 4", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, @@ -1398,22 +1332,22 @@ static struct bpf_test tests[] = { .result = REJECT, }, { - "check cb access: double, oob 5", + "check __sk_buff->ifindex dw store not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[4]) + 8), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, ifindex)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", .result = REJECT, }, { - "check cb access: double, oob 6", + "check __sk_buff->ifindex dw load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, - offsetof(struct __sk_buff, cb[0]) - 8), + offsetof(struct __sk_buff, ifindex)), BPF_EXIT_INSN(), }, .errstr = "invalid bpf_context access", -- cgit v1.3-14-g43fede From 73a7242a06ff995d771fbe243e72b516feaa6e3d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:01 -0400 Subject: cgroup: Keep accurate count of tasks in each css_set The reference count in the css_set data structure was used as a proxy of the number of tasks attached to that css_set. However, that count is actually not an accurate measure especially with thread mode support. So a new variable nr_tasks is added to the css_set to keep track of the actual task count. This new variable is protected by the css_set_lock. Functions that require the actual task count are updated to use the new variable. tj: s/task_count/nr_tasks/ for consistency with cgroup_root->nr_cgrps. Refreshed on top of cgroup/for-v4.13 which dropped on css_set_populated() -> nr_tasks conversion. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 3 +++ kernel/cgroup/cgroup-v1.c | 6 +----- kernel/cgroup/cgroup.c | 10 ++++++++++ 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ec47101cb1bf..3bc4196bf217 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -166,6 +166,9 @@ struct css_set { /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; + /* internal task count, protected by css_set_lock */ + int nr_tasks; + /* * Lists running through all tasks using this cgroup group. * mg_tasks lists tasks which belong to this cset but are in the diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..e9ea5f201fac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -334,10 +334,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question - * - * Return the number of tasks in the cgroup. The returned number can be - * higher than the actual number of tasks due to css_set references from - * namespace roots and temporary usages. */ static int cgroup_task_count(const struct cgroup *cgrp) { @@ -346,7 +342,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += refcount_read(&link->cset->refcount); + count += link->cset->nr_tasks; spin_unlock_irq(&css_set_lock); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8d4e85eae42c..dbfd7028b1c6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set + * + * css_set_populated() should be the same as !!cset->nr_tasks at steady + * state. However, css_set_populated() can be called while a task is being + * added to or removed from the linked list before the nr_tasks is + * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { @@ -1598,6 +1603,7 @@ static void cgroup_enable_task_cg_lists(void) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); + cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); @@ -2064,8 +2070,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); + to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); put_css_set_locked(from_cset); + from_cset->nr_tasks--; } } spin_unlock_irq(&css_set_lock); @@ -4789,6 +4797,7 @@ void cgroup_post_fork(struct task_struct *child) cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); + cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } spin_unlock_irq(&css_set_lock); @@ -4838,6 +4847,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); -- cgit v1.3-14-g43fede From a28f8f5e995fe5964ae304444913536058f26e37 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:02 -0400 Subject: cgroup: Move debug cgroup to its own file The debug cgroup currently resides within cgroup-v1.c and is enabled only for v1 cgroup. To enable the debug cgroup also for v2, it makes sense to put the code into its own file as it will no longer be v1 specific. There is no change to the debug cgroup specific code. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/Makefile | 1 + kernel/cgroup/cgroup-internal.h | 2 + kernel/cgroup/cgroup-v1.c | 149 +------------------------------------- kernel/cgroup/debug.c | 153 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 148 deletions(-) create mode 100644 kernel/cgroup/debug.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int cgroup_task_count(const struct cgroup *cgrp); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e9ea5f201fac..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -335,7 +335,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question */ -static int cgroup_task_count(const struct cgroup *cgrp) +int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; @@ -1259,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); - - -#ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state * -debug_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_css_free(struct cgroup_subsys_state *css) -{ - kfree(css); -} - -static u64 debug_taskcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return cgroup_task_count(css->cgroup); -} - -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return (u64)(unsigned long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = refcount_read(&task_css_set(current)->refcount); - rcu_read_unlock(); - return count; -} - -static int current_css_set_cg_links_read(struct seq_file *seq, void *v) -{ - struct cgrp_cset_link *link; - struct css_set *cset; - char *name_buf; - - name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name_buf) - return -ENOMEM; - - spin_lock_irq(&css_set_lock); - rcu_read_lock(); - cset = rcu_dereference(current->cgroups); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - cgroup_name(c, name_buf, NAME_MAX + 1); - seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name_buf); - } - rcu_read_unlock(); - spin_unlock_irq(&css_set_lock); - kfree(name_buf); - return 0; -} - -#define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct seq_file *seq, void *v) -{ - struct cgroup_subsys_state *css = seq_css(seq); - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { - struct css_set *cset = link->cset; - struct task_struct *task; - int count = 0; - - seq_printf(seq, "css_set %pK\n", cset); - - list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - - list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); - } - continue; - overflow: - seq_puts(seq, " ...\n"); - } - spin_unlock_irq(&css_set_lock); - return 0; -} - -static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) -{ - return (!cgroup_is_populated(css->cgroup) && - !css_has_online_children(&css->cgroup->self)); -} - -static struct cftype debug_files[] = { - { - .name = "taskcount", - .read_u64 = debug_taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "current_css_set_cg_links", - .seq_show = current_css_set_cg_links_read, - }, - - { - .name = "cgroup_css_links", - .seq_show = cgroup_css_links_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, - - { } /* terminate */ -}; - -struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, -}; -#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..1c209aa43733 --- /dev/null +++ b/kernel/cgroup/debug.c @@ -0,0 +1,153 @@ +#include +#include +#include + +#include "cgroup-internal.h" + +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_css_free(struct cgroup_subsys_state *css) +{ + kfree(css); +} + +/* + * debug_taskcount_read - return the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return cgroup_task_count(css->cgroup); +} + +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = refcount_read(&task_css_set(current)->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) +{ + struct cgrp_cset_link *link; + struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; + + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + cgroup_name(c, name_buf, NAME_MAX + 1); + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name_buf); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + kfree(name_buf); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct seq_file *seq, void *v) +{ + struct cgroup_subsys_state *css = seq_css(seq); + struct cgrp_cset_link *link; + + spin_lock_irq(&css_set_lock); + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { + struct css_set *cset = link->cset; + struct task_struct *task; + int count = 0; + + seq_printf(seq, "css_set %pK\n", cset); + + list_for_each_entry(task, &cset->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + continue; + overflow: + seq_puts(seq, " ...\n"); + } + spin_unlock_irq(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return (!cgroup_is_populated(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); +} + +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, + + { } /* terminate */ +}; + +struct cgroup_subsys debug_cgrp_subsys = { + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, +}; -- cgit v1.3-14-g43fede From 23b0be480f341db26ce0dee7d3f6e67f8e0e166f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:03 -0400 Subject: cgroup: Make Kconfig prompt of debug cgroup more accurate The Kconfig prompt and description of the debug cgroup controller more accurate by saying that it is for debug purpose only and its interfaces are unstable. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- init/Kconfig | 7 +++++-- kernel/cgroup/debug.c | 6 ++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 1d3475fc9496..5f9d13929ae0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1205,11 +1205,14 @@ config CGROUP_BPF inet sockets. config CGROUP_DEBUG - bool "Example controller" + bool "Debug controller" default n + depends on DEBUG_KERNEL help This option enables a simple controller that exports - debugging information about the cgroups framework. + debugging information about the cgroups framework. This + controller is for control cgroup debugging only. Its + interfaces are not stable. Say N. diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 1c209aa43733..cbe77a2087c5 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -1,3 +1,9 @@ +/* + * Debug controller + * + * WARNING: This controller is for cgroup core debugging only. + * Its interfaces are unstable and subject to changes at any time. + */ #include #include #include -- cgit v1.3-14-g43fede From 575313f40ff33d0c2aff2701dfb2ccfcd6211d55 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 13 Jun 2017 17:18:04 -0400 Subject: cgroup: Make debug cgroup support v2 and thread mode Besides supporting cgroup v2 and thread mode, the following changes are also made: 1) current_* cgroup files now resides only at the root as we don't need duplicated files of the same function all over the cgroup hierarchy. 2) The cgroup_css_links_read() function is modified to report the number of tasks that are skipped because of overflow. 3) The number of extra unaccounted references are displayed. 4) The current_css_set_read() function now prints out the addresses of the css'es associated with the current css_set. 5) A new cgroup_subsys_states file is added to display the css objects associated with a cgroup. 6) A new cgroup_masks file is added to display the various controller bit masks in the cgroup. tj: Dropped thread mode related information for now so that debug controller changes aren't blocked on the thread mode. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/debug.c | 170 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 153 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index cbe77a2087c5..057d9b07f461 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -36,10 +36,37 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup_subsys_state *css, - struct cftype *cft) +static int current_css_set_read(struct seq_file *seq, void *v) { - return (u64)(unsigned long)current->cgroups; + struct css_set *cset; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + int i, refcnt; + + mutex_lock(&cgroup_mutex); + spin_lock_irq(&css_set_lock); + rcu_read_lock(); + cset = rcu_dereference(current->cgroups); + refcnt = refcount_read(&cset->refcount); + seq_printf(seq, "css_set %pK %d", cset, refcnt); + if (refcnt > cset->nr_tasks) + seq_printf(seq, " +%d", refcnt - cset->nr_tasks); + seq_puts(seq, "\n"); + + /* + * Print the css'es stored in the current css_set. + */ + for_each_subsys(ss, i) { + css = cset->subsys[ss->id]; + if (!css) + continue; + seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, + (unsigned long)css, css->id); + } + rcu_read_unlock(); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + return 0; } static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, @@ -84,31 +111,126 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) { struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; + int dead_cnt = 0, extra_refs = 0; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + int refcnt = refcount_read(&cset->refcount); - seq_printf(seq, "css_set %pK\n", cset); + seq_printf(seq, " %d", refcnt); + if (refcnt - cset->nr_tasks > 0) { + int extra = refcnt - cset->nr_tasks; + + seq_printf(seq, " +%d", extra); + /* + * Take out the one additional reference in + * init_css_set. + */ + if (cset == &init_css_set) + extra--; + extra_refs += extra; + } + seq_puts(seq, "\n"); list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } list_for_each_entry(task, &cset->mg_tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) - goto overflow; - seq_printf(seq, " task %d\n", task_pid_vnr(task)); + if (count++ <= MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); } - continue; - overflow: - seq_puts(seq, " ...\n"); + /* show # of overflowed tasks */ + if (count > MAX_TASKS_SHOWN_PER_CSS) + seq_printf(seq, " ... (%d)\n", + count - MAX_TASKS_SHOWN_PER_CSS); + + if (cset->dead) { + seq_puts(seq, " [dead]\n"); + dead_cnt++; + } + + WARN_ON(count != cset->nr_tasks); } spin_unlock_irq(&css_set_lock); + + if (!dead_cnt && !extra_refs) + return 0; + + seq_puts(seq, "\n"); + if (extra_refs) + seq_printf(seq, "extra references = %d\n", extra_refs); + if (dead_cnt) + seq_printf(seq, "dead css_sets = %d\n", dead_cnt); + + return 0; +} + +static int cgroup_subsys_states_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; + char pbuf[16]; + int i; + + mutex_lock(&cgroup_mutex); + for_each_subsys(ss, i) { + css = rcu_dereference_check(cgrp->subsys[ss->id], true); + if (!css) + continue; + + pbuf[0] = '\0'; + + /* Show the parent CSS if applicable*/ + if (css->parent) + snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", + css->parent->id); + seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, + (unsigned long)css, css->id, + atomic_read(&css->online_cnt), pbuf); + } + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup_subsys *ss; + int i, j; + struct { + u16 *mask; + char *name; + } mask_list[] = { + { &cgrp->subtree_control, "subtree_control" }, + { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, + }; + + mutex_lock(&cgroup_mutex); + for (i = 0; i < ARRAY_SIZE(mask_list); i++) { + u16 mask = *mask_list[i].mask; + bool first = true; + + seq_printf(seq, "%-17s: ", mask_list[i].name); + for_each_subsys(ss, j) { + if (!(mask & (1 << ss->id))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; + } + seq_putc(seq, '\n'); + } + + mutex_unlock(&cgroup_mutex); return 0; } @@ -126,17 +248,20 @@ static struct cftype debug_files[] = { { .name = "current_css_set", - .read_u64 = current_css_set_read, + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_refcount", .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { .name = "current_css_set_cg_links", .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, }, { @@ -144,6 +269,16 @@ static struct cftype debug_files[] = { .seq_show = cgroup_css_links_read, }, + { + .name = "cgroup_subsys_states", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "cgroup_masks", + .seq_show = cgroup_masks_read, + }, + { .name = "releasable", .read_u64 = releasable_read, @@ -153,7 +288,8 @@ static struct cftype debug_files[] = { }; struct cgroup_subsys debug_cgrp_subsys = { - .css_alloc = debug_css_alloc, - .css_free = debug_css_free, - .legacy_cftypes = debug_files, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, + .legacy_cftypes = debug_files, + .dfl_cftypes = debug_files, }; -- cgit v1.3-14-g43fede From 8cc38fa7fa317d44710f24475576b1f9ee205da9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:32 -0400 Subject: cgroup: make debug an implicit controller on cgroup2 Make debug an implicit controller on cgroup2 which is enabled by "cgroup_debug" boot param. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 057d9b07f461..d61e692a5338 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -240,7 +240,7 @@ static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) !css_has_online_children(&css->cgroup->self)); } -static struct cftype debug_files[] = { +static struct cftype debug_legacy_files[] = { { .name = "taskcount", .read_u64 = debug_taskcount_read, @@ -287,9 +287,62 @@ static struct cftype debug_files[] = { { } /* terminate */ }; +static struct cftype debug_files[] = { + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .seq_show = current_css_set_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "current_css_set_cg_links", + .seq_show = current_css_set_cg_links_read, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + + { + .name = "css_links", + .seq_show = cgroup_css_links_read, + }, + + { + .name = "csses", + .seq_show = cgroup_subsys_states_read, + }, + + { + .name = "masks", + .seq_show = cgroup_masks_read, + }, + + { } /* terminate */ +}; + struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .legacy_cftypes = debug_files, - .dfl_cftypes = debug_files, + .legacy_cftypes = debug_legacy_files, }; + +/* + * On v2, debug is an implicit controller enabled by "cgroup_debug" boot + * parameter. + */ +static int __init enable_cgroup_debug(char *str) +{ + debug_cgrp_subsys.dfl_cftypes = debug_files; + debug_cgrp_subsys.implicit_on_dfl = true; + return 1; +} +__setup("cgroup_debug", enable_cgroup_debug); -- cgit v1.3-14-g43fede From 2866c0b4cf25274040f0b4cc045ad1f44b885dd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:36 -0400 Subject: cgroup: refactor cgroup_masks_read() in the debug controller Factor out cgroup_masks_read_one() out of cgroup_masks_read() for simplicity. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index d61e692a5338..163fdbd7adf6 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -200,36 +200,32 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) return 0; } -static int cgroup_masks_read(struct seq_file *seq, void *v) +static void cgroup_masks_read_one(struct seq_file *seq, const char *name, + u16 mask) { - struct cgroup *cgrp = seq_css(seq)->cgroup; struct cgroup_subsys *ss; - int i, j; - struct { - u16 *mask; - char *name; - } mask_list[] = { - { &cgrp->subtree_control, "subtree_control" }, - { &cgrp->subtree_ss_mask, "subtree_ss_mask" }, - }; + int ssid; + bool first = true; - mutex_lock(&cgroup_mutex); - for (i = 0; i < ARRAY_SIZE(mask_list); i++) { - u16 mask = *mask_list[i].mask; - bool first = true; - - seq_printf(seq, "%-17s: ", mask_list[i].name); - for_each_subsys(ss, j) { - if (!(mask & (1 << ss->id))) - continue; - if (!first) - seq_puts(seq, ", "); - seq_puts(seq, ss->name); - first = false; - } - seq_putc(seq, '\n'); + seq_printf(seq, "%-17s: ", name); + for_each_subsys(ss, ssid) { + if (!(mask & (1 << ssid))) + continue; + if (!first) + seq_puts(seq, ", "); + seq_puts(seq, ss->name); + first = false; } + seq_putc(seq, '\n'); +} +static int cgroup_masks_read(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + mutex_lock(&cgroup_mutex); + cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); + cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.3-14-g43fede From b6053d40e3387c916d56d319ba6bfdb2c42a67c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Jun 2017 16:01:41 -0400 Subject: cgroup: fix lockdep warning in debug controller The debug controller grabs cgroup_mutex from interface file show functions which can deadlock and triggers lockdep warnings. Fix it by using cgroup_kn_lock_live()/cgroup_kn_unlock() instead. Signed-off-by: Tejun Heo Cc: Waiman Long --- kernel/cgroup/debug.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 163fdbd7adf6..dac46af22782 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -38,12 +38,15 @@ static u64 debug_taskcount_read(struct cgroup_subsys_state *css, static int current_css_set_read(struct seq_file *seq, void *v) { + struct kernfs_open_file *of = seq->private; struct css_set *cset; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; int i, refcnt; - mutex_lock(&cgroup_mutex); + if (!cgroup_kn_lock_live(of->kn, false)) + return -ENODEV; + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); @@ -65,7 +68,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); + cgroup_kn_unlock(of->kn); return 0; } @@ -174,13 +177,17 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static int cgroup_subsys_states_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; struct cgroup_subsys *ss; struct cgroup_subsys_state *css; char pbuf[16]; int i; - mutex_lock(&cgroup_mutex); + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + for_each_subsys(ss, i) { css = rcu_dereference_check(cgrp->subsys[ss->id], true); if (!css) @@ -196,7 +203,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) (unsigned long)css, css->id, atomic_read(&css->online_cnt), pbuf); } - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } @@ -221,12 +229,17 @@ static void cgroup_masks_read_one(struct seq_file *seq, const char *name, static int cgroup_masks_read(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; - mutex_lock(&cgroup_mutex); cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); - mutex_unlock(&cgroup_mutex); + + cgroup_kn_unlock(of->kn); return 0; } -- cgit v1.3-14-g43fede From 33e4f80ee69b5168badf37edbfed796eb48434b9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Jun 2017 22:56:34 +0200 Subject: ACPI / PM: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For this reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops. In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled. However, to preserve the existing behavior with respect to suspend-to-RAM, this only is done in the suspend-to-idle case and only if an SCI has occurred while suspended. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/battery.c | 2 +- drivers/acpi/button.c | 5 +++-- drivers/acpi/device_pm.c | 9 ++++++++- drivers/acpi/internal.h | 2 ++ drivers/acpi/sleep.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/base/power/main.c | 5 ----- drivers/base/power/wakeup.c | 18 ++++++++++++------ include/acpi/acpi_bus.h | 6 +++++- include/linux/suspend.h | 7 +++++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 35 +++++++++++++++++++++++++++++------ 11 files changed, 103 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index d42eeef9d928..1cbb88d938e5 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -782,7 +782,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume) if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) || (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) && (battery->capacity_now <= battery->alarm))) - pm_wakeup_event(&battery->device->dev, 0); + acpi_pm_wakeup_event(&battery->device->dev); return result; } diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index e19f530f1083..91cfdf377df7 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -217,7 +217,7 @@ static int acpi_lid_notify_state(struct acpi_device *device, int state) } if (state) - pm_wakeup_event(&device->dev, 0); + acpi_pm_wakeup_event(&device->dev); ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device); if (ret == NOTIFY_DONE) @@ -402,7 +402,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) } else { int keycode; - pm_wakeup_event(&device->dev, 0); + acpi_pm_wakeup_event(&device->dev); if (button->suspended) break; @@ -534,6 +534,7 @@ static int acpi_button_add(struct acpi_device *device) lid_device = device; } + device_init_wakeup(&device->dev, true); printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); return 0; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index f13c62c4b117..ca0210213773 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -385,6 +386,12 @@ EXPORT_SYMBOL(acpi_bus_power_manageable); #ifdef CONFIG_PM static DEFINE_MUTEX(acpi_pm_notifier_lock); +void acpi_pm_wakeup_event(struct device *dev) +{ + pm_wakeup_dev_event(dev, 0, acpi_s2idle_wakeup()); +} +EXPORT_SYMBOL_GPL(acpi_pm_wakeup_event); + static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) { struct acpi_device *adev; @@ -399,7 +406,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) mutex_lock(&acpi_pm_notifier_lock); if (adev->wakeup.flags.notifier_present) { - __pm_wakeup_event(adev->wakeup.ws, 0); + pm_wakeup_ws_event(adev->wakeup.ws, 0, acpi_s2idle_wakeup()); if (adev->wakeup.context.func) adev->wakeup.context.func(&adev->wakeup.context); } diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 66229ffa909b..75924ea69071 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -198,8 +198,10 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); Suspend/Resume -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT +extern bool acpi_s2idle_wakeup(void); extern int acpi_sleep_init(void); #else +static inline bool acpi_s2idle_wakeup(void) { return false; } static inline int acpi_sleep_init(void) { return -ENXIO; } #endif diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a4782c75ebdd..555de11a56b6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -650,6 +650,8 @@ static const struct platform_suspend_ops acpi_suspend_ops_old = { .recover = acpi_pm_finish, }; +static bool s2idle_wakeup; + static int acpi_freeze_begin(void) { acpi_scan_lock_acquire(); @@ -666,6 +668,33 @@ static int acpi_freeze_prepare(void) return 0; } +static void acpi_freeze_wake(void) +{ + /* + * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means + * that the SCI has triggered while suspended, so cancel the wakeup in + * case it has not been a wakeup event (the GPEs will be checked later). + */ + if (acpi_sci_irq_valid() && + !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) { + pm_system_cancel_wakeup(); + s2idle_wakeup = true; + } +} + +static void acpi_freeze_sync(void) +{ + /* + * Process all pending events in case there are any wakeup ones. + * + * The EC driver uses the system workqueue, so that one needs to be + * flushed too. + */ + acpi_os_wait_events_complete(); + flush_scheduled_work(); + s2idle_wakeup = false; +} + static void acpi_freeze_restore(void) { if (acpi_sci_irq_valid()) @@ -682,6 +711,8 @@ static void acpi_freeze_end(void) static const struct platform_freeze_ops acpi_freeze_ops = { .begin = acpi_freeze_begin, .prepare = acpi_freeze_prepare, + .wake = acpi_freeze_wake, + .sync = acpi_freeze_sync, .restore = acpi_freeze_restore, .end = acpi_freeze_end, }; @@ -700,9 +731,15 @@ static void acpi_sleep_suspend_setup(void) } #else /* !CONFIG_SUSPEND */ +#define s2idle_wakeup (false) static inline void acpi_sleep_suspend_setup(void) {} #endif /* !CONFIG_SUSPEND */ +bool acpi_s2idle_wakeup(void) +{ + return s2idle_wakeup; +} + #ifdef CONFIG_PM_SLEEP static u32 saved_bm_rld; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 253f860e8981..ef5b6a6e5045 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1095,11 +1095,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (async_error) goto Complete; - if (pm_wakeup_pending()) { - async_error = -EBUSY; - goto Complete; - } - if (dev->power.syscore || dev->power.direct_complete) goto Complete; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index c313b600d356..9c36b27996fc 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -28,8 +28,8 @@ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ unsigned int pm_wakeup_irq __read_mostly; -/* If set and the system is suspending, terminate the suspend. */ -static bool pm_abort_suspend __read_mostly; +/* If greater than 0 and the system is suspending, terminate the suspend. */ +static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. @@ -855,20 +855,26 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret || pm_abort_suspend; + return ret || atomic_read(&pm_abort_suspend) > 0; } void pm_system_wakeup(void) { - pm_abort_suspend = true; + atomic_inc(&pm_abort_suspend); freeze_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); -void pm_wakeup_clear(void) +void pm_system_cancel_wakeup(void) +{ + atomic_dec(&pm_abort_suspend); +} + +void pm_wakeup_clear(bool reset) { - pm_abort_suspend = false; pm_wakeup_irq = 0; + if (reset) + atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 79c0af419300..63a90a624a0f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -598,15 +598,19 @@ static inline bool acpi_device_always_present(struct acpi_device *adev) #endif #ifdef CONFIG_PM +void acpi_pm_wakeup_event(struct device *dev); acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*func)(struct acpi_device_wakeup_context *context)); acpi_status acpi_remove_pm_notifier(struct acpi_device *adev); int acpi_pm_device_sleep_state(struct device *, int *, int); int acpi_pm_device_run_wake(struct device *, bool); #else +static inline void acpi_pm_wakeup_event(struct device *dev) +{ +} static inline acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, - void (*work_func)(struct work_struct *work)) + void (*func)(struct acpi_device_wakeup_context *context)) { return AE_SUPPORT; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0b1cf32edfd7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..3ecf275d7e44 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -84,11 +86,9 @@ static void freeze_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); - pr_debug("PM: suspend-to-idle\n"); /* Make the current CPU wait so it can enter the idle loop too. */ wait_event(suspend_freeze_wait_head, suspend_freeze_state == FREEZE_STATE_WAKE); - pr_debug("PM: resume from suspend-to-idle\n"); cpuidle_pause(); put_online_cpus(); @@ -98,6 +98,31 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + pr_debug("PM: suspend-to-idle\n"); + + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); + + pr_debug("PM: resume from suspend-to-idle\n"); } void freeze_wake(void) @@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. */ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); -- cgit v1.3-14-g43fede From cd33f5f2cbfaadc21270f3ddac7c3c33e0a1a28c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 12 Jun 2017 11:53:09 -0400 Subject: audit: make sure we never skip the multicast broadcast When the auditd connection is reset, either intentionally or due to a failure, any records that were in the main backlog queue would not be sent in a multicast broadcast. This patch fixes this problem by not flushing the main backlog queue on a connection reset, the main kauditd_thread() will take care of that normally. Resolves: https://github.com/linux-audit/audit-kernel/issues/41 Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e1e2b3abfb93..7cad70214b81 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -605,11 +605,10 @@ static void auditd_reset(const struct auditd_connection *ac) if (ac_old) call_rcu(&ac_old->rcu, auditd_conn_free); - /* flush all of the main and retry queues to the hold queue */ + /* flush the retry queue to the hold queue, but don't touch the main + * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) kauditd_hold_skb(skb); - while ((skb = skb_dequeue(&audit_queue))) - kauditd_hold_skb(skb); } /** -- cgit v1.3-14-g43fede From 204a2be30a7a8a8d12642f23f3fbdc8b9923b500 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 7 Jun 2017 00:11:44 +0200 Subject: m68k: Remove ptrace_signal_deliver This fixes debugger syscall restart interactions. A debugger that modifies the tracee's program counter is expected to set the orig_d0 pseudo register to -1, to disable a possible syscall restart. This removes the last user of the ptrace_signal_deliver hook in the ptrace signal handling, so remove that as well. Signed-off-by: Andreas Schwab Signed-off-by: Geert Uytterhoeven --- arch/m68k/include/asm/signal.h | 5 ----- arch/m68k/kernel/signal.c | 16 ---------------- include/linux/ptrace.h | 4 ---- kernel/signal.c | 1 - 4 files changed, 26 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h index 8c8ce5e1ee0e..3bc64d02ba5f 100644 --- a/arch/m68k/include/asm/signal.h +++ b/arch/m68k/include/asm/signal.h @@ -62,9 +62,4 @@ static inline int __gen_sigismember(sigset_t *set, int _sig) #endif /* !CONFIG_CPU_HAS_NO_BITFIELDS */ -#ifndef __uClinux__ -extern void ptrace_signal_deliver(void); -#define ptrace_signal_deliver ptrace_signal_deliver -#endif /* __uClinux__ */ - #endif /* _M68K_SIGNAL_H */ diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 6f945bb5ffbd..e79421f5b9cd 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -109,22 +109,6 @@ int fixup_exception(struct pt_regs *regs) return 1; } -void ptrace_signal_deliver(void) -{ - struct pt_regs *regs = signal_pt_regs(); - if (regs->orig_d0 < 0) - return; - switch (regs->d0) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->d0 = regs->orig_d0; - regs->orig_d0 = -1; - regs->pc -= 2; - break; - } -} - static inline void push_cache (unsigned long vaddr) { /* diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 422bc2e4cb6a..9a2e04be0657 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -388,10 +388,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk, #define current_pt_regs() task_pt_regs(current) #endif -#ifndef ptrace_signal_deliver -#define ptrace_signal_deliver() ((void)0) -#endif - /* * unlike current_pt_regs(), this one is equal to task_pt_regs(current) * on *all* architectures; the only reason to have a per-arch definition diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..f0a48e5b1f0b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2092,7 +2092,6 @@ static void do_jobctl_trap(void) static int ptrace_signal(int signr, siginfo_t *info) { - ptrace_signal_deliver(); /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will -- cgit v1.3-14-g43fede From cde50a67397c0da7d11795d4b4418384022ab8e6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Sun, 18 Jun 2017 14:06:01 +0000 Subject: locking/rtmutex: Don't initialize lockdep when not required pi_mutex isn't supposed to be tracked by lockdep, but just passing NULLs for name and key will cause lockdep to spew a warning and die, which is not what we want it to do. Skip lockdep initialization if the caller passed NULLs for name and key, suggesting such initialization isn't desired. Signed-off-by: Sasha Levin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5694788ad8d ("rt_mutex: Add lockdep annotations") Link: http://lkml.kernel.org/r/20170618140548.4763-1-alexander.levin@verizon.com Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 43123533e9b1..78069895032a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1661,7 +1661,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, lock->waiters = RB_ROOT; lock->waiters_leftmost = NULL; - debug_rt_mutex_init(lock, name, key); + if (name && key) + debug_rt_mutex_init(lock, name, key); } EXPORT_SYMBOL_GPL(__rt_mutex_init); -- cgit v1.3-14-g43fede From ac6424b981bce1c4bc55675c6ce11bfe1bbfa64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:06:13 +0200 Subject: sched/wait: Rename wait_queue_t => wait_queue_entry_t Rename: wait_queue_t => wait_queue_entry_t 'wait_queue_t' was always a slight misnomer: its name implies that it's a "queue", but in reality it's a queue *entry*. The 'real' queue is the wait queue head, which had to carry the name. Start sorting this out by renaming it to 'wait_queue_entry_t'. This also allows the real structure name 'struct __wait_queue' to lose its double underscore and become 'struct wait_queue_entry', which is the more canonical nomenclature for such data types. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/filesystems/autofs4.txt | 12 ++-- block/blk-mq.c | 2 +- block/blk-wbt.c | 2 +- block/kyber-iosched.c | 8 +-- drivers/bluetooth/btmrvl_main.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.h | 2 +- drivers/gpu/drm/i915/i915_sw_fence.c | 14 ++--- drivers/gpu/drm/i915/i915_sw_fence.h | 2 +- drivers/gpu/drm/radeon/radeon.h | 2 +- drivers/gpu/drm/radeon/radeon_fence.c | 2 +- drivers/gpu/vga/vgaarb.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- drivers/md/bcache/btree.h | 2 +- drivers/net/ethernet/cavium/liquidio/octeon_main.h | 4 +- drivers/net/wireless/cisco/airo.c | 2 +- .../net/wireless/intersil/hostap/hostap_ioctl.c | 2 +- drivers/net/wireless/marvell/libertas/main.c | 2 +- drivers/scsi/dpt/dpti_i2o.h | 2 +- drivers/scsi/ips.c | 12 ++-- drivers/scsi/ips.h | 4 +- .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 6 +- .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 4 +- drivers/staging/lustre/lnet/libcfs/debug.c | 2 +- drivers/staging/lustre/lnet/libcfs/tracefile.c | 2 +- drivers/staging/lustre/lnet/lnet/lib-eq.c | 2 +- drivers/staging/lustre/lnet/lnet/lib-socket.c | 2 +- drivers/staging/lustre/lustre/fid/fid_request.c | 6 +- drivers/staging/lustre/lustre/include/lustre_lib.h | 4 +- drivers/staging/lustre/lustre/llite/lcommon_cl.c | 2 +- .../staging/lustre/lustre/lov/lov_cl_internal.h | 2 +- drivers/staging/lustre/lustre/lov/lov_object.c | 2 +- drivers/staging/lustre/lustre/obdclass/lu_object.c | 6 +- drivers/tty/synclink_gt.c | 2 +- drivers/vfio/virqfd.c | 2 +- drivers/vhost/vhost.c | 2 +- drivers/vhost/vhost.h | 2 +- fs/autofs4/autofs_i.h | 2 +- fs/autofs4/waitq.c | 18 +++--- fs/cachefiles/internal.h | 2 +- fs/cachefiles/namei.c | 2 +- fs/cachefiles/rdwr.c | 2 +- fs/dax.c | 4 +- fs/eventfd.c | 2 +- fs/eventpoll.c | 10 ++-- fs/fs_pin.c | 2 +- fs/nfs/nfs4proc.c | 4 +- fs/nilfs2/segment.c | 2 +- fs/orangefs/orangefs-bufmap.c | 4 +- fs/reiserfs/journal.c | 2 +- fs/select.c | 4 +- fs/signalfd.c | 2 +- fs/userfaultfd.c | 8 +-- include/linux/blk-mq.h | 2 +- include/linux/eventfd.h | 4 +- include/linux/kvm_irqfd.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/poll.h | 2 +- include/linux/vfio.h | 2 +- include/linux/wait.h | 67 +++++++++++----------- include/net/af_unix.h | 2 +- include/uapi/linux/auto_fs.h | 4 +- include/uapi/linux/auto_fs4.h | 4 +- kernel/exit.c | 4 +- kernel/futex.c | 2 +- kernel/sched/completion.c | 2 +- kernel/sched/core.c | 2 +- kernel/sched/wait.c | 42 +++++++------- kernel/workqueue.c | 4 +- mm/filemap.c | 10 ++-- mm/memcontrol.c | 8 +-- mm/mempool.c | 2 +- mm/shmem.c | 2 +- net/9p/trans_fd.c | 4 +- net/bluetooth/bnep/core.c | 2 +- net/bluetooth/cmtp/core.c | 2 +- net/bluetooth/hidp/core.c | 2 +- net/core/datagram.c | 2 +- net/unix/af_unix.c | 4 +- sound/core/control.c | 2 +- sound/core/hwdep.c | 2 +- sound/core/init.c | 2 +- sound/core/oss/pcm_oss.c | 4 +- sound/core/pcm_lib.c | 2 +- sound/core/pcm_native.c | 4 +- sound/core/rawmidi.c | 8 +-- sound/core/seq/seq_fifo.c | 2 +- sound/core/seq/seq_memory.c | 2 +- sound/core/timer.c | 2 +- sound/isa/wavefront/wavefront_synth.c | 2 +- sound/pci/mixart/mixart_core.c | 4 +- sound/pci/ymfpci/ymfpci_main.c | 2 +- virt/kvm/eventfd.c | 2 +- 94 files changed, 216 insertions(+), 213 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index da5c087462b1..c3c705591532 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -819,7 +819,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress); certain condition is true. They must be used carefully to ensure there is no race condition. You declare a wait_queue_head_t, and then processes which want to - wait for that condition declare a wait_queue_t + wait for that condition declare a wait_queue_entry_t referring to themselves, and place that in the queue. diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index f10dd590f69f..8444dc3d57e8 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -316,7 +316,7 @@ For version 5, the format of the message is: struct autofs_v5_packet { int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; @@ -341,12 +341,12 @@ The pipe will be set to "packet mode" (equivalent to passing `O_DIRECT`) to _pipe2(2)_ so that a read from the pipe will return at most one packet, and any unread portion of a packet will be discarded. -The `wait_queue_token` is a unique number which can identify a +The `wait_queue_entry_token` is a unique number which can identify a particular request to be acknowledged. When a message is sent over the pipe the affected dentry is marked as either "active" or "expiring" and other accesses to it block until the message is acknowledged using one of the ioctls below and the relevant -`wait_queue_token`. +`wait_queue_entry_token`. Communicating with autofs: root directory ioctls ------------------------------------------------ @@ -358,7 +358,7 @@ capability, or must be the automount daemon. The available ioctl commands are: - **AUTOFS_IOC_READY**: a notification has been handled. The argument - to the ioctl command is the "wait_queue_token" number + to the ioctl command is the "wait_queue_entry_token" number corresponding to the notification being acknowledged. - **AUTOFS_IOC_FAIL**: similar to above, but indicates failure with the error code `ENOENT`. @@ -382,14 +382,14 @@ The available ioctl commands are: struct autofs_packet_expire_multi { int proto_version; /* Protocol version */ int type; /* Type of packet */ - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; is required. This is filled in with the name of something that can be unmounted or removed. If nothing can be expired, - `errno` is set to `EAGAIN`. Even though a `wait_queue_token` + `errno` is set to `EAGAIN`. Even though a `wait_queue_entry_token` is present in the structure, no "wait queue" is established and no acknowledgment is needed. - **AUTOFS_IOC_EXPIRE_MULTI**: This is similar to diff --git a/block/blk-mq.c b/block/blk-mq.c index bb66c96850b1..a083f95e04b1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -926,7 +926,7 @@ static bool reorder_tags_to_front(struct list_head *list) return first != NULL; } -static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 17676f4d7fd1..5f3a37c2784c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -503,7 +503,7 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) } static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, - wait_queue_t *wait, unsigned long rw) + wait_queue_entry_t *wait, unsigned long rw) { /* * inc it here even if disabled, since we'll dec it at completion. diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b9faabc75fdb..b95d6bd714c0 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -99,7 +99,7 @@ struct kyber_hctx_data { struct list_head rqs[KYBER_NUM_DOMAINS]; unsigned int cur_domain; unsigned int batching; - wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; + wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; @@ -507,7 +507,7 @@ static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, } } -static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); @@ -523,7 +523,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, { unsigned int sched_domain = khd->cur_domain; struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; - wait_queue_t *wait = &khd->domain_wait[sched_domain]; + wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; struct sbq_wait_state *ws; int nr; @@ -734,7 +734,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ { \ struct blk_mq_hw_ctx *hctx = data; \ struct kyber_hctx_data *khd = hctx->sched_data; \ - wait_queue_t *wait = &khd->domain_wait[domain]; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ \ seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \ return 0; \ diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c index c38cb5b91291..fe850f0567cb 100644 --- a/drivers/bluetooth/btmrvl_main.c +++ b/drivers/bluetooth/btmrvl_main.c @@ -602,7 +602,7 @@ static int btmrvl_service_main_thread(void *data) struct btmrvl_thread *thread = data; struct btmrvl_private *priv = thread->priv; struct btmrvl_adapter *adapter = priv->adapter; - wait_queue_t wait; + wait_queue_entry_t wait; struct sk_buff *skb; ulong flags; diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index d165af8abe36..a5c6cfe71a8e 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -821,7 +821,7 @@ static ssize_t ipmi_read(struct file *file, loff_t *ppos) { int rv = 0; - wait_queue_t wait; + wait_queue_entry_t wait; if (count <= 0) return 0; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 129c58bb4805..a4a920c4c454 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -123,7 +123,7 @@ struct drm_i915_gem_request { * It is used by the driver to then queue the request for execution. */ struct i915_sw_fence submit; - wait_queue_t submitq; + wait_queue_entry_t submitq; wait_queue_head_t execute; /* A list of everyone we wait upon, and everyone who waits upon us. diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index a277f8eb7beb..8669bfa33064 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -152,7 +152,7 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, struct list_head *continuation) { wait_queue_head_t *x = &fence->wait; - wait_queue_t *pos, *next; + wait_queue_entry_t *pos, *next; unsigned long flags; debug_fence_deactivate(fence); @@ -254,7 +254,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence) __i915_sw_fence_commit(fence); } -static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void *key) +static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key) { list_del(&wq->task_list); __i915_sw_fence_complete(wq->private, key); @@ -267,7 +267,7 @@ static int i915_sw_fence_wake(wait_queue_t *wq, unsigned mode, int flags, void * static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, const struct i915_sw_fence * const signaler) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (__test_and_set_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return false; @@ -288,7 +288,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence) { - wait_queue_t *wq; + wait_queue_entry_t *wq; if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return; @@ -320,7 +320,7 @@ static bool i915_sw_fence_check_if_after(struct i915_sw_fence *fence, static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq, gfp_t gfp) + wait_queue_entry_t *wq, gfp_t gfp) { unsigned long flags; int pending; @@ -359,7 +359,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, spin_lock_irqsave(&signaler->wait.lock, flags); if (likely(!i915_sw_fence_done(signaler))) { - __add_wait_queue_tail(&signaler->wait, wq); + __add_wait_queue_entry_tail(&signaler->wait, wq); pending = 1; } else { i915_sw_fence_wake(wq, 0, 0, NULL); @@ -372,7 +372,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *signaler, - wait_queue_t *wq) + wait_queue_entry_t *wq) { return __i915_sw_fence_await_sw_fence(fence, signaler, wq, 0); } diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h index d31cefbbcc04..fd3c3bf6c8b7 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.h +++ b/drivers/gpu/drm/i915/i915_sw_fence.h @@ -66,7 +66,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence); int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, struct i915_sw_fence *after, - wait_queue_t *wq); + wait_queue_entry_t *wq); int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence, struct i915_sw_fence *after, gfp_t gfp); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index c1c8e2208a21..e562a78510ff 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -375,7 +375,7 @@ struct radeon_fence { unsigned ring; bool is_vm_update; - wait_queue_t fence_wake; + wait_queue_entry_t fence_wake; }; int radeon_fence_driver_start_ring(struct radeon_device *rdev, int ring); diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index ef09f0a63754..e86f2bd38410 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -158,7 +158,7 @@ int radeon_fence_emit(struct radeon_device *rdev, * for the fence locking itself, so unlocked variants are used for * fence_signal, and remove_wait_queue. */ -static int radeon_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key) +static int radeon_fence_check_signaled(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct radeon_fence *fence; u64 seq; diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index 92f1452dad57..76875f6299b8 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -417,7 +417,7 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) { struct vga_device *vgadev, *conflict; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; int rc = 0; vga_check_first_use(); diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index a3f18a22f5ed..e0f47cc2effc 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1939,7 +1939,7 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) { struct i40iw_device *iwdev; - wait_queue_t wait; + wait_queue_entry_t wait; iwdev = dev->back_dev; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 9b80417cd547..73da1f5626cb 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -207,7 +207,7 @@ void bkey_put(struct cache_set *c, struct bkey *k); struct btree_op { /* for waiting on btree reserve in btree_split() */ - wait_queue_t wait; + wait_queue_entry_t wait; /* Btree level at which we start taking write locks */ short lock; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_main.h b/drivers/net/ethernet/cavium/liquidio/octeon_main.h index bed9ef17bc26..7ccffbb0019e 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_main.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_main.h @@ -144,7 +144,7 @@ static inline int sleep_cond(wait_queue_head_t *wait_queue, int *condition) { int errno = 0; - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); @@ -171,7 +171,7 @@ sleep_timeout_cond(wait_queue_head_t *wait_queue, int *condition, int timeout) { - wait_queue_t we; + wait_queue_entry_t we; init_waitqueue_entry(&we, current); add_wait_queue(wait_queue, &we); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 1b7e125a28e2..6a13303af2b7 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -3066,7 +3066,7 @@ static int airo_thread(void *data) { if (ai->jobs) { locked = down_interruptible(&ai->sem); } else { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); add_wait_queue(&ai->thr_wait, &wait); diff --git a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c index b2c6b065b542..ff153ce29539 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ioctl.c @@ -2544,7 +2544,7 @@ static int prism2_ioctl_priv_prism2_param(struct net_device *dev, ret = -EINVAL; } if (local->iw_mode == IW_MODE_MASTER) { - wait_queue_t __wait; + wait_queue_entry_t __wait; init_waitqueue_entry(&__wait, current); add_wait_queue(&local->hostscan_wq, &__wait); set_current_state(TASK_INTERRUPTIBLE); diff --git a/drivers/net/wireless/marvell/libertas/main.c b/drivers/net/wireless/marvell/libertas/main.c index e3500203715c..dde065d0d5c1 100644 --- a/drivers/net/wireless/marvell/libertas/main.c +++ b/drivers/net/wireless/marvell/libertas/main.c @@ -453,7 +453,7 @@ static int lbs_thread(void *data) { struct net_device *dev = data; struct lbs_private *priv = dev->ml_priv; - wait_queue_t wait; + wait_queue_entry_t wait; lbs_deb_enter(LBS_DEB_THREAD); diff --git a/drivers/scsi/dpt/dpti_i2o.h b/drivers/scsi/dpt/dpti_i2o.h index bd9e31e16249..16fc380b5512 100644 --- a/drivers/scsi/dpt/dpti_i2o.h +++ b/drivers/scsi/dpt/dpti_i2o.h @@ -48,7 +48,7 @@ #include typedef wait_queue_head_t adpt_wait_queue_head_t; #define ADPT_DECLARE_WAIT_QUEUE_HEAD(wait) DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wait) -typedef wait_queue_t adpt_wait_queue_t; +typedef wait_queue_entry_t adpt_wait_queue_entry_t; /* * message structures diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 3419e1bcdff6..67621308eb9c 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -301,13 +301,13 @@ static uint32_t ips_statupd_copperhead_memio(ips_ha_t *); static uint32_t ips_statupd_morpheus(ips_ha_t *); static ips_scb_t *ips_getscb(ips_ha_t *); static void ips_putq_scb_head(ips_scb_queue_t *, ips_scb_t *); -static void ips_putq_wait_tail(ips_wait_queue_t *, struct scsi_cmnd *); +static void ips_putq_wait_tail(ips_wait_queue_entry_t *, struct scsi_cmnd *); static void ips_putq_copp_tail(ips_copp_queue_t *, ips_copp_wait_item_t *); static ips_scb_t *ips_removeq_scb_head(ips_scb_queue_t *); static ips_scb_t *ips_removeq_scb(ips_scb_queue_t *, ips_scb_t *); -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *); -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *, +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *); +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *, struct scsi_cmnd *); static ips_copp_wait_item_t *ips_removeq_copp(ips_copp_queue_t *, ips_copp_wait_item_t *); @@ -2871,7 +2871,7 @@ ips_removeq_scb(ips_scb_queue_t * queue, ips_scb_t * item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) +static void ips_putq_wait_tail(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { METHOD_TRACE("ips_putq_wait_tail", 1); @@ -2902,7 +2902,7 @@ static void ips_putq_wait_tail(ips_wait_queue_t *queue, struct scsi_cmnd *item) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) +static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_entry_t *queue) { struct scsi_cmnd *item; @@ -2936,7 +2936,7 @@ static struct scsi_cmnd *ips_removeq_wait_head(ips_wait_queue_t *queue) /* ASSUMED to be called from within the HA lock */ /* */ /****************************************************************************/ -static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_t *queue, +static struct scsi_cmnd *ips_removeq_wait(ips_wait_queue_entry_t *queue, struct scsi_cmnd *item) { struct scsi_cmnd *p; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index b782bb60baf0..366be3b2f9b4 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -989,7 +989,7 @@ typedef struct ips_wait_queue { struct scsi_cmnd *head; struct scsi_cmnd *tail; int count; -} ips_wait_queue_t; +} ips_wait_queue_entry_t; typedef struct ips_copp_wait_item { struct scsi_cmnd *scsi_cmd; @@ -1035,7 +1035,7 @@ typedef struct ips_ha { ips_stat_t sp; /* Status packer pointer */ struct ips_scb *scbs; /* Array of all CCBS */ struct ips_scb *scb_freelist; /* SCB free list */ - ips_wait_queue_t scb_waitlist; /* Pending SCB list */ + ips_wait_queue_entry_t scb_waitlist; /* Pending SCB list */ ips_copp_queue_t copp_waitlist; /* Pending PT list */ ips_scb_queue_t scb_activelist; /* Active SCB list */ IPS_IO_CMD *dummy; /* dummy command */ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 0db662d6abdd..85b242ec5f9b 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3267,7 +3267,7 @@ int kiblnd_connd(void *arg) { spinlock_t *lock = &kiblnd_data.kib_connd_lock; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct kib_conn *conn; int timeout; @@ -3521,7 +3521,7 @@ kiblnd_scheduler(void *arg) long id = (long)arg; struct kib_sched_info *sched; struct kib_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; struct ib_wc wc; int did_something; @@ -3656,7 +3656,7 @@ kiblnd_failover_thread(void *arg) { rwlock_t *glock = &kiblnd_data.kib_global_lock; struct kib_dev *dev; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned long flags; int rc; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 3ed3b08c122c..6b38d5a8fe92 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -2166,7 +2166,7 @@ ksocknal_connd(void *arg) { spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; struct ksock_connreq *cr; - wait_queue_t wait; + wait_queue_entry_t wait; int nloops = 0; int cons_retry = 0; @@ -2554,7 +2554,7 @@ ksocknal_check_peer_timeouts(int idx) int ksocknal_reaper(void *arg) { - wait_queue_t wait; + wait_queue_entry_t wait; struct ksock_conn *conn; struct ksock_sched *sched; struct list_head enomem_conns; diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c index c56e9922cd5b..49deb448b044 100644 --- a/drivers/staging/lustre/lnet/libcfs/debug.c +++ b/drivers/staging/lustre/lnet/libcfs/debug.c @@ -361,7 +361,7 @@ static int libcfs_debug_dumplog_thread(void *arg) void libcfs_debug_dumplog(void) { - wait_queue_t wait; + wait_queue_entry_t wait; struct task_struct *dumper; /* we're being careful to ensure that the kernel thread is diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c index 9599b7441feb..27082d2f7938 100644 --- a/drivers/staging/lustre/lnet/libcfs/tracefile.c +++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c @@ -990,7 +990,7 @@ static int tracefiled(void *arg) complete(&tctl->tctl_start); while (1) { - wait_queue_t __wait; + wait_queue_entry_t __wait; pc.pc_want_daemon_pages = 0; collect_pages(&pc); diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c index ce4b83584e17..9ebba4ef5f90 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-eq.c +++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c @@ -312,7 +312,7 @@ __must_hold(&the_lnet.ln_eq_wait_lock) { int tms = *timeout_ms; int wait; - wait_queue_t wl; + wait_queue_entry_t wl; unsigned long now; if (!tms) diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index 9fca8d225ee0..f075706bba6d 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -516,7 +516,7 @@ lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port, int lnet_sock_accept(struct socket **newsockp, struct socket *sock) { - wait_queue_t wait; + wait_queue_entry_t wait; struct socket *newsock; int rc; diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c index 999f250ceed0..bf31bc200d27 100644 --- a/drivers/staging/lustre/lustre/fid/fid_request.c +++ b/drivers/staging/lustre/lustre/fid/fid_request.c @@ -192,7 +192,7 @@ static int seq_client_alloc_seq(const struct lu_env *env, } static int seq_fid_alloc_prep(struct lu_client_seq *seq, - wait_queue_t *link) + wait_queue_entry_t *link) { if (seq->lcs_update) { add_wait_queue(&seq->lcs_waitq, link); @@ -223,7 +223,7 @@ static void seq_fid_alloc_fini(struct lu_client_seq *seq) int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, struct lu_fid *fid) { - wait_queue_t link; + wait_queue_entry_t link; int rc; LASSERT(seq); @@ -290,7 +290,7 @@ EXPORT_SYMBOL(seq_client_alloc_fid); */ void seq_client_flush(struct lu_client_seq *seq) { - wait_queue_t link; + wait_queue_entry_t link; LASSERT(seq); init_waitqueue_entry(&link, current); diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h index b04d613846ee..f24970da8323 100644 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -201,7 +201,7 @@ struct l_wait_info { sigmask(SIGALRM)) /** - * wait_queue_t of Linux (version < 2.6.34) is a FIFO list for exclusively + * wait_queue_entry_t of Linux (version < 2.6.34) is a FIFO list for exclusively * waiting threads, which is not always desirable because all threads will * be waken up again and again, even user only needs a few of them to be * active most time. This is not good for performance because cache can @@ -228,7 +228,7 @@ struct l_wait_info { */ #define __l_wait_event(wq, condition, info, ret, l_add_wait) \ do { \ - wait_queue_t __wait; \ + wait_queue_entry_t __wait; \ long __timeout = info->lwi_timeout; \ sigset_t __blocked; \ int __allow_intr = info->lwi_allow_intr; \ diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c index 8af611033e12..96515b839436 100644 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ b/drivers/staging/lustre/lustre/llite/lcommon_cl.c @@ -207,7 +207,7 @@ int cl_file_inode_init(struct inode *inode, struct lustre_md *md) static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) { struct lu_object_header *header = obj->co_lu.lo_header; - wait_queue_t waiter; + wait_queue_entry_t waiter; if (unlikely(atomic_read(&header->loh_ref) != 1)) { struct lu_site *site = obj->co_lu.lo_dev->ld_site; diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h index 391c632365ae..e889d3a7de9c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h @@ -370,7 +370,7 @@ struct lov_thread_info { struct ost_lvb lti_lvb; struct cl_2queue lti_cl2q; struct cl_page_list lti_plist; - wait_queue_t lti_waiter; + wait_queue_entry_t lti_waiter; struct cl_attr lti_attr; }; diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c index ab3ecfeeadc8..eddabbe31e5c 100644 --- a/drivers/staging/lustre/lustre/lov/lov_object.c +++ b/drivers/staging/lustre/lustre/lov/lov_object.c @@ -371,7 +371,7 @@ static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, struct lov_layout_raid0 *r0; struct lu_site *site; struct lu_site_bkt_data *bkt; - wait_queue_t *waiter; + wait_queue_entry_t *waiter; r0 = &lov->u.raid0; LASSERT(r0->lo_sub[idx] == los); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c index abcf951208d2..76ae600ae2c8 100644 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -556,7 +556,7 @@ EXPORT_SYMBOL(lu_object_print); static struct lu_object *htable_lookup(struct lu_site *s, struct cfs_hash_bd *bd, const struct lu_fid *f, - wait_queue_t *waiter, + wait_queue_entry_t *waiter, __u64 *version) { struct lu_site_bkt_data *bkt; @@ -670,7 +670,7 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, struct lu_device *dev, const struct lu_fid *f, const struct lu_object_conf *conf, - wait_queue_t *waiter) + wait_queue_entry_t *waiter) { struct lu_object *o; struct lu_object *shadow; @@ -750,7 +750,7 @@ struct lu_object *lu_object_find_at(const struct lu_env *env, { struct lu_site_bkt_data *bkt; struct lu_object *obj; - wait_queue_t wait; + wait_queue_entry_t wait; while (1) { obj = lu_object_find_try(env, dev, f, conf, &wait); diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 31885f20fc15..cc047de72e2a 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -184,7 +184,7 @@ static void hdlcdev_exit(struct slgt_info *info); struct cond_wait { struct cond_wait *next; wait_queue_head_t q; - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int data; }; static void init_cond_wait(struct cond_wait *w, unsigned int data); diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 27c89cd5d70b..4797217e5e72 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -43,7 +43,7 @@ static void virqfd_deactivate(struct virqfd *virqfd) queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); } -static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int virqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct virqfd *virqfd = container_of(wait, struct virqfd, wait); unsigned long flags = (unsigned long)key; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 042030e5a035..e4613a3c362d 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -165,7 +165,7 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, add_wait_queue(wqh, &poll->wait); } -static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, +static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f55671d53f28..f72095868b93 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -31,7 +31,7 @@ struct vhost_work { struct vhost_poll { poll_table table; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct vhost_work work; unsigned long mask; struct vhost_dev *dev; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..974f5346458a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -83,7 +83,7 @@ struct autofs_info { struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; /* We use the following to see what we are waiting for */ struct qstr name; u32 dev; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 24a58bf9ca72..7071895b0678 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -104,7 +104,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, size_t pktsz; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ @@ -120,7 +120,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*mp); - mp->wait_queue_token = wq->wait_queue_token; + mp->wait_queue_entry_token = wq->wait_queue_entry_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; @@ -133,7 +133,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*ep); - ep->wait_queue_token = wq->wait_queue_token; + ep->wait_queue_entry_token = wq->wait_queue_entry_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = '\0'; @@ -153,7 +153,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pktsz = sizeof(*packet); - packet->wait_queue_token = wq->wait_queue_token; + packet->wait_queue_entry_token = wq->wait_queue_entry_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; @@ -428,7 +428,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; + wq->wait_queue_entry_token = autofs4_next_wait_queue; if (++autofs4_next_wait_queue == 0) autofs4_next_wait_queue = 1; wq->next = sbi->queues; @@ -461,7 +461,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); /* @@ -471,7 +471,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, + (unsigned long) wq->wait_queue_entry_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(qstr.name); @@ -550,13 +550,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_entry_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if (wq->wait_queue_token == wait_queue_token) + if (wq->wait_queue_entry_token == wait_queue_entry_token) break; } diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 9bf90bcc56ac..54a4fcd679ed 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -97,7 +97,7 @@ struct cachefiles_cache { * backing file read tracking */ struct cachefiles_one_read { - wait_queue_t monitor; /* link into monitored waitqueue */ + wait_queue_entry_t monitor; /* link into monitored waitqueue */ struct page *back_page; /* backing file page we're waiting for */ struct page *netfs_page; /* netfs page we're going to fill */ struct fscache_retrieval *op; /* retrieval op covering this */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 41df8a27d7eb..3978b324cbca 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -204,7 +204,7 @@ wait_for_old_object: wait_queue_head_t *wq; signed long timeout = 60 * HZ; - wait_queue_t wait; + wait_queue_entry_t wait; bool requeue; /* if the object we're waiting for is queued for processing, diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index afbdc418966d..8be33b33b981 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -21,7 +21,7 @@ * - we use this to detect read completion of backing pages * - the caller holds the waitqueue lock */ -static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, +static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, int sync, void *_key) { struct cachefiles_one_read *monitor = diff --git a/fs/dax.c b/fs/dax.c index 2a6889b3585f..323ea481d4a8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -84,7 +84,7 @@ struct exceptional_entry_key { }; struct wait_exceptional_entry_queue { - wait_queue_t wait; + wait_queue_entry_t wait; struct exceptional_entry_key key; }; @@ -108,7 +108,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; diff --git a/fs/eventfd.c b/fs/eventfd.c index 68b9fffcb2c8..9736df2ce89d 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -191,7 +191,7 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5420767c9b68..5ac1cba5ef72 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -244,7 +244,7 @@ struct eppoll_entry { * Wait queue item that will be linked to the target file wait * queue head. */ - wait_queue_t wait; + wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; @@ -347,13 +347,13 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } -static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem *ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } @@ -1078,7 +1078,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) * mechanism. It is called by the stored file descriptors when they * have events to report. */ -static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; unsigned long flags; @@ -1699,7 +1699,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int res = 0, eavail, timed_out = 0; unsigned long flags; u64 slack = 0; - wait_queue_t wait; + wait_queue_entry_t wait; ktime_t expires, *to = NULL; if (timeout > 0) { diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 611b5408f6ec..7b447a245760 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -34,7 +34,7 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m) void pin_kill(struct fs_pin *p) { - wait_queue_t wait; + wait_queue_entry_t wait; if (!p) { rcu_read_unlock(); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c08c46a3b8cd..be5a8f84e5bb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6372,7 +6372,7 @@ struct nfs4_lock_waiter { }; static int -nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { int ret; struct cb_notify_lock_args *cbnl = key; @@ -6415,7 +6415,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .inode = state->inode, .owner = &owner, .notified = false }; - wait_queue_t wait; + wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index febed1217b3f..775304e7f96f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2161,7 +2161,7 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino) } struct nilfs_segctor_wait_request { - wait_queue_t wq; + wait_queue_entry_t wq; __u32 seq; int err; atomic_t done; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 83b506020718..9e37b7028ea4 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -47,7 +47,7 @@ static void run_down(struct slot_map *m) if (m->c != -1) { for (;;) { if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail(&m->q, &wait); + __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); if (m->c == -1) @@ -85,7 +85,7 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; if (likely(list_empty(&wait.task_list))) - __add_wait_queue_tail_exclusive(&m->q, &wait); + __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); if (m->c > 0) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 39bb1e838d8d..a11d773e5ff3 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2956,7 +2956,7 @@ void reiserfs_wait_on_write_block(struct super_block *s) static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; + wait_queue_entry_t wait; struct reiserfs_journal *journal = SB_JOURNAL(s); set_bit(J_WRITERS_QUEUED, &journal->j_state); diff --git a/fs/select.c b/fs/select.c index d6c652a31e99..5b524a977d91 100644 --- a/fs/select.c +++ b/fs/select.c @@ -180,7 +180,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -206,7 +206,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct poll_table_entry *entry; diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e3d71109f51..593b022ac11b 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -43,7 +43,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) if (likely(!waitqueue_active(wqh))) return; - /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + /* wait_queue_entry_t->func(POLLFREE) should do remove_wait_queue() */ wake_up_poll(wqh, POLLHUP | POLLFREE); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1d622f276e3a..bda64fcd8a0c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -81,7 +81,7 @@ struct userfaultfd_unmap_ctx { struct userfaultfd_wait_queue { struct uffd_msg msg; - wait_queue_t wq; + wait_queue_entry_t wq; struct userfaultfd_ctx *ctx; bool waken; }; @@ -91,7 +91,7 @@ struct userfaultfd_wake_range { unsigned long len; }; -static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, +static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { struct userfaultfd_wake_range *range = key; @@ -860,7 +860,7 @@ wakeup: static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_head_t *wqh) { - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; VM_BUG_ON(!spin_is_locked(&wqh->lock)); @@ -1747,7 +1747,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) { struct userfaultfd_ctx *ctx = f->private_data; - wait_queue_t *wq; + wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fcd641032f8d..95ba83806c5d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -33,7 +33,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; - wait_queue_t dispatch_wait; + wait_queue_entry_t dispatch_wait; atomic_t wait_index; struct blk_mq_tags *tags; diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index ff0b981f078e..9e4befd95bc7 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -37,7 +37,7 @@ struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); -int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); #else /* CONFIG_EVENTFD */ @@ -73,7 +73,7 @@ static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, } static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, - wait_queue_t *wait, __u64 *cnt) + wait_queue_entry_t *wait, __u64 *cnt) { return -ENOSYS; } diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h index 0c1de05098c8..76c2fbc59f35 100644 --- a/include/linux/kvm_irqfd.h +++ b/include/linux/kvm_irqfd.h @@ -46,7 +46,7 @@ struct kvm_kernel_irqfd_resampler { struct kvm_kernel_irqfd { /* Used for MSI fast-path */ struct kvm *kvm; - wait_queue_t wait; + wait_queue_entry_t wait; /* Update side is protected by irqfds.lock */ struct kvm_kernel_irq_routing_entry irq_entry; seqcount_t irq_entry_sc; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 316a19f6b635..e7bbd9d4dc6c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -524,7 +524,7 @@ void page_endio(struct page *page, bool is_write, int err); /* * Add an arbitrary waiter to a page's wait queue */ -extern void add_page_wait_queue(struct page *page, wait_queue_t *waiter); +extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. diff --git a/include/linux/poll.h b/include/linux/poll.h index 75ffc5729e4c..2889f09a1c60 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -75,7 +75,7 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) struct poll_table_entry { struct file *filp; unsigned long key; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_address; }; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index edf9b2cad277..f57076b958b7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -183,7 +183,7 @@ struct virqfd { void (*thread)(void *, void *); void *data; struct work_struct inject; - wait_queue_t wait; + wait_queue_entry_t wait; poll_table pt; struct work_struct shutdown; struct virqfd **pvirqfd; diff --git a/include/linux/wait.h b/include/linux/wait.h index db076ca7f11d..5889f0c86ff7 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -10,15 +10,18 @@ #include #include -typedef struct __wait_queue wait_queue_t; -typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); -int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); +typedef struct wait_queue_entry wait_queue_entry_t; +typedef int (*wait_queue_func_t)(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); +int default_wake_function(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -/* __wait_queue::flags */ +/* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 -struct __wait_queue { +/* + * A single wait-queue entry structure: + */ +struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; @@ -34,7 +37,7 @@ struct wait_bit_key { struct wait_bit_queue { struct wait_bit_key key; - wait_queue_t wait; + wait_queue_entry_t wait; }; struct __wait_queue_head { @@ -55,7 +58,7 @@ struct task_struct; .task_list = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ - wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk) + wait_queue_entry_t name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ @@ -88,7 +91,7 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif -static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) +static inline void init_waitqueue_entry(wait_queue_entry_t *q, struct task_struct *p) { q->flags = 0; q->private = p; @@ -96,7 +99,7 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) } static inline void -init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) +init_waitqueue_func_entry(wait_queue_entry_t *q, wait_queue_func_t func) { q->flags = 0; q->private = NULL; @@ -159,11 +162,11 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq) return waitqueue_active(wq); } -extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); -extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait); -extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait); +extern void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); +extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait); +extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); -static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) +static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *new) { list_add(&new->task_list, &head->task_list); } @@ -172,27 +175,27 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) * Used for wake-one threads: */ static inline void -__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(q, wait); } -static inline void __add_wait_queue_tail(wait_queue_head_t *head, - wait_queue_t *new) +static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, + wait_queue_entry_t *new) { list_add_tail(&new->task_list, &head->task_list); } static inline void -__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); } static inline void -__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) +__remove_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *old) { list_del(&old->task_list); } @@ -249,7 +252,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ -extern void init_wait_entry(wait_queue_t *__wait, int flags); +extern void init_wait_entry(wait_queue_entry_t *__wait, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret @@ -266,7 +269,7 @@ extern void init_wait_entry(wait_queue_t *__wait, int flags); #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ - wait_queue_t __wait; \ + wait_queue_entry_t __wait; \ long __ret = ret; /* explicit shadow */ \ \ init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ @@ -620,8 +623,8 @@ do { \ __ret; \ }) -extern int do_wait_intr(wait_queue_head_t *, wait_queue_t *); -extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_t *); +extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *); +extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); #define __wait_event_interruptible_locked(wq, condition, exclusive, fn) \ ({ \ @@ -967,17 +970,17 @@ do { \ /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ -void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); -void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); +void prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); +void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait); +long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout); +int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); +int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); +int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ - wait_queue_t name = { \ + wait_queue_entry_t name = { \ .private = current, \ .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..75e612a45824 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -62,7 +62,7 @@ struct unix_sock { #define UNIX_GC_CANDIDATE 0 #define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; - wait_queue_t peer_wake; + wait_queue_entry_t peer_wake; }; static inline struct unix_sock *unix_sk(const struct sock *sk) diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index aa63451ef20a..1953f8d6063b 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -26,7 +26,7 @@ #define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION /* - * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed + * The wait_queue_entry_token (autofs_wqt_t) is part of a structure which is passed * back to the kernel via ioctl from userspace. On architectures where 32- and * 64-bit userspace binaries can be executed it's important that the size of * autofs_wqt_t stays constant between 32- and 64-bit Linux kernels so that we @@ -49,7 +49,7 @@ struct autofs_packet_hdr { struct autofs_packet_missing { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 7c6da423d54e..65b72d0222e7 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -108,7 +108,7 @@ enum autofs_notify { /* v4 multi expire (via pipe) */ struct autofs_packet_expire_multi { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; int len; char name[NAME_MAX+1]; }; @@ -123,7 +123,7 @@ union autofs_packet_union { /* autofs v5 common packet struct */ struct autofs_v5_packet { struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; + autofs_wqt_t wait_queue_entry_token; __u32 dev; __u64 ino; __u32 uid; diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..7d694437ab44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1004,7 +1004,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; - wait_queue_t child_wait; + wait_queue_entry_t child_wait; int notask_error; }; @@ -1541,7 +1541,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } -static int child_wait_callback(wait_queue_t *wait, unsigned mode, +static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..d6cf71d08f21 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -225,7 +225,7 @@ struct futex_pi_state { * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * - * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so * we can wake only the relevant ones (hashed queues may be shared). * * A futex_q has a woken state, just like tasks have TASK_RUNNING. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, if (!x->done) { DECLARE_WAITQUEUE(wait, current); - __add_wait_queue_tail_exclusive(&x->wait, &wait); + __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 326d4f88e2b1..5b36644536ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3687,7 +3687,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { return try_to_wake_up(curr->private, mode, wake_flags); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..301ea02dede0 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -21,7 +21,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -32,18 +32,18 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -66,7 +66,7 @@ EXPORT_SYMBOL(remove_wait_queue); static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { - wait_queue_t *curr, *next; + wait_queue_entry_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; @@ -170,7 +170,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; @@ -184,20 +184,20 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; wait->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_t *wait, int flags) +void init_wait_entry(wait_queue_entry_t *wait, int flags) { wait->flags = flags; wait->private = current; @@ -206,7 +206,7 @@ void init_wait_entry(wait_queue_t *wait, int flags) } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) { unsigned long flags; long ret = 0; @@ -230,7 +230,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) } else { if (list_empty(&wait->task_list)) { if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); + __add_wait_queue_entry_tail(q, wait); else __add_wait_queue(q, wait); } @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); * condition in the caller before they add the wait * entry to the wake queue. */ -int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) } EXPORT_SYMBOL(do_wait_intr); -int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) +int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { if (likely(list_empty(&wait->task_list))) - __add_wait_queue_tail(wq, wait); + __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) @@ -290,7 +290,7 @@ EXPORT_SYMBOL(do_wait_intr_irq); * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) { unsigned long flags; @@ -316,7 +316,7 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); @@ -351,7 +351,7 @@ static inline bool is_kthread_should_stop(void) * remove_wait_queue(&wq, &wait); * */ -long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -375,7 +375,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -391,7 +391,7 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) } EXPORT_SYMBOL(woken_wake_function); -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit @@ -534,7 +534,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) return bit_waitqueue(p, 0); } -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, +static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) EXPORT_SYMBOL_GPL(flush_work); struct cwt_wait { - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct *work; }; -static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); diff --git a/mm/filemap.c b/mm/filemap.c index 6f1be573a5e6..80c19ee81e95 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -768,10 +768,10 @@ struct wait_page_key { struct wait_page_queue { struct page *page; int bit_nr; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int wake_page_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct wait_page_key *key = arg; struct wait_page_queue *wait_page @@ -834,7 +834,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, bool lock) { struct wait_page_queue wait_page; - wait_queue_t *wait = &wait_page.wait; + wait_queue_entry_t *wait = &wait_page.wait; int ret = 0; init_wait(wait); @@ -847,7 +847,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, if (likely(list_empty(&wait->task_list))) { if (lock) - __add_wait_queue_tail_exclusive(q, wait); + __add_wait_queue_entry_tail_exclusive(q, wait); else __add_wait_queue(q, wait); SetPageWaiters(page); @@ -907,7 +907,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -void add_page_wait_queue(struct page *page, wait_queue_t *waiter) +void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter) { wait_queue_head_t *q = page_waitqueue(page); unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94172089f52f..9a90b096dc6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -170,7 +170,7 @@ struct mem_cgroup_event { */ poll_table pt; wait_queue_head_t *wqh; - wait_queue_t wait; + wait_queue_entry_t wait; struct work_struct remove; }; @@ -1479,10 +1479,10 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { struct mem_cgroup *memcg; - wait_queue_t wait; + wait_queue_entry_t wait; }; -static int memcg_oom_wake_function(wait_queue_t *wait, +static int memcg_oom_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; @@ -3725,7 +3725,7 @@ static void memcg_event_remove(struct work_struct *work) * * Called with wqh->lock held and interrupts disabled. */ -static int memcg_event_wake(wait_queue_t *wait, unsigned mode, +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct mem_cgroup_event *event = diff --git a/mm/mempool.c b/mm/mempool.c index 47a659dedd44..1c0294858527 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -312,7 +312,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); diff --git a/mm/shmem.c b/mm/shmem.c index e67d6ba4e98e..a6c7dece4660 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1902,7 +1902,7 @@ unlock: * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->task_list); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 7bc2208b6cc4..dca3cdd1a014 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -95,7 +95,7 @@ enum { struct p9_poll_wait { struct p9_conn *conn; - wait_queue_t wait; + wait_queue_entry_t wait; wait_queue_head_t *wait_addr; }; @@ -522,7 +522,7 @@ error: clear_bit(Wworksched, &m->wsched); } -static int p9_pollwake(wait_queue_t *wait, unsigned int mode, int sync, void *key) +static int p9_pollwake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct p9_poll_wait *pwait = container_of(wait, struct p9_poll_wait, wait); diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index fbf251fef70f..5c4808b3da2d 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -484,7 +484,7 @@ static int bnep_session(void *arg) struct net_device *dev = s->dev; struct sock *sk = s->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG(""); diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 9e59b6654126..14f7c8135c31 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -280,7 +280,7 @@ static int cmtp_session(void *arg) struct cmtp_session *session = arg; struct sock *sk = session->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + wait_queue_entry_t wait; BT_DBG("session %p", session); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 0bec4588c3c8..fc31161e98f2 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1244,7 +1244,7 @@ static void hidp_session_run(struct hidp_session *session) static int hidp_session_thread(void *arg) { struct hidp_session *session = arg; - wait_queue_t ctrl_wait, intr_wait; + wait_queue_entry_t ctrl_wait, intr_wait; BT_DBG("session %p", session); diff --git a/net/core/datagram.c b/net/core/datagram.c index db1866f2ffcf..34678828e2bb 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { unsigned long bits = (unsigned long)key; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1a0c961f4ffe..c77ced0109b7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -343,7 +343,7 @@ found: * are still connected to it and there's no way to inform "a polling * implementation" that it should let go of a certain wait queue * - * In order to propagate a wake up, a wait_queue_t of the client + * In order to propagate a wake up, a wait_queue_entry_t of the client * socket is enqueued on the peer_wait queue of the server socket * whose wake function does a wake_up on the ordinary client socket * wait queue. This connection is established whenever a write (or @@ -352,7 +352,7 @@ found: * was relayed. */ -static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags, +static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, void *key) { struct unix_sock *u; diff --git a/sound/core/control.c b/sound/core/control.c index c109b82eef4b..6362da17ac3f 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1577,7 +1577,7 @@ static ssize_t snd_ctl_read(struct file *file, char __user *buffer, struct snd_ctl_event ev; struct snd_kctl_event *kev; while (list_empty(&ctl->events)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; goto __end_lock; diff --git a/sound/core/hwdep.c b/sound/core/hwdep.c index 9602a7e38d8a..a73baa1242be 100644 --- a/sound/core/hwdep.c +++ b/sound/core/hwdep.c @@ -85,7 +85,7 @@ static int snd_hwdep_open(struct inode *inode, struct file * file) int major = imajor(inode); struct snd_hwdep *hw; int err; - wait_queue_t wait; + wait_queue_entry_t wait; if (major == snd_major) { hw = snd_lookup_minor_data(iminor(inode), diff --git a/sound/core/init.c b/sound/core/init.c index 6bda8436d765..d61d2b3cd521 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -989,7 +989,7 @@ EXPORT_SYMBOL(snd_card_file_remove); */ int snd_power_wait(struct snd_card *card, unsigned int power_state) { - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; /* fastpath */ diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 36baf962f9b0..cd8b7bef8d06 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1554,7 +1554,7 @@ static int snd_pcm_oss_sync1(struct snd_pcm_substream *substream, size_t size) ssize_t result = 0; snd_pcm_state_t state; long res; - wait_queue_t wait; + wait_queue_entry_t wait; runtime = substream->runtime; init_waitqueue_entry(&wait, current); @@ -2387,7 +2387,7 @@ static int snd_pcm_oss_open(struct inode *inode, struct file *file) struct snd_pcm_oss_file *pcm_oss_file; struct snd_pcm_oss_setup setup[2]; int nonblock; - wait_queue_t wait; + wait_queue_entry_t wait; err = nonseekable_open(inode, file); if (err < 0) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 5088d4b8db22..dd5254077ef7 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1904,7 +1904,7 @@ static int wait_for_avail(struct snd_pcm_substream *substream, { struct snd_pcm_runtime *runtime = substream->runtime; int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; - wait_queue_t wait; + wait_queue_entry_t wait; int err = 0; snd_pcm_uframes_t avail = 0; long wait_time, tout; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 13dec5ec93f2..faa2e2be6f2e 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -1652,7 +1652,7 @@ static int snd_pcm_drain(struct snd_pcm_substream *substream, struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_pcm_substream *s; - wait_queue_t wait; + wait_queue_entry_t wait; int result = 0; int nonblock = 0; @@ -2353,7 +2353,7 @@ static int snd_pcm_capture_open(struct inode *inode, struct file *file) static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) { int err; - wait_queue_t wait; + wait_queue_entry_t wait; if (pcm == NULL) { err = -ENODEV; diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c index ab890336175f..32588ad05653 100644 --- a/sound/core/rawmidi.c +++ b/sound/core/rawmidi.c @@ -368,7 +368,7 @@ static int snd_rawmidi_open(struct inode *inode, struct file *file) int err; struct snd_rawmidi *rmidi; struct snd_rawmidi_file *rawmidi_file = NULL; - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_APPEND) && !(file->f_flags & O_NONBLOCK)) return -EINVAL; /* invalid combination */ @@ -1002,7 +1002,7 @@ static ssize_t snd_rawmidi_read(struct file *file, char __user *buf, size_t coun while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready(substream)) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1306,7 +1306,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, while (count > 0) { spin_lock_irq(&runtime->lock); while (!snd_rawmidi_ready_append(substream, count)) { - wait_queue_t wait; + wait_queue_entry_t wait; if (file->f_flags & O_NONBLOCK) { spin_unlock_irq(&runtime->lock); return result > 0 ? result : -EAGAIN; @@ -1338,7 +1338,7 @@ static ssize_t snd_rawmidi_write(struct file *file, const char __user *buf, if (file->f_flags & O_DSYNC) { spin_lock_irq(&runtime->lock); while (runtime->avail != runtime->buffer_size) { - wait_queue_t wait; + wait_queue_entry_t wait; unsigned int last_avail = runtime->avail; init_waitqueue_entry(&wait, current); add_wait_queue(&runtime->sleep, &wait); diff --git a/sound/core/seq/seq_fifo.c b/sound/core/seq/seq_fifo.c index 01c4cfe30c9f..a8c2822e0198 100644 --- a/sound/core/seq/seq_fifo.c +++ b/sound/core/seq/seq_fifo.c @@ -179,7 +179,7 @@ int snd_seq_fifo_cell_out(struct snd_seq_fifo *f, { struct snd_seq_event_cell *cell; unsigned long flags; - wait_queue_t wait; + wait_queue_entry_t wait; if (snd_BUG_ON(!f)) return -EINVAL; diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index d4c61ec9be13..d6e9aacdc36b 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -227,7 +227,7 @@ static int snd_seq_cell_alloc(struct snd_seq_pool *pool, struct snd_seq_event_cell *cell; unsigned long flags; int err = -EAGAIN; - wait_queue_t wait; + wait_queue_entry_t wait; if (pool == NULL) return -EINVAL; diff --git a/sound/core/timer.c b/sound/core/timer.c index cd67d1c12cf1..884c3066b028 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -1964,7 +1964,7 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, spin_lock_irq(&tu->qlock); while ((long)count - result >= unit) { while (!tu->qused) { - wait_queue_t wait; + wait_queue_entry_t wait; if ((file->f_flags & O_NONBLOCK) != 0 || result > 0) { err = -EAGAIN; diff --git a/sound/isa/wavefront/wavefront_synth.c b/sound/isa/wavefront/wavefront_synth.c index 4dae9ff9ef5a..0b1e4b34b299 100644 --- a/sound/isa/wavefront/wavefront_synth.c +++ b/sound/isa/wavefront/wavefront_synth.c @@ -1782,7 +1782,7 @@ wavefront_should_cause_interrupt (snd_wavefront_t *dev, int val, int port, unsigned long timeout) { - wait_queue_t wait; + wait_queue_entry_t wait; init_waitqueue_entry(&wait, current); spin_lock_irq(&dev->irq_lock); diff --git a/sound/pci/mixart/mixart_core.c b/sound/pci/mixart/mixart_core.c index dccf3db48fe0..8bf2ce32d4a8 100644 --- a/sound/pci/mixart/mixart_core.c +++ b/sound/pci/mixart/mixart_core.c @@ -239,7 +239,7 @@ int snd_mixart_send_msg(struct mixart_mgr *mgr, struct mixart_msg *request, int struct mixart_msg resp; u32 msg_frame = 0; /* set to 0, so it's no notification to wait for, but the answer */ int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; init_waitqueue_entry(&wait, current); @@ -284,7 +284,7 @@ int snd_mixart_send_msg_wait_notif(struct mixart_mgr *mgr, struct mixart_msg *request, u32 notif_event) { int err; - wait_queue_t wait; + wait_queue_entry_t wait; long timeout; if (snd_BUG_ON(!notif_event)) diff --git a/sound/pci/ymfpci/ymfpci_main.c b/sound/pci/ymfpci/ymfpci_main.c index fe4ba463b57c..1114166c685c 100644 --- a/sound/pci/ymfpci/ymfpci_main.c +++ b/sound/pci/ymfpci/ymfpci_main.c @@ -781,7 +781,7 @@ static snd_pcm_uframes_t snd_ymfpci_capture_pointer(struct snd_pcm_substream *su static void snd_ymfpci_irq_wait(struct snd_ymfpci *chip) { - wait_queue_t wait; + wait_queue_entry_t wait; int loops = 4; while (loops-- > 0) { diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a8d540398bbd..9120edf3c94b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -184,7 +184,7 @@ int __attribute__((weak)) kvm_arch_set_irq_inatomic( * Called with wqh->lock held and interrupts disabled */ static int -irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct kvm_kernel_irqfd *irqfd = container_of(wait, struct kvm_kernel_irqfd, wait); -- cgit v1.3-14-g43fede From 50816c48997af857d4bab3dca1aba90339705e96 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 10:33:16 +0100 Subject: sched/wait: Standardize internal naming of wait-queue entries So the various wait-queue entry variables in include/linux/wait.h and kernel/sched/wait.c are named in a colorfully inconsistent way: wait_queue_entry_t *wait wait_queue_entry_t *__wait (even in plain C code!) wait_queue_entry_t *q (!) wait_queue_entry_t *new (making anyone who knows C++ cringe) wait_queue_entry_t *old I think part of the reason for the inconsistency is the constant apparent confusion about what a wait queue 'head' versus 'entry' is. ( Some of the documentation talks about a 'wait descriptor', which is the wait-queue entry itself - further adding to the confusion. ) The most common name is 'wait', but that in itself is somewhat ambiguous as well, as it does not really make it clear whether it's a wait-queue entry or head. To improve all this name the wait-queue entry structure parameters and variables consistently and push through this naming into all the wait.h and wait.c code: struct wait_queue_entry *wq_entry The 'wq_' prefix makes it easy to grep for, and we also use the opportunity to move away from the typedef to a plain 'struct' naming: in the kernel we typically reserve typedefs for cases where a C structure is really small and somewhat opaque - such as pte_t. wait-queue entries are neither small nor opaque, so use the more standard 'struct xxx_entry' list management code nomenclature instead. ( We don't touch external users, and we preserve the typedef as well for actual wait-queue users, to reduce unnecessary churn. ) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 84 ++++++++++++++++++++++---------------------- kernel/sched/wait.c | 98 ++++++++++++++++++++++++++-------------------------- 2 files changed, 91 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5889f0c86ff7..77fdea851d8b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -11,8 +11,9 @@ #include typedef struct wait_queue_entry wait_queue_entry_t; -typedef int (*wait_queue_func_t)(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -int default_wake_function(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); + +typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); +int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key); /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 @@ -37,7 +38,7 @@ struct wait_bit_key { struct wait_bit_queue { struct wait_bit_key key; - wait_queue_entry_t wait; + struct wait_queue_entry wait; }; struct __wait_queue_head { @@ -58,7 +59,7 @@ struct task_struct; .task_list = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ - wait_queue_entry_t name = __WAITQUEUE_INITIALIZER(name, tsk) + struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ @@ -91,19 +92,19 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif -static inline void init_waitqueue_entry(wait_queue_entry_t *q, struct task_struct *p) +static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p) { - q->flags = 0; - q->private = p; - q->func = default_wake_function; + wq_entry->flags = 0; + wq_entry->private = p; + wq_entry->func = default_wake_function; } static inline void -init_waitqueue_func_entry(wait_queue_entry_t *q, wait_queue_func_t func) +init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func) { - q->flags = 0; - q->private = NULL; - q->func = func; + wq_entry->flags = 0; + wq_entry->private = NULL; + wq_entry->func = func; } /** @@ -162,42 +163,41 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq) return waitqueue_active(wq); } -extern void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); -extern void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait); -extern void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait); +extern void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); +extern void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); -static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *new) +static inline void __add_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) { - list_add(&new->task_list, &head->task_list); + list_add(&wq_entry->task_list, &head->task_list); } /* * Used for wake-one threads: */ static inline void -__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) +__add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue(q, wq_entry); } -static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, - wait_queue_entry_t *new) +static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) { - list_add_tail(&new->task_list, &head->task_list); + list_add_tail(&wq_entry->task_list, &head->task_list); } static inline void -__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) +__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { - wait->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_entry_tail(q, wait); + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_entry_tail(q, wq_entry); } static inline void -__remove_wait_queue(wait_queue_head_t *head, wait_queue_entry_t *old) +__remove_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) { - list_del(&old->task_list); + list_del(&wq_entry->task_list); } typedef int wait_bit_action_f(struct wait_bit_key *, int mode); @@ -252,7 +252,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ -extern void init_wait_entry(wait_queue_entry_t *__wait, int flags); +extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); /* * The below macro ___wait_event() has an explicit shadow of the __ret @@ -269,12 +269,12 @@ extern void init_wait_entry(wait_queue_entry_t *__wait, int flags); #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ - wait_queue_entry_t __wait; \ + struct wait_queue_entry __wq_entry; \ long __ret = ret; /* explicit shadow */ \ \ - init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ + init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);\ for (;;) { \ - long __int = prepare_to_wait_event(&wq, &__wait, state);\ + long __int = prepare_to_wait_event(&wq, &__wq_entry, state);\ \ if (condition) \ break; \ @@ -286,7 +286,7 @@ extern void init_wait_entry(wait_queue_entry_t *__wait, int flags); \ cmd; \ } \ - finish_wait(&wq, &__wait); \ + finish_wait(&wq, &__wq_entry); \ __out: __ret; \ }) @@ -970,17 +970,17 @@ do { \ /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ -void prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); -void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state); -void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait); -long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout); -int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); -int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); -int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key); +void prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); +long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); +void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ - wait_queue_entry_t name = { \ + struct wait_queue_entry name = { \ .private = current, \ .func = function, \ .task_list = LIST_HEAD_INIT((name).task_list), \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 301ea02dede0..c37b3140763e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -21,34 +21,34 @@ void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_c EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) +void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wait); + __add_wait_queue_entry_tail(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, wait_queue_entry_t *wait) +void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); + __remove_wait_queue(q, wq_entry); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -170,43 +170,43 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags &= ~WQ_FLAG_EXCLUSIVE; + wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); + if (list_empty(&wq_entry->task_list)) + __add_wait_queue(q, wq_entry); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; - wait->flags |= WQ_FLAG_EXCLUSIVE; + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_entry_tail(q, wait); + if (list_empty(&wq_entry->task_list)) + __add_wait_queue_entry_tail(q, wq_entry); set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void init_wait_entry(wait_queue_entry_t *wait, int flags) +void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { - wait->flags = flags; - wait->private = current; - wait->func = autoremove_wake_function; - INIT_LIST_HEAD(&wait->task_list); + wq_entry->flags = flags; + wq_entry->private = current; + wq_entry->func = autoremove_wake_function; + INIT_LIST_HEAD(&wq_entry->task_list); } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int state) +long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; @@ -225,14 +225,14 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_entry_t *wait, int s * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); ret = -ERESTARTSYS; } else { - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_entry_tail(q, wait); + if (list_empty(&wq_entry->task_list)) { + if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_entry_tail(q, wq_entry); else - __add_wait_queue(q, wait); + __add_wait_queue(q, wq_entry); } set_current_state(state); } @@ -284,13 +284,13 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on - * @wait: wait descriptor + * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) +void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_entry_t *wait) * have _one_ other CPU that looks at or modifies * the list). */ - if (!list_empty_careful(&wait->task_list)) { + if (!list_empty_careful(&wq_entry->task_list)) { spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); spin_unlock_irqrestore(&q->lock, flags); } } EXPORT_SYMBOL(finish_wait); -int autoremove_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { - int ret = default_wake_function(wait, mode, sync, key); + int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wait->task_list); + list_del_init(&wq_entry->task_list); return ret; } EXPORT_SYMBOL(autoremove_wake_function); @@ -341,17 +341,17 @@ static inline bool is_kthread_should_stop(void) * * p->state = mode; condition = true; * smp_mb(); // A smp_wmb(); // C - * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; * schedule() try_to_wake_up(); * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ - * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; * smp_mb() // B smp_wmb(); // C - * wait->flags |= WQ_FLAG_WOKEN; + * wq_entry->flags |= WQ_FLAG_WOKEN; * } * remove_wait_queue(&wq, &wait); * */ -long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) +long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) { set_current_state(mode); /* A */ /* @@ -359,7 +359,7 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must * also observe all state before the wakeup. */ - if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); @@ -369,13 +369,13 @@ long wait_woken(wait_queue_entry_t *wait, unsigned mode, long timeout) * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss * an event. */ - smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ return timeout; } EXPORT_SYMBOL(wait_woken); -int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) { /* * Although this function is called under waitqueue lock, LOCK @@ -385,24 +385,24 @@ int woken_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void * and is paired with smp_store_mb() in wait_woken(). */ smp_wmb(); /* C */ - wait->flags |= WQ_FLAG_WOKEN; + wq_entry->flags |= WQ_FLAG_WOKEN; - return default_wake_function(wait, mode, sync, key); + return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); -int wake_bit_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wait); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || test_bit(key->bit_nr, key->flags)) return 0; else - return autoremove_wake_function(wait, mode, sync, key); + return autoremove_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(wake_bit_function); @@ -534,19 +534,19 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) return bit_waitqueue(p, 0); } -static int wake_atomic_t_function(wait_queue_entry_t *wait, unsigned mode, int sync, +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wait); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || atomic_read(val) != 0) return 0; - return autoremove_wake_function(wait, mode, sync, key); + return autoremove_wake_function(wq_entry, mode, sync, key); } /* -- cgit v1.3-14-g43fede From 9d9d676f595b5081326be7a17dc681fcb38fb3b2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:10:18 +0100 Subject: sched/wait: Standardize internal naming of wait-queue heads The wait-queue head parameters and variables are named in a couple of ways, we have the following variants currently: wait_queue_head_t *q wait_queue_head_t *wq wait_queue_head_t *head In particular the 'wq' naming is ambiguous in the sense whether it's a wait-queue head or entry name - as entries were often named 'wait'. ( Not to mention the confusion of any readers coming over from workqueue-land. ) Standardize all this around a single, unambiguous parameter and variable name: struct wait_queue_head *wq_head which is easy to grep for and also rhymes nicely with the wait-queue entry naming: struct wait_queue_entry *wq_entry Also rename: struct __wait_queue_head => struct wait_queue_head ... and use this struct type to migrate from typedefs usage to 'struct' usage, which is more in line with existing kernel practices. Don't touch any external users and preserve the main wait_queue_head_t typedef. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 76 ++++++++++++------------- kernel/sched/wait.c | 154 +++++++++++++++++++++++++-------------------------- 2 files changed, 115 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 77fdea851d8b..c3d1cefc7853 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -41,11 +41,11 @@ struct wait_bit_queue { struct wait_queue_entry wait; }; -struct __wait_queue_head { +struct wait_queue_head { spinlock_t lock; struct list_head task_list; }; -typedef struct __wait_queue_head wait_queue_head_t; +typedef struct wait_queue_head wait_queue_head_t; struct task_struct; @@ -66,7 +66,7 @@ struct task_struct; .task_list = { &(name).task_list, &(name).task_list } } #define DECLARE_WAIT_QUEUE_HEAD(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } @@ -74,20 +74,20 @@ struct task_struct; #define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } -extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *); +extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); -#define init_waitqueue_head(q) \ +#define init_waitqueue_head(wq_head) \ do { \ static struct lock_class_key __key; \ \ - __init_waitqueue_head((q), #q, &__key); \ + __init_waitqueue_head((wq_head), #wq_head, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ ({ init_waitqueue_head(&name); name; }) # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) #else # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name) #endif @@ -109,14 +109,14 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f /** * waitqueue_active -- locklessly test for waiters on the queue - * @q: the waitqueue to test for waiters + * @wq_head: the waitqueue to test for waiters * * returns true if the wait list is not empty * * NOTE: this function is lockless and requires care, incorrect usage _will_ * lead to sporadic and non-obvious failure. * - * Use either while holding wait_queue_head_t::lock or when used for wakeups + * Use either while holding wait_queue_head::lock or when used for wakeups * with an extra smp_mb() like: * * CPU0 - waker CPU1 - waiter @@ -137,9 +137,9 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), * which (when the lock is uncontended) are of roughly equal cost. */ -static inline int waitqueue_active(wait_queue_head_t *q) +static inline int waitqueue_active(struct wait_queue_head *wq_head) { - return !list_empty(&q->task_list); + return !list_empty(&wq_head->task_list); } /** @@ -150,7 +150,7 @@ static inline int waitqueue_active(wait_queue_head_t *q) * * Please refer to the comment for waitqueue_active. */ -static inline bool wq_has_sleeper(wait_queue_head_t *wq) +static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) { /* * We need to be sure we are in sync with the @@ -160,62 +160,62 @@ static inline bool wq_has_sleeper(wait_queue_head_t *wq) * waiting side. */ smp_mb(); - return waitqueue_active(wq); + return waitqueue_active(wq_head); } -extern void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); -extern void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); -extern void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); +extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); -static inline void __add_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) +static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&wq_entry->task_list, &head->task_list); + list_add(&wq_entry->task_list, &wq_head->task_list); } /* * Used for wake-one threads: */ static inline void -__add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue(q, wq_entry); + __add_wait_queue(wq_head, wq_entry); } -static inline void __add_wait_queue_entry_tail(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) +static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add_tail(&wq_entry->task_list, &head->task_list); + list_add_tail(&wq_entry->task_list, &wq_head->task_list); } static inline void -__add_wait_queue_entry_tail_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - __add_wait_queue_entry_tail(q, wq_entry); + __add_wait_queue_entry_tail(wq_head, wq_entry); } static inline void -__remove_wait_queue(wait_queue_head_t *head, struct wait_queue_entry *wq_entry) +__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { list_del(&wq_entry->task_list); } typedef int wait_bit_action_f(struct wait_bit_key *, int mode); -void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); -void __wake_up_bit(wait_queue_head_t *, void *, int); -int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); -int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); +void __wake_up_bit(struct wait_queue_head *, void *, int); +int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); +int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); void wake_up_bit(void *, int); void wake_up_atomic_t(atomic_t *); int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long); int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); -wait_queue_head_t *bit_waitqueue(void *, int); +struct wait_queue_head *bit_waitqueue(void *, int); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -970,10 +970,10 @@ do { \ /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ -void prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); -void prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); -long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state); -void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry); +void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c37b3140763e..203aeea96f16 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -12,44 +12,44 @@ #include #include -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); + spin_lock_init(&wq_head->lock); + lockdep_set_class_and_name(&wq_head->lock, key, name); + INIT_LIST_HEAD(&wq_head->task_list); } EXPORT_SYMBOL(__init_waitqueue_head); -void add_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); -void add_wait_queue_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_entry_tail(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue_entry_tail(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue_exclusive); -void remove_wait_queue(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wq_entry); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __remove_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(remove_wait_queue); @@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, /** * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function @@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(q, mode, nr, 0, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue + * @wq_head: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets @@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); * It may be assumed that this function implies a write memory barrier before * changing the task state if and only if any tasks are woken up. */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!q)) + if (unlikely(!wq_head)) return; if (unlikely(nr_exclusive != 1)) wake_flags = 0; - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); + __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) { - __wake_up_sync_key(q, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ @@ -170,30 +170,30 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * loads to move into the critical region). */ void -prepare_to_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (list_empty(&wq_entry->task_list)) - __add_wait_queue(q, wq_entry); + __add_wait_queue(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); void -prepare_to_wait_exclusive(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (list_empty(&wq_entry->task_list)) - __add_wait_queue_entry_tail(q, wq_entry); + __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); @@ -206,12 +206,12 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) } EXPORT_SYMBOL(init_wait_entry); -long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_entry, int state) +long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; long ret = 0; - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); if (unlikely(signal_pending_state(state, current))) { /* * Exclusive waiter must not fail if it was selected by wakeup, @@ -219,7 +219,7 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent * * The caller will recheck the condition and return success if * we were already woken up, we can not miss the event because - * wakeup locks/unlocks the same q->lock. + * wakeup locks/unlocks the same wq_head->lock. * * But we need to ensure that set-condition + wakeup after that * can't see us, it should wake up another exclusive waiter if @@ -230,13 +230,13 @@ long prepare_to_wait_event(wait_queue_head_t *q, struct wait_queue_entry *wq_ent } else { if (list_empty(&wq_entry->task_list)) { if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_entry_tail(q, wq_entry); + __add_wait_queue_entry_tail(wq_head, wq_entry); else - __add_wait_queue(q, wq_entry); + __add_wait_queue(wq_head, wq_entry); } set_current_state(state); } - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); return ret; } @@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq); /** * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on + * @wq_head: waitqueue waited on * @wq_entry: wait descriptor * * Sets current thread back to running state and removes * the wait descriptor from the given waitqueue if still * queued. */ -void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) +void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -309,9 +309,9 @@ void finish_wait(wait_queue_head_t *q, struct wait_queue_entry *wq_entry) * the list). */ if (!list_empty_careful(&wq_entry->task_list)) { - spin_lock_irqsave(&q->lock, flags); + spin_lock_irqsave(&wq_head->lock, flags); list_del_init(&wq_entry->task_list); - spin_unlock_irqrestore(&q->lock, flags); + spin_unlock_irqrestore(&wq_head->lock, flags); } } EXPORT_SYMBOL(finish_wait); @@ -334,7 +334,7 @@ static inline bool is_kthread_should_stop(void) /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); * - * add_wait_queue(&wq, &wait); + * add_wait_queue(&wq_head, &wait); * for (;;) { * if (condition) * break; @@ -348,7 +348,7 @@ static inline bool is_kthread_should_stop(void) * smp_mb() // B smp_wmb(); // C * wq_entry->flags |= WQ_FLAG_WOKEN; * } - * remove_wait_queue(&wq, &wait); + * remove_wait_queue(&wq_head, &wait); * */ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) @@ -412,17 +412,17 @@ EXPORT_SYMBOL(wake_bit_function); * permitted return codes. Nonzero return codes halt waiting and return. */ int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { int ret = 0; do { - prepare_to_wait(wq, &q->wait, mode); + prepare_to_wait(wq_head, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -430,10 +430,10 @@ EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned mode) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); - return __wait_on_bit(wq, &wait, action, mode); + return __wait_on_bit(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit); @@ -441,36 +441,36 @@ int __sched out_of_line_wait_on_bit_timeout( void *word, int bit, wait_bit_action_f *action, unsigned mode, unsigned long timeout) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq, &wait, action, mode); + return __wait_on_bit(wq_head, &wait, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) { int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq, &q->wait, mode); + prepare_to_wait_exclusive(wq_head, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) { ret = action(&q->key, mode); /* * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wq->lock, + * finish_wait() does not necessarily takes wwq_head->lock, * but test_and_set_bit() implies mb() which pairs with * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); } if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { if (!ret) - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return 0; } else if (ret) { return ret; @@ -482,18 +482,18 @@ EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, wait_bit_action_f *action, unsigned mode) { - wait_queue_head_t *wq = bit_waitqueue(word, bit); + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); - return __wait_on_bit_lock(wq, &wait, action, mode); + return __wait_on_bit_lock(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); } EXPORT_SYMBOL(__wake_up_bit); @@ -555,20 +555,20 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo * return codes halt waiting and return. */ static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int (*action)(atomic_t *), unsigned mode) { atomic_t *val; int ret = 0; do { - prepare_to_wait(wq, &q->wait, mode); + prepare_to_wait(wq_head, &q->wait, mode); val = q->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); + finish_wait(wq_head, &q->wait); return ret; } @@ -586,10 +586,10 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), unsigned mode) { - wait_queue_head_t *wq = atomic_t_waitqueue(p); + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wait, p); - return __wait_on_atomic_t(wq, &wait, action, mode); + return __wait_on_atomic_t(wq_head, &wait, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); -- cgit v1.3-14-g43fede From 2141713616c652aeabf2dd5c1e89bc601c4fed6a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:25:39 +0100 Subject: sched/wait: Standardize 'struct wait_bit_queue' wait-queue entry field name Rename 'struct wait_bit_queue::wait' to ::wq_entry, to more clearly name it as a wait-queue entry. Propagate it to a couple of usage sites where the wait-bit-queue internals are exposed. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/inode.c | 8 ++++---- fs/jbd2/journal.c | 4 ++-- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_inode.c | 8 ++++---- include/linux/wait.h | 6 +++--- kernel/sched/wait.c | 41 ++++++++++++++++++++--------------------- 6 files changed, 35 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index db5914783a71..70761d6cafcd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1891,11 +1891,11 @@ static void __wait_on_freeing_inode(struct inode *inode) wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); spin_lock(&inode_hash_lock); } @@ -2038,11 +2038,11 @@ static void __inode_dio_wait(struct inode *inode) DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); do { - prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); if (atomic_read(&inode->i_dio_count)) schedule(); } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wait); + finish_wait(wq, &q.wq_entry); } /** diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ebad34266bcf..7d5ef3bf3f3e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2579,10 +2579,10 @@ restart: wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&journal->j_list_lock); schedule(); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); goto restart; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 990210fcb9c3..b9c12e1cc23a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -269,12 +269,12 @@ xfs_inew_wait( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (!xfs_iflags_test(ip, XFS_INEW)) break; schedule(); } while (true); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ec9826c56500..c0a1e840a588 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -622,12 +622,12 @@ __xfs_iflock( DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); do { - prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_isiflocked(ip)) io_schedule(); } while (!xfs_iflock_nowait(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } STATIC uint @@ -2486,11 +2486,11 @@ __xfs_iunpin_wait( xfs_iunpin(ip); do { - prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (xfs_ipincount(ip)) io_schedule(); } while (xfs_ipincount(ip)); - finish_wait(wq, &wait.wait); + finish_wait(wq, &wait.wq_entry); } void diff --git a/include/linux/wait.h b/include/linux/wait.h index c3d1cefc7853..1c8add685f22 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -38,7 +38,7 @@ struct wait_bit_key { struct wait_bit_queue { struct wait_bit_key key; - struct wait_queue_entry wait; + struct wait_queue_entry wq_entry; }; struct wait_queue_head { @@ -991,11 +991,11 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync #define DEFINE_WAIT_BIT(name, word, bit) \ struct wait_bit_queue name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ - .wait = { \ + .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ }, \ } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 203aeea96f16..f1ba0625b8be 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -395,7 +395,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync { struct wait_bit_key *key = arg; struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wait); + = container_of(wq_entry, struct wait_bit_queue, wq_entry); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || @@ -418,11 +418,11 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int ret = 0; do { - prepare_to_wait(wq_head, &q->wait, mode); + prepare_to_wait(wq_head, &q->wq_entry, mode); if (test_bit(q->key.bit_nr, q->key.flags)) ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -431,9 +431,9 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - return __wait_on_bit(wq_head, &wait, action, mode); + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit); @@ -442,10 +442,10 @@ int __sched out_of_line_wait_on_bit_timeout( unsigned mode, unsigned long timeout) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - wait.key.timeout = jiffies + timeout; - return __wait_on_bit(wq_head, &wait, action, mode); + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); @@ -456,7 +456,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq_head, &q->wait, mode); + prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode); if (test_bit(q->key.bit_nr, q->key.flags)) { ret = action(&q->key, mode); /* @@ -466,11 +466,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); } if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { if (!ret) - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return 0; } else if (ret) { return ret; @@ -483,9 +483,9 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit, wait_bit_action_f *action, unsigned mode) { struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); - return __wait_on_bit_lock(wq_head, &wait, action, mode); + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); @@ -538,8 +538,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wait); + struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || @@ -562,24 +561,24 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q int ret = 0; do { - prepare_to_wait(wq_head, &q->wait, mode); + prepare_to_wait(wq_head, &q->wq_entry, mode); val = q->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &q->wait); + finish_wait(wq_head, &q->wq_entry); return ret; } #define DEFINE_WAIT_ATOMIC_T(name, p) \ struct wait_bit_queue name = { \ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ + .wq_entry = { \ .private = current, \ .func = wake_atomic_t_function, \ .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ }, \ } @@ -587,9 +586,9 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), unsigned mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); - return __wait_on_atomic_t(wq_head, &wait, action, mode); + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); -- cgit v1.3-14-g43fede From 76c85ddc4695bb7b8209bfeff11f5156088f9197 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 11:35:27 +0100 Subject: sched/wait: Standardize wait_bit_queue naming So wait-bit-queue head variables are often named: struct wait_bit_queue *q ... which is a bit ambiguous and super confusing, because they clearly suggest wait-queue head semantics and behavior (they rhyme with the old wait_queue_t *q naming), while they are extended wait-queue _entries_, not heads! They are misnomers in two ways: - the 'wait_bit_queue' leaves open the question of whether it's an entry or a head - the 'q' parameter and local variable naming falsely implies that it's a 'queue' - while it's an entry. This resulted in sometimes confusing cases such as: finish_wait(wq, &q->wait); where the 'q' is not a wait-queue head, but a wait-bit-queue entry. So improve this all by standardizing wait-bit-queue nomenclature similar to wait-queue head naming: struct wait_bit_queue => struct wait_bit_queue_entry q => wbq_entry Which makes it all a much clearer: struct wait_bit_queue_entry *wbq_entry ... and turns the former confusing piece of code into: finish_wait(wq_head, &wbq_entry->wq_entry; which IMHO makes it apparently clear what we are doing, without having to analyze the context of the code: we are adding a wait-queue entry to a regular wait-queue head, which entry is embedded in a wait-bit-queue entry. I'm not a big fan of acronyms, but repeating wait_bit_queue_entry in field and local variable names is too long, so Hopefully it's clear enough that 'wq_' prefixes stand for wait-queues, while 'wbq_' prefixes stand for wait-bit-queues. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 8 ++++---- kernel/sched/wait.c | 41 ++++++++++++++++++++--------------------- 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 1c8add685f22..fc7c32d82120 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -36,7 +36,7 @@ struct wait_bit_key { unsigned long timeout; }; -struct wait_bit_queue { +struct wait_bit_queue_entry { struct wait_bit_key key; struct wait_queue_entry wq_entry; }; @@ -207,8 +207,8 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_bit(struct wait_queue_head *, void *, int); -int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); -int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue *, wait_bit_action_f *, unsigned); +int __wait_on_bit(struct wait_queue_head *, struct wait_bit_queue_entry *, wait_bit_action_f *, unsigned); +int __wait_on_bit_lock(struct wait_queue_head *, struct wait_bit_queue_entry *, wait_bit_action_f *, unsigned); void wake_up_bit(void *, int); void wake_up_atomic_t(atomic_t *); int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); @@ -989,7 +989,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) #define DEFINE_WAIT_BIT(name, word, bit) \ - struct wait_bit_queue name = { \ + struct wait_bit_queue_entry name = { \ .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ .wq_entry = { \ .private = current, \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f1ba0625b8be..95e6d3820cba 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -394,8 +394,7 @@ EXPORT_SYMBOL(woken_wake_function); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wq_entry, struct wait_bit_queue, wq_entry); + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); if (wait_bit->key.flags != key->flags || wait_bit->key.bit_nr != key->bit_nr || @@ -412,17 +411,17 @@ EXPORT_SYMBOL(wake_bit_function); * permitted return codes. Nonzero return codes halt waiting and return. */ int __sched -__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned mode) { int ret = 0; do { - prepare_to_wait(wq_head, &q->wq_entry, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key, mode); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq_head, &q->wq_entry); + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); return ret; } EXPORT_SYMBOL(__wait_on_bit); @@ -450,15 +449,15 @@ int __sched out_of_line_wait_on_bit_timeout( EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); int __sched -__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned mode) { int ret = 0; for (;;) { - prepare_to_wait_exclusive(wq_head, &q->wq_entry, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) { - ret = action(&q->key, mode); + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); /* * See the comment in prepare_to_wait_event(). * finish_wait() does not necessarily takes wwq_head->lock, @@ -466,11 +465,11 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue *q, * smp_mb__after_atomic() before wake_up_page(). */ if (ret) - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); } - if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { if (!ret) - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); return 0; } else if (ret) { return ret; @@ -538,7 +537,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo void *arg) { struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit = container_of(wq_entry, struct wait_bit_queue, wq_entry); + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); atomic_t *val = key->flags; if (wait_bit->key.flags != key->flags || @@ -554,25 +553,25 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo * return codes halt waiting and return. */ static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue *q, +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, int (*action)(atomic_t *), unsigned mode) { atomic_t *val; int ret = 0; do { - prepare_to_wait(wq_head, &q->wq_entry, mode); - val = q->key.flags; + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; ret = (*action)(val); } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &q->wq_entry); + finish_wait(wq_head, &wbq_entry->wq_entry); return ret; } #define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ + struct wait_bit_queue_entry name = { \ .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ .wq_entry = { \ .private = current, \ -- cgit v1.3-14-g43fede From 5dd43ce2f69d42a71dcacdb13d17d8c0ac1fe8f7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:19:09 +0200 Subject: sched/wait: Split out the wait_bit*() APIs from into The wait_bit*() types and APIs are mixed into wait.h, but they are a pretty orthogonal extension of wait-queues. Furthermore, only about 50 kernel files use these APIs, while over 1000 use the regular wait-queue functionality. So clean up the main wait.h by moving the wait-bit functionality out of it, into a separate .h and .c file: include/linux/wait_bit.h for types and APIs kernel/sched/wait_bit.c for the implementation Update all header dependencies. This reduces the size of wait.h rather significantly, by about 30%. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- fs/cachefiles/internal.h | 2 +- fs/cifs/inode.c | 1 + fs/nfs/internal.h | 1 + include/linux/fs.h | 2 +- include/linux/sunrpc/sched.h | 2 +- include/linux/wait.h | 250 ---------------------------------------- include/linux/wait_bit.h | 260 ++++++++++++++++++++++++++++++++++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/wait.c | 257 ------------------------------------------ kernel/sched/wait_bit.c | 263 +++++++++++++++++++++++++++++++++++++++++++ security/keys/internal.h | 1 + 11 files changed, 530 insertions(+), 511 deletions(-) create mode 100644 include/linux/wait_bit.h create mode 100644 kernel/sched/wait_bit.c (limited to 'kernel') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 54a4fcd679ed..bb3a02ca9da4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4d1fcd76d022..a8693632235f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "cifsfs.h" diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3e24392f2caa..8701d7617964 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -7,6 +7,7 @@ #include #include #include +#include #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..53f7e49d8fe5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,7 +2,7 @@ #define _LINUX_FS_H #include -#include +#include #include #include #include diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 7ba040c797ec..9d7529ffc4ce 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/wait.h b/include/linux/wait.h index 0805098f3589..629489746f8a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -29,18 +29,6 @@ struct wait_queue_entry { struct list_head task_list; }; -struct wait_bit_key { - void *flags; - int bit_nr; -#define WAIT_ATOMIC_T_BIT_NR -1 - unsigned long timeout; -}; - -struct wait_bit_queue_entry { - struct wait_bit_key key; - struct wait_queue_entry wq_entry; -}; - struct wait_queue_head { spinlock_t lock; struct list_head task_list; @@ -68,12 +56,6 @@ struct task_struct; #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) -#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ - { .flags = word, .bit_nr = bit, } - -#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ - { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } - extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *); #define init_waitqueue_head(wq_head) \ @@ -200,22 +182,11 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq list_del(&wq_entry->task_list); } -typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr); -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); -int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); -int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); -void wake_up_bit(void *word, int bit); -void wake_up_atomic_t(atomic_t *p); -int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); -int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); -int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); -int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode); -struct wait_queue_head *bit_waitqueue(void *word, int bit); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -976,7 +947,6 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); -int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); #define DEFINE_WAIT_FUNC(name, function) \ struct wait_queue_entry name = { \ @@ -987,17 +957,6 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) -#define DEFINE_WAIT_BIT(name, word, bit) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ - .wq_entry = { \ - .private = current, \ - .func = wake_bit_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ - }, \ - } - #define init_wait(wait) \ do { \ (wait)->private = current; \ @@ -1006,213 +965,4 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync (wait)->flags = 0; \ } while (0) - -extern int bit_wait(struct wait_bit_key *key, int bit); -extern int bit_wait_io(struct wait_bit_key *key, int bit); -extern int bit_wait_timeout(struct wait_bit_key *key, int bit); -extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit); - -/** - * wait_on_bit - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that waits on a bit. - * For instance, if one were to have waiters on a bitflag, one would - * call wait_on_bit() in threads waiting for the bit to clear. - * One uses wait_on_bit() where one is waiting for the bit to clear, - * but has no intention of setting it. - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, - bit_wait, - mode); -} - -/** - * wait_on_bit_io - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), but calls - * io_schedule() instead of schedule() for the actual waiting. - * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit_io(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, - bit_wait_io, - mode); -} - -/** - * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * @timeout: timeout, in jiffies - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared. This is similar to wait_on_bit(), except also takes a - * timeout parameter. - * - * Returned value will be zero if the bit was cleared before the - * @timeout elapsed, or non-zero if the @timeout elapsed or process - * received a signal and the mode permitted wakeup on that signal. - */ -static inline int -wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, - unsigned long timeout) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_timeout(word, bit, - bit_wait_timeout, - mode, timeout); -} - -/** - * wait_on_bit_action - wait for a bit to be cleared - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared, and allow the waiting action to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. - * - * Returned value will be zero if the bit was cleared, or non-zero - * if the process received a signal and the mode permitted wakeup - * on that signal. - */ -static inline int -wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, - unsigned mode) -{ - might_sleep(); - if (!test_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit(word, bit, action, mode); -} - -/** - * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that waits on a bit - * when one intends to set it, for instance, trying to lock bitflags. - * For instance, if one were to have waiters trying to set bitflag - * and waiting for it to clear before setting it, one would call - * wait_on_bit() in threads waiting to be able to set the bit. - * One uses wait_on_bit_lock() where one is waiting for the bit to - * clear with the intention of setting it, and when done, clearing it. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); -} - -/** - * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to atomically set it. This is similar - * to wait_on_bit(), but calls io_schedule() instead of schedule() - * for the actual waiting. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); -} - -/** - * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Use the standard hashed waitqueue table to wait for a bit - * to be cleared and then to set it, and allow the waiting action - * to be specified. - * This is like wait_on_bit() but allows fine control of how the waiting - * is done. - * - * Returns zero if the bit was (eventually) found to be clear and was - * set. Returns non-zero if a signal was delivered to the process and - * the @mode allows that signal to wake the process. - */ -static inline int -wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, - unsigned mode) -{ - might_sleep(); - if (!test_and_set_bit(bit, word)) - return 0; - return out_of_line_wait_on_bit_lock(word, bit, action, mode); -} - -/** - * wait_on_atomic_t - Wait for an atomic_t to become 0 - * @val: The atomic value being waited on, a kernel virtual address - * @action: the function used to sleep, which may take special actions - * @mode: the task state to sleep in - * - * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for - * the purpose of getting a waitqueue, but we set the key to a bit number - * outside of the target 'word'. - */ -static inline -int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) -{ - might_sleep(); - if (atomic_read(val) == 0) - return 0; - return out_of_line_wait_on_atomic_t(val, action, mode); -} - #endif /* _LINUX_WAIT_H */ diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h new file mode 100644 index 000000000000..8c85c52d94b6 --- /dev/null +++ b/include/linux/wait_bit.h @@ -0,0 +1,260 @@ +#ifndef _LINUX_WAIT_BIT_H +#define _LINUX_WAIT_BIT_H + +/* + * Linux wait-bit related types and methods: + */ +#include + +struct wait_bit_key { + void *flags; + int bit_nr; +#define WAIT_ATOMIC_T_BIT_NR -1 + unsigned long timeout; +}; + +struct wait_bit_queue_entry { + struct wait_bit_key key; + struct wait_queue_entry wq_entry; +}; + +#define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ + { .flags = word, .bit_nr = bit, } + +#define __WAIT_ATOMIC_T_KEY_INITIALIZER(p) \ + { .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, } + +typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); +int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); +int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); +void wake_up_bit(void *word, int bit); +void wake_up_atomic_t(atomic_t *p); +int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); +int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); +int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); +int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode); +struct wait_queue_head *bit_waitqueue(void *word, int bit); + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); + +#define DEFINE_WAIT_BIT(name, word, bit) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ + .wq_entry = { \ + .private = current, \ + .func = wake_bit_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ + }, \ + } + +extern int bit_wait(struct wait_bit_key *key, int bit); +extern int bit_wait_io(struct wait_bit_key *key, int bit); +extern int bit_wait_timeout(struct wait_bit_key *key, int bit); +extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit); + +/** + * wait_on_bit - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that waits on a bit. + * For instance, if one were to have waiters on a bitflag, one would + * call wait_on_bit() in threads waiting for the bit to clear. + * One uses wait_on_bit() where one is waiting for the bit to clear, + * but has no intention of setting it. + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait, + mode); +} + +/** + * wait_on_bit_io - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), but calls + * io_schedule() instead of schedule() for the actual waiting. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit_io(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, + bit_wait_io, + mode); +} + +/** + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * @timeout: timeout, in jiffies + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), except also takes a + * timeout parameter. + * + * Returned value will be zero if the bit was cleared before the + * @timeout elapsed, or non-zero if the @timeout elapsed or process + * received a signal and the mode permitted wakeup on that signal. + */ +static inline int +wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, + unsigned long timeout) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_timeout(word, bit, + bit_wait_timeout, + mode, timeout); +} + +/** + * wait_on_bit_action - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared, and allow the waiting action to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returned value will be zero if the bit was cleared, or non-zero + * if the process received a signal and the mode permitted wakeup + * on that signal. + */ +static inline int +wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit(word, bit, action, mode); +} + +/** + * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that waits on a bit + * when one intends to set it, for instance, trying to lock bitflags. + * For instance, if one were to have waiters trying to set bitflag + * and waiting for it to clear before setting it, one would call + * wait_on_bit() in threads waiting to be able to set the bit. + * One uses wait_on_bit_lock() where one is waiting for the bit to + * clear with the intention of setting it, and when done, clearing it. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait, mode); +} + +/** + * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to atomically set it. This is similar + * to wait_on_bit(), but calls io_schedule() instead of schedule() + * for the actual waiting. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock_io(unsigned long *word, int bit, unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, bit_wait_io, mode); +} + +/** + * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared and then to set it, and allow the waiting action + * to be specified. + * This is like wait_on_bit() but allows fine control of how the waiting + * is done. + * + * Returns zero if the bit was (eventually) found to be clear and was + * set. Returns non-zero if a signal was delivered to the process and + * the @mode allows that signal to wake the process. + */ +static inline int +wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, + unsigned mode) +{ + might_sleep(); + if (!test_and_set_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_lock(word, bit, action, mode); +} + +/** + * wait_on_atomic_t - Wait for an atomic_t to become 0 + * @val: The atomic value being waited on, a kernel virtual address + * @action: the function used to sleep, which may take special actions + * @mode: the task state to sleep in + * + * Wait for an atomic_t to become 0. We abuse the bit-wait waitqueue table for + * the purpose of getting a waitqueue, but we set the key to a bit number + * outside of the target 'word'. + */ +static inline +int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) +{ + might_sleep(); + if (atomic_read(val) == 0) + return 0; + return out_of_line_wait_on_atomic_t(val, action, mode); +} + +#endif /* _LINUX_WAIT_BIT_H */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..16277e2ed8ee 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -17,7 +17,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o swait.o completion.o idle.o +obj-y += wait.o wait_bit.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 95e6d3820cba..6bcd7c3c4501 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -390,260 +390,3 @@ int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sy return default_wake_function(wq_entry, mode, sync, key); } EXPORT_SYMBOL(woken_wake_function); - -int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wq_entry, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) - ret = (*action)(&wbq_entry->key, mode); - } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - return __wait_on_bit(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched out_of_line_wait_on_bit_timeout( - void *word, int bit, wait_bit_action_f *action, - unsigned mode, unsigned long timeout) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - wq_entry.key.timeout = jiffies + timeout; - return __wait_on_bit(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); - -int __sched -__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - wait_bit_action_f *action, unsigned mode) -{ - int ret = 0; - - for (;;) { - prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); - if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { - ret = action(&wbq_entry->key, mode); - /* - * See the comment in prepare_to_wait_event(). - * finish_wait() does not necessarily takes wwq_head->lock, - * but test_and_set_bit() implies mb() which pairs with - * smp_mb__after_atomic() before wake_up_page(). - */ - if (ret) - finish_wait(wq_head, &wbq_entry->wq_entry); - } - if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { - if (!ret) - finish_wait(wq_head, &wbq_entry->wq_entry); - return 0; - } else if (ret) { - return ret; - } - } -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - wait_bit_action_f *action, unsigned mode) -{ - struct wait_queue_head *wq_head = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wq_entry, word, bit); - - return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq_head)) - __wake_up(wq_head, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_atomic(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wq_entry, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); - val = wbq_entry->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq_head, &wbq_entry->wq_entry); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue_entry name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wq_entry = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - struct wait_queue_head *wq_head = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wq_entry, p); - - return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); - -__sched int bit_wait(struct wait_bit_key *word, int mode) -{ - schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait); - -__sched int bit_wait_io(struct wait_bit_key *word, int mode) -{ - io_schedule(); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL(bit_wait_io); - -__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_timeout); - -__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) -{ - unsigned long now = READ_ONCE(jiffies); - if (time_after_eq(now, word->timeout)) - return -EAGAIN; - io_schedule_timeout(word->timeout - now); - if (signal_pending_state(mode, current)) - return -EINTR; - return 0; -} -EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c new file mode 100644 index 000000000000..463bac84dfd1 --- /dev/null +++ b/kernel/sched/wait_bit.c @@ -0,0 +1,263 @@ +/* + * The implementation of the wait_bit*() and related waiting APIs: + */ +#include +#include +#include + +int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wq_entry, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) + ret = (*action)(&wbq_entry->key, mode); + } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + wq_entry.key.timeout = jiffies + timeout; + return __wait_on_bit(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + +int __sched +__wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + wait_bit_action_f *action, unsigned mode) +{ + int ret = 0; + + for (;;) { + prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); + if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + ret = action(&wbq_entry->key, mode); + /* + * See the comment in prepare_to_wait_event(). + * finish_wait() does not necessarily takes wwq_head->lock, + * but test_and_set_bit() implies mb() which pairs with + * smp_mb__after_atomic() before wake_up_page(). + */ + if (ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + } + if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { + if (!ret) + finish_wait(wq_head, &wbq_entry->wq_entry); + return 0; + } else if (ret) { + return ret; + } + } +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + wait_bit_action_f *action, unsigned mode) +{ + struct wait_queue_head *wq_head = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wq_entry, word, bit); + + return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq_head)) + __wake_up(wq_head, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_atomic(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wq_entry, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); + val = wbq_entry->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq_head, &wbq_entry->wq_entry); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue_entry name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wq_entry = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wq_entry.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + struct wait_queue_head *wq_head = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wq_entry, p); + + return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word, int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word, int mode) +{ + io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) +{ + unsigned long now = READ_ONCE(jiffies); + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/security/keys/internal.h b/security/keys/internal.h index c0f8682eba69..91bc6214ae57 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -13,6 +13,7 @@ #define _INTERNAL_H #include +#include #include #include #include -- cgit v1.3-14-g43fede From 5822a454d6d22297c5fcd66264120587b2ec21cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Mar 2017 13:09:07 +0100 Subject: sched/wait: Move bit_wait_table[] and related functionality from sched/core.c to sched/wait_bit.c The key hashed waitqueue data structures and their initialization was done in the main scheduler file for no good reason, move them to sched/wait_bit.c instead. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait_bit.h | 1 + kernel/sched/core.c | 18 ++---------------- kernel/sched/wait_bit.c | 23 +++++++++++++++++++++++ 3 files changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 8c85c52d94b6..9cc82114dbcb 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -35,6 +35,7 @@ int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode); struct wait_queue_head *bit_waitqueue(void *word, int bit); +extern void __init wait_bit_init(void); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b36644536ab..e7b9ef8df126 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -6026,28 +6027,13 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); -#define WAIT_TABLE_BITS 8 -#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) -static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - unsigned long val = (unsigned long)word << shift | bit; - - return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); -} -EXPORT_SYMBOL(bit_waitqueue); - void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; sched_clock_init(); - - for (i = 0; i < WAIT_TABLE_SIZE; i++) - init_waitqueue_head(bit_wait_table + i); + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 463bac84dfd1..c891b34e1896 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -4,6 +4,21 @@ #include #include #include +#include + +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) + +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) { @@ -261,3 +276,11 @@ __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); + +void __init wait_bit_init(void) +{ + int i; + + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); +} -- cgit v1.3-14-g43fede From 2055da97389a605c8a00d163d40903afbe413921 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:06:46 +0200 Subject: sched/wait: Disambiguate wq_entry->task_list and wq_head->task_list naming So I've noticed a number of instances where it was not obvious from the code whether ->task_list was for a wait-queue head or a wait-queue entry. Furthermore, there's a number of wait-queue users where the lists are not for 'tasks' but other entities (poll tables, etc.), in which case the 'task_list' name is actively confusing. To clear this all up, name the wait-queue head and entry list structure fields unambiguously: struct wait_queue_head::task_list => ::head struct wait_queue_entry::task_list => ::entry For example, this code: rqw->wait.task_list.next != &wait->task_list ... is was pretty unclear (to me) what it's doing, while now it's written this way: rqw->wait.head.next != &wait->entry ... which makes it pretty clear that we are iterating a list until we see the head. Other examples are: list_for_each_entry_safe(pos, next, &x->task_list, task_list) { list_for_each_entry(wq, &fence->wait.task_list, task_list) { ... where it's unclear (to me) what we are iterating, and during review it's hard to tell whether it's trying to walk a wait-queue entry (which would be a bug), while now it's written as: list_for_each_entry_safe(pos, next, &x->head, entry) { list_for_each_entry(wq, &fence->wait.head, entry) { Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- block/blk-mq.c | 2 +- block/blk-wbt.c | 2 +- block/kyber-iosched.c | 8 ++++---- drivers/gpu/drm/i915/i915_sw_fence.c | 21 ++++++++++----------- drivers/rtc/rtc-imxdi.c | 2 +- fs/cachefiles/rdwr.c | 2 +- fs/eventpoll.c | 2 +- fs/fs_pin.c | 2 +- fs/nilfs2/segment.c | 3 +-- fs/orangefs/orangefs-bufmap.c | 8 ++++---- fs/userfaultfd.c | 22 +++++++++++----------- include/linux/wait.h | 20 ++++++++++---------- include/linux/wait_bit.h | 4 ++-- kernel/sched/wait.c | 24 ++++++++++++------------ kernel/sched/wait_bit.c | 4 ++-- mm/filemap.c | 2 +- mm/memcontrol.c | 2 +- mm/shmem.c | 4 ++-- 18 files changed, 66 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index a083f95e04b1..121aa1dbb192 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -933,7 +933,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int fla hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); - list_del(&wait->task_list); + list_del(&wait->entry); clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); blk_mq_run_hw_queue(hctx, true); return 1; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 5f3a37c2784c..6a9a0f03a67b 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -520,7 +520,7 @@ static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, * in line to be woken up, wait for our turn. */ if (waitqueue_active(&rqw->wait) && - rqw->wait.task_list.next != &wait->task_list) + rqw->wait.head.next != &wait->entry) return false; return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b95d6bd714c0..9bf1484365b2 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -385,7 +385,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { INIT_LIST_HEAD(&khd->rqs[i]); - INIT_LIST_HEAD(&khd->domain_wait[i].task_list); + INIT_LIST_HEAD(&khd->domain_wait[i].entry); atomic_set(&khd->wait_index[i], 0); } @@ -512,7 +512,7 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, { struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); blk_mq_run_hw_queue(hctx, true); return 1; } @@ -536,7 +536,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * run when one becomes available. Note that this is serialized on * khd->lock, but we still need to be careful about the waker. */ - if (list_empty_careful(&wait->task_list)) { + if (list_empty_careful(&wait->entry)) { init_waitqueue_func_entry(wait, kyber_domain_wake); wait->private = hctx; ws = sbq_wait_ptr(domain_tokens, @@ -736,7 +736,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ struct kyber_hctx_data *khd = hctx->sched_data; \ wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ \ - seq_printf(m, "%d\n", !list_empty_careful(&wait->task_list)); \ + seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ return 0; \ } KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c index 8669bfa33064..380de4360b8a 100644 --- a/drivers/gpu/drm/i915/i915_sw_fence.c +++ b/drivers/gpu/drm/i915/i915_sw_fence.c @@ -160,31 +160,30 @@ static void __i915_sw_fence_wake_up_all(struct i915_sw_fence *fence, /* * To prevent unbounded recursion as we traverse the graph of - * i915_sw_fences, we move the task_list from this, the next ready - * fence, to the tail of the original fence's task_list + * i915_sw_fences, we move the entry list from this, the next ready + * fence, to the tail of the original fence's entry list * (and so added to the list to be woken). */ spin_lock_irqsave_nested(&x->lock, flags, 1 + !!continuation); if (continuation) { - list_for_each_entry_safe(pos, next, &x->task_list, task_list) { + list_for_each_entry_safe(pos, next, &x->head, entry) { if (pos->func == autoremove_wake_function) pos->func(pos, TASK_NORMAL, 0, continuation); else - list_move_tail(&pos->task_list, continuation); + list_move_tail(&pos->entry, continuation); } } else { LIST_HEAD(extra); do { - list_for_each_entry_safe(pos, next, - &x->task_list, task_list) + list_for_each_entry_safe(pos, next, &x->head, entry) pos->func(pos, TASK_NORMAL, 0, &extra); if (list_empty(&extra)) break; - list_splice_tail_init(&extra, &x->task_list); + list_splice_tail_init(&extra, &x->head); } while (1); } spin_unlock_irqrestore(&x->lock, flags); @@ -256,7 +255,7 @@ void i915_sw_fence_commit(struct i915_sw_fence *fence) static int i915_sw_fence_wake(wait_queue_entry_t *wq, unsigned mode, int flags, void *key) { - list_del(&wq->task_list); + list_del(&wq->entry); __i915_sw_fence_complete(wq->private, key); i915_sw_fence_put(wq->private); if (wq->flags & I915_SW_FENCE_FLAG_ALLOC) @@ -275,7 +274,7 @@ static bool __i915_sw_fence_check_if_after(struct i915_sw_fence *fence, if (fence == signaler) return true; - list_for_each_entry(wq, &fence->wait.task_list, task_list) { + list_for_each_entry(wq, &fence->wait.head, entry) { if (wq->func != i915_sw_fence_wake) continue; @@ -293,7 +292,7 @@ static void __i915_sw_fence_clear_checked_bit(struct i915_sw_fence *fence) if (!__test_and_clear_bit(I915_SW_FENCE_CHECKED_BIT, &fence->flags)) return; - list_for_each_entry(wq, &fence->wait.task_list, task_list) { + list_for_each_entry(wq, &fence->wait.head, entry) { if (wq->func != i915_sw_fence_wake) continue; @@ -350,7 +349,7 @@ static int __i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence, pending |= I915_SW_FENCE_FLAG_ALLOC; } - INIT_LIST_HEAD(&wq->task_list); + INIT_LIST_HEAD(&wq->entry); wq->flags = pending; wq->func = i915_sw_fence_wake; wq->private = i915_sw_fence_get(fence); diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 6b54f6c24c5f..80931114c899 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -709,7 +709,7 @@ static irqreturn_t dryice_irq(int irq, void *dev_id) /*If the write wait queue is empty then there is no pending operations. It means the interrupt is for DryIce -Security. IRQ must be returned as none.*/ - if (list_empty_careful(&imxdi->write_wait.task_list)) + if (list_empty_careful(&imxdi->write_wait.head)) return rc; /* DSR_WCF clears itself on DSR read */ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8be33b33b981..18d7aa61ef0f 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -48,7 +48,7 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, } /* remove from the waitqueue */ - list_del(&wait->task_list); + list_del(&wait->entry); /* move onto the action list and queue for FS-Cache thread pool */ ASSERT(monitor->op); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ac1cba5ef72..b1c8e23ddf65 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1094,7 +1094,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * can't use __remove_wait_queue(). whead->lock is held by * the caller. */ - list_del_init(&wait->task_list); + list_del_init(&wait->entry); } spin_lock_irqsave(&ep->lock, flags); diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 7b447a245760..e747b3d720ee 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -61,7 +61,7 @@ void pin_kill(struct fs_pin *p) rcu_read_unlock(); schedule(); rcu_read_lock(); - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) break; /* OK, we know p couldn't have been freed yet */ spin_lock_irq(&p->wait.lock); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 775304e7f96f..70ded52dc1dd 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2206,8 +2206,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) unsigned long flags; spin_lock_irqsave(&sci->sc_wait_request.lock, flags); - list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, - wq.task_list) { + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.head, wq.entry) { if (!atomic_read(&wrq->done) && nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { wrq->err = err; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 9e37b7028ea4..038d67545d9f 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -46,7 +46,7 @@ static void run_down(struct slot_map *m) spin_lock(&m->q.lock); if (m->c != -1) { for (;;) { - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) __add_wait_queue_entry_tail(&m->q, &wait); set_current_state(TASK_UNINTERRUPTIBLE); @@ -84,7 +84,7 @@ static int wait_for_free(struct slot_map *m) do { long n = left, t; - if (likely(list_empty(&wait.task_list))) + if (likely(list_empty(&wait.entry))) __add_wait_queue_entry_tail_exclusive(&m->q, &wait); set_current_state(TASK_INTERRUPTIBLE); @@ -108,8 +108,8 @@ static int wait_for_free(struct slot_map *m) left = -EINTR; } while (left > 0); - if (!list_empty(&wait.task_list)) - list_del(&wait.task_list); + if (!list_empty(&wait.entry)) + list_del(&wait.entry); else if (left <= 0 && waitqueue_active(&m->q)) __wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL); __set_current_state(TASK_RUNNING); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index bda64fcd8a0c..6148ccd6cccf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -129,7 +129,7 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, * wouldn't be enough, the smp_mb__before_spinlock is * enough to avoid an explicit smp_mb() here. */ - list_del_init(&wq->task_list); + list_del_init(&wq->entry); out: return ret; } @@ -522,13 +522,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * and it's fine not to block on the spinlock. The uwq on this * kernel stack can be released after the list_del_init. */ - if (!list_empty_careful(&uwq.wq.task_list)) { + if (!list_empty_careful(&uwq.wq.entry)) { spin_lock(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ - list_del(&uwq.wq.task_list); + list_del(&uwq.wq.entry); spin_unlock(&ctx->fault_pending_wqh.lock); } @@ -869,7 +869,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( if (!waitqueue_active(wqh)) goto out; /* walk in reverse to provide FIFO behavior to read userfaults */ - wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list); + wq = list_last_entry(&wqh->head, typeof(*wq), entry); uwq = container_of(wq, struct userfaultfd_wait_queue, wq); out: return uwq; @@ -1003,14 +1003,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in - * handle_userfault(). The uwq->wq.task_list + * handle_userfault(). The uwq->wq.head list * must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. */ - list_del(&uwq->wq.task_list); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1032,7 +1032,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, fork_nctx = (struct userfaultfd_ctx *) (unsigned long) uwq->msg.arg.reserved.reserved1; - list_move(&uwq->wq.task_list, &fork_event); + list_move(&uwq->wq.entry, &fork_event); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1069,8 +1069,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!list_empty(&fork_event)) { uwq = list_first_entry(&fork_event, typeof(*uwq), - wq.task_list); - list_del(&uwq->wq.task_list); + wq.entry); + list_del(&uwq->wq.entry); __add_wait_queue(&ctx->event_wqh, &uwq->wq); userfaultfd_event_complete(ctx, uwq); } @@ -1752,12 +1752,12 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) unsigned long pending = 0, total = 0; spin_lock(&ctx->fault_pending_wqh.lock); - list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); pending++; total++; } - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); total++; } diff --git a/include/linux/wait.h b/include/linux/wait.h index 629489746f8a..b289c96151ee 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -26,12 +26,12 @@ struct wait_queue_entry { unsigned int flags; void *private; wait_queue_func_t func; - struct list_head task_list; + struct list_head entry; }; struct wait_queue_head { spinlock_t lock; - struct list_head task_list; + struct list_head head; }; typedef struct wait_queue_head wait_queue_head_t; @@ -44,14 +44,14 @@ struct task_struct; #define __WAITQUEUE_INITIALIZER(name, tsk) { \ .private = tsk, \ .func = default_wake_function, \ - .task_list = { NULL, NULL } } + .entry = { NULL, NULL } } #define DECLARE_WAITQUEUE(name, tsk) \ struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk) #define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .task_list = { &(name).task_list, &(name).task_list } } + .head = { &(name).head, &(name).head } } #define DECLARE_WAIT_QUEUE_HEAD(name) \ struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) @@ -121,7 +121,7 @@ init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t f */ static inline int waitqueue_active(struct wait_queue_head *wq_head) { - return !list_empty(&wq_head->task_list); + return !list_empty(&wq_head->head); } /** @@ -151,7 +151,7 @@ extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&wq_entry->task_list, &wq_head->task_list); + list_add(&wq_entry->entry, &wq_head->head); } /* @@ -166,7 +166,7 @@ __add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_en static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add_tail(&wq_entry->task_list, &wq_head->task_list); + list_add_tail(&wq_entry->entry, &wq_head->head); } static inline void @@ -179,7 +179,7 @@ __add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wa static inline void __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_del(&wq_entry->task_list); + list_del(&wq_entry->entry); } void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); @@ -952,7 +952,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i struct wait_queue_entry name = { \ .private = current, \ .func = function, \ - .task_list = LIST_HEAD_INIT((name).task_list), \ + .entry = LIST_HEAD_INIT((name).entry), \ } #define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function) @@ -961,7 +961,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i do { \ (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ - INIT_LIST_HEAD(&(wait)->task_list); \ + INIT_LIST_HEAD(&(wait)->entry); \ (wait)->flags = 0; \ } while (0) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9cc82114dbcb..12b26660d7e9 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -45,8 +45,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync .wq_entry = { \ .private = current, \ .func = wake_bit_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6bcd7c3c4501..17f11c6b0a9f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -16,7 +16,7 @@ void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, st { spin_lock_init(&wq_head->lock); lockdep_set_class_and_name(&wq_head->lock, key, name); - INIT_LIST_HEAD(&wq_head->task_list); + INIT_LIST_HEAD(&wq_head->head); } EXPORT_SYMBOL(__init_waitqueue_head); @@ -68,7 +68,7 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, { wait_queue_entry_t *curr, *next; - list_for_each_entry_safe(curr, next, &wq_head->task_list, task_list) { + list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; if (curr->func(curr, mode, wake_flags, key) && @@ -176,7 +176,7 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->task_list)) + if (list_empty(&wq_entry->entry)) __add_wait_queue(wq_head, wq_entry); set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -190,7 +190,7 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->task_list)) + if (list_empty(&wq_entry->entry)) __add_wait_queue_entry_tail(wq_head, wq_entry); set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -202,7 +202,7 @@ void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) wq_entry->flags = flags; wq_entry->private = current; wq_entry->func = autoremove_wake_function; - INIT_LIST_HEAD(&wq_entry->task_list); + INIT_LIST_HEAD(&wq_entry->entry); } EXPORT_SYMBOL(init_wait_entry); @@ -225,10 +225,10 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en * can't see us, it should wake up another exclusive waiter if * we fail. */ - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); ret = -ERESTARTSYS; } else { - if (list_empty(&wq_entry->task_list)) { + if (list_empty(&wq_entry->entry)) { if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) __add_wait_queue_entry_tail(wq_head, wq_entry); else @@ -251,7 +251,7 @@ EXPORT_SYMBOL(prepare_to_wait_event); */ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) + if (likely(list_empty(&wait->entry))) __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); @@ -267,7 +267,7 @@ EXPORT_SYMBOL(do_wait_intr); int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) { - if (likely(list_empty(&wait->task_list))) + if (likely(list_empty(&wait->entry))) __add_wait_queue_entry_tail(wq, wait); set_current_state(TASK_INTERRUPTIBLE); @@ -308,9 +308,9 @@ void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_en * have _one_ other CPU that looks at or modifies * the list). */ - if (!list_empty_careful(&wq_entry->task_list)) { + if (!list_empty_careful(&wq_entry->entry)) { spin_lock_irqsave(&wq_head->lock, flags); - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); spin_unlock_irqrestore(&wq_head->lock, flags); } } @@ -321,7 +321,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i int ret = default_wake_function(wq_entry, mode, sync, key); if (ret) - list_del_init(&wq_entry->task_list); + list_del_init(&wq_entry->entry); return ret; } EXPORT_SYMBOL(autoremove_wake_function); diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c891b34e1896..f8159698aa4d 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -205,8 +205,8 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en .wq_entry = { \ .private = current, \ .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wq_entry.task_list), \ + .entry = \ + LIST_HEAD_INIT((name).wq_entry.entry), \ }, \ } diff --git a/mm/filemap.c b/mm/filemap.c index 80c19ee81e95..926484561624 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -845,7 +845,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, for (;;) { spin_lock_irq(&q->lock); - if (likely(list_empty(&wait->task_list))) { + if (likely(list_empty(&wait->entry))) { if (lock) __add_wait_queue_entry_tail_exclusive(q, wait); else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9a90b096dc6b..d75b38b66ef6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1570,7 +1570,7 @@ bool mem_cgroup_oom_synchronize(bool handle) owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); + INIT_LIST_HEAD(&owait.wait.entry); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); mem_cgroup_mark_under_oom(memcg); diff --git a/mm/shmem.c b/mm/shmem.c index a6c7dece4660..fdc413f82a99 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1905,7 +1905,7 @@ unlock: static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); - list_del_init(&wait->task_list); + list_del_init(&wait->entry); return ret; } @@ -2840,7 +2840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); - WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; -- cgit v1.3-14-g43fede From 6d3aed3d8a0573d0a6eb1160ccd0a0713f4dbc2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Jun 2017 12:24:42 +0200 Subject: sched/debug: Fix SCHED_WARN_ON() to return a value on !CONFIG_SCHED_DEBUG as well This definition of SCHED_WARN_ON(): #define SCHED_WARN_ON(x) ((void)(x)) is not fully compatible with the 'real' WARN_ON_ONCE() primitive, as it has no return value, so it cannot be used in conditionals. Fix it. Cc: Daniel Axtens Cc: Konstantin Khlebnikov Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f2ef759a4cb6..e0329d10bdb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -39,9 +39,9 @@ #include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG -#define SCHED_WARN_ON(x) WARN_ONCE(x, #x) +# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) #else -#define SCHED_WARN_ON(x) ((void)(x)) +# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif struct rq; -- cgit v1.3-14-g43fede From c5ae366e12b2bd56fc7d7e9d484836bec9ddc110 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 11 May 2017 06:11:39 +1000 Subject: sched/fair: WARN() and refuse to set buddy when !se->on_rq If we set a next or last buddy for a se that is not on_rq, we will end up taking a NULL pointer dereference in wakeup_preempt_entity via pick_next_task_fair. Detect when we would be about to do that, throw a warning and then refuse to actually set it. This has been suggested at least twice: https://marc.info/?l=linux-kernel&m=146651668921468&w=2 https://lkml.org/lkml/2016/6/16/663 I recently had to debug a problem with these (we hadn't backported Konstantin's patches in this area) and this would have saved a lot of time/pain. Just do it. Signed-off-by: Daniel Axtens Cc: Ben Segall Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170510201139.16236-1-dja@axtens.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 396bca9c7996..cb3a3da7089f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6164,8 +6164,11 @@ static void set_last_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->last = se; + } } static void set_next_buddy(struct sched_entity *se) @@ -6173,8 +6176,11 @@ static void set_next_buddy(struct sched_entity *se) if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) return; - for_each_sched_entity(se) + for_each_sched_entity(se) { + if (SCHED_WARN_ON(!se->on_rq)) + return; cfs_rq_of(se)->next = se; + } } static void set_skip_buddy(struct sched_entity *se) -- cgit v1.3-14-g43fede From f11cc0760b8397e0d230122606421b6a96e9f869 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 14 Jun 2017 19:37:30 -0700 Subject: sched/core: Drop the unused try_get_task_struct() helper function This function was introduced by: 150593bf8693 ("sched/api: Introduce task_rcu_dereference() and try_get_task_struct()") ... to allow easier usage of task_rcu_dereference(), however no users were ever added. Drop the helper. Signed-off-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/20170615023730.22827-1-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/sched/task.h | 2 -- kernel/exit.c | 13 ------------- 2 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a978d7189cfd..f0f065c5afcf 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -95,8 +95,6 @@ static inline void put_task_struct(struct task_struct *t) } struct task_struct *task_rcu_dereference(struct task_struct **ptask); -struct task_struct *try_get_task_struct(struct task_struct **ptask); - #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; diff --git a/kernel/exit.c b/kernel/exit.c index 7d694437ab44..c63226283aef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w) rcu_read_unlock(); } -struct task_struct *try_get_task_struct(struct task_struct **ptask) -{ - struct task_struct *task; - - rcu_read_lock(); - task = task_rcu_dereference(ptask); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - return task; -} - /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected -- cgit v1.3-14-g43fede From d15bc69affc57d7985a01745ca28eafa0772325b Mon Sep 17 00:00:00 2001 From: Peter Meerwald-Stadler Date: Tue, 30 May 2017 21:41:03 +0200 Subject: timers: Fix parameter description of try_to_del_timer_sync() Signed-off-by: Peter Meerwald-Stadler Link: http://lkml.kernel.org/r/20170530194103.7454-1-pmeerw@pmeerw.net Cc: John Stultz Cc: trivial@rustcorp.com.au Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 152a706ef8b8..709a404bd133 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer); /** * try_to_del_timer_sync - Try to deactivate a timer - * @timer: timer do del + * @timer: timer to delete * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. -- cgit v1.3-14-g43fede From 35eb7258c009dc478338e674a5a84d25d0929c56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 17:37:35 +0200 Subject: itimer: Make timeval to nsec conversion range limited The expiry time of a itimer is supplied through sys_setitimer() via a struct timeval. The timeval is validated for correctness. In the actual set timer implementation the timeval is converted to a scalar nanoseconds value. If the tv_sec part of the time spec is large enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit. Mitigate that by using the timeval_to_ktime() conversion function, which checks the tv_sec part for a potential mult overflow and clamps the result to KTIME_MAX, which is about 292 years. Reported-by: Xishi Qiu Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/20170620154113.505981643@linutronix.de --- kernel/time/itimer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 9dd7ff5e445a..2ef98a02376a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -152,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - nval = timeval_to_ns(&value->it_value); - ninterval = timeval_to_ns(&value->it_interval); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + nval = ktime_to_ns(timeval_to_ktime(value->it_value)); + ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); spin_lock_irq(&tsk->sighand->siglock); -- cgit v1.3-14-g43fede From 098b0e01a91c42aaaf0425605cd126b03fcb0bcf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 17:37:36 +0200 Subject: posix-cpu-timers: Make timespec to nsec conversion safe The expiry time of a posix cpu timer is supplied through sys_timer_set() via a struct timespec. The timespec is validated for correctness. In the actual set timer implementation the timespec is converted to a scalar nanoseconds value. If the tv_sec part of the time spec is large enough the conversion to nanoseconds (sec * NSEC_PER_SEC) overflows 64bit. Mitigate that by using the timespec_to_ktime() conversion function, which checks the tv_sec part for a potential mult overflow and clamps the result to KTIME_MAX, which is about 292 years. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Xishi Qiu Cc: John Stultz Link: http://lkml.kernel.org/r/20170620154113.588276707@linutronix.de --- kernel/time/posix-cpu-timers.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9df618ee64cf..60cb24ac9ebc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -580,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec64_to_ns(&new->it_value); + /* + * Use the to_ktime conversion because that clamps the maximum + * value to KTIME_MAX and avoid multiplication overflows. + */ + new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers -- cgit v1.3-14-g43fede From fc6eead7c1e2e5376c25d2795d4539fdacbc0648 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 22 May 2017 17:20:20 -0700 Subject: time: Clean up CLOCK_MONOTONIC_RAW time handling Now that we fixed the sub-ns handling for CLOCK_MONOTONIC_RAW, remove the duplicitive tk->raw_time.tv_nsec, which can be stored in tk->tkr_raw.xtime_nsec (similarly to how its handled for monotonic time). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Cc: Kevin Brodsky Cc: Will Deacon Cc: Daniel Mentz Tested-by: Daniel Mentz Signed-off-by: John Stultz --- arch/arm64/kernel/vdso.c | 6 ++--- include/linux/timekeeper_internal.h | 4 ++-- kernel/time/timekeeping.c | 45 ++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index d0cb007fa482..7492d9009610 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -220,10 +220,8 @@ void update_vsyscall(struct timekeeper *tk) if (!use_syscall) { /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_time.tv_sec; - vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec << - tk->tkr_raw.shift) + - tk->tkr_raw.xtime_nsec; + vdso_data->raw_time_sec = tk->raw_sec; + vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; vdso_data->cs_mono_mult = tk->tkr_mono.mult; diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index f7043ccca81c..0a0a53daf2a2 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -51,7 +51,7 @@ struct tk_read_base { * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_time: Monotonic raw base time in timespec64 format + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval. @@ -93,7 +93,7 @@ struct timekeeper { unsigned int clock_was_set_seq; u8 cs_was_changed_seq; ktime_t next_leap_ktime; - struct timespec64 raw_time; + u64 raw_sec; /* The following members are for timekeeping internal use */ u64 cycle_interval; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b602c48cb841..0454bfa24353 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } } static inline struct timespec64 tk_xtime(struct timekeeper *tk) @@ -285,12 +289,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; - if (shift_change < 0) + if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; - else + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } } - tk->tkr_raw.xtime_nsec = 0; tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; @@ -619,9 +625,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); - /* Update the monotonic raw base */ - tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); - /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take @@ -631,6 +634,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + seconds = tk->raw_sec; + nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); + tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); } /* must hold timekeeper_lock */ @@ -672,7 +680,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; - u64 nsec; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -684,10 +691,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); - timespec64_add_ns(&tk->raw_time, nsec); + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); } /** @@ -1373,19 +1383,18 @@ int timekeeping_notify(struct clocksource *clock) void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; unsigned long seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); - ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(&ts64, nsecs); - *ts = ts64; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic64); @@ -1509,8 +1518,7 @@ void __init timekeeping_init(void) tk_setup_internals(tk, clock); tk_set_xtime(tk, &now); - tk->raw_time.tv_sec = 0; - tk->raw_time.tv_nsec = 0; + tk->raw_sec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -2011,15 +2019,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; - tk->raw_time.tv_sec++; + tk->raw_sec++; } - tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; - tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; -- cgit v1.3-14-g43fede From 369adf04d80a7e179b9ea6d74cc01c233f142f47 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 12 May 2017 10:58:18 -0700 Subject: time: Add warning about imminent deprecation of CONFIG_GENERIC_TIME_VSYSCALL_OLD CONFIG_GENERIC_TIME_VSYSCALL_OLD was introduced five years ago to allow a transition from the old vsyscall implementations to the new method (which simplified internal accounting and made timekeeping more precise). However, PPC and IA64 have yet to make the transition, despite in some cases me sending test patches to try to help it along. http://patches.linaro.org/patch/30501/ http://patches.linaro.org/patch/35412/ If its helpful, my last pass at the patches can be found here: https://git.linaro.org/people/john.stultz/linux.git dev/oldvsyscall-cleanup So I think its time to set a deadline and make it clear this is going away. So this patch adds warnings about this functionality being dropped. Likely to be in v4.15. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Marcelo Tosatti Cc: Paul Mackerras Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Tony Luck Cc: Michael Ellerman Cc: Fenghua Yu Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0454bfa24353..cedafa008de5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -516,6 +516,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. static inline void update_vsyscall(struct timekeeper *tk) { -- cgit v1.3-14-g43fede From f160203986a6ad23ab8077c4a25b260fe55d6e26 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:06:58 +0200 Subject: irq/generic-chip: Export irq_init_generic_chip() locally This function will be used in the devres variant of irq_alloc_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-4-git-send-email-brgl@bgdev.pl --- kernel/irq/generic-chip.c | 7 +++---- kernel/irq/internals.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ee32870079c9..f7086b78ad6e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr) iowrite32be(val, addr); } -static void -irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, - int num_ct, unsigned int irq_base, - void __iomem *reg_base, irq_flow_handler_t handler) +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { raw_spin_lock_init(&gc->lock); gc->num_ct = num_ct; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bc226e783bd2..921a2419720c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -226,3 +226,14 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif + +#ifdef CONFIG_GENERIC_IRQ_CHIP +void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler); +#else +static inline void +irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, + int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) { } +#endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.3-14-g43fede From 1c3e36309fe2e94b8a889fa32cb5c871434f8ed6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:06:59 +0200 Subject: irq/generic-chip: Provide devm_irq_alloc_generic_chip() Provide a resource managed variant of irq_alloc_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-5-git-send-email-brgl@bgdev.pl --- Documentation/driver-model/devres.txt | 1 + include/linux/irq.h | 5 +++++ kernel/irq/devres.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index e72587fe477d..d473be8c8781 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -311,6 +311,7 @@ IRQ devm_irq_alloc_desc_at() devm_irq_alloc_desc_from() devm_irq_alloc_descs_from() + devm_irq_alloc_generic_chip() LED devm_led_classdev_register() diff --git a/include/linux/irq.h b/include/linux/irq.h index dc63aa10ce70..64ae54673e08 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -958,6 +958,11 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set); +struct irq_chip_generic * +devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, + unsigned int irq_base, void __iomem *reg_base, + irq_flow_handler_t handler); + struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1613bfd48365..21ee0aebccfb 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -4,6 +4,8 @@ #include #include +#include "internals.h" + /* * Device resource management aware IRQ request/free implementation. */ @@ -198,3 +200,35 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, return base; } EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); + +#ifdef CONFIG_GENERIC_IRQ_CHIP +/** + * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip + * for a managed device + * @dev: Device to allocate the generic chip for + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, + unsigned int irq_base, void __iomem *reg_base, + irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = devm_kzalloc(dev, sz, GFP_KERNEL); + if (gc) + irq_init_generic_chip(gc, name, num_ct, + irq_base, reg_base, handler); + + return gc; +} +EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); +#endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.3-14-g43fede From 30fd8fc5c91973485705f83c7efe9588b8e6f371 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 31 May 2017 18:07:00 +0200 Subject: irq/generic-chip: Provide devm_irq_setup_generic_chip() Provide a resource managed variant of irq_setup_generic_chip(). Signed-off-by: Bartosz Golaszewski Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: linux-doc@vger.kernel.org Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/1496246820-13250-6-git-send-email-brgl@bgdev.pl --- Documentation/driver-model/devres.txt | 1 + include/linux/irq.h | 3 ++ kernel/irq/devres.c | 52 +++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index d473be8c8781..6a6618f34440 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -312,6 +312,7 @@ IRQ devm_irq_alloc_desc_from() devm_irq_alloc_descs_from() devm_irq_alloc_generic_chip() + devm_irq_setup_generic_chip() LED devm_led_classdev_register() diff --git a/include/linux/irq.h b/include/linux/irq.h index 64ae54673e08..d996314b6522 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -962,6 +962,9 @@ struct irq_chip_generic * devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler); +int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, + u32 msk, enum irq_gc_flags flags, + unsigned int clr, unsigned int set); struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 21ee0aebccfb..194c506d9d20 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -231,4 +231,56 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, return gc; } EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); + +struct irq_generic_chip_devres { + struct irq_chip_generic *gc; + u32 msk; + unsigned int clr; + unsigned int set; +}; + +static void devm_irq_remove_generic_chip(struct device *dev, void *res) +{ + struct irq_generic_chip_devres *this = res; + + irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set); +} + +/** + * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic + * chip for a managed device + * + * @dev: Device to setup the generic chip for + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. + */ +int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, + u32 msk, enum irq_gc_flags flags, + unsigned int clr, unsigned int set) +{ + struct irq_generic_chip_devres *dr; + + dr = devres_alloc(devm_irq_remove_generic_chip, + sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + irq_setup_generic_chip(gc, msk, flags, clr, set); + + dr->gc = gc; + dr->msk = msk; + dr->clr = clr; + dr->set = set; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip); #endif /* CONFIG_GENERIC_IRQ_CHIP */ -- cgit v1.3-14-g43fede From 3c85d6db5e5f05ae6c3d7f5a0ceceb43746a5ca7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:00 +0200 Subject: sched/loadavg: Generalize "_idle" naming to "_nohz" The loadavg naming code still assumes that nohz == idle whereas its code is actually handling well both nohz idle and nohz full. So lets fix the naming according to what the code actually does, to unconfuse the reader. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 2 +- include/linux/sched/nohz.h | 8 +++---- kernel/sched/loadavg.c | 51 +++++++++++++++++++++--------------------- kernel/time/tick-sched.c | 4 ++-- 4 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 94a987bd2bc5..fff8ff6d4893 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set. -0 3dN.2 14us : sched_avg_update <-__cpu_load_update -0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz -0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock - -0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit + -0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit -0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit -0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit -0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 4995b717500b..7d3f75db23e5 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -23,11 +23,11 @@ static inline void set_cpu_sd_state_idle(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON -void calc_load_enter_idle(void); -void calc_load_exit_idle(void); +void calc_load_nohz_start(void); +void calc_load_nohz_stop(void); #else -static inline void calc_load_enter_idle(void) { } -static inline void calc_load_exit_idle(void) { } +static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * load-average relies on per-cpu sampling from the tick, it is affected by * NO_HZ. * - * The basic idea is to fold the nr_active delta into a global idle-delta upon + * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * entering NO_HZ state such that we can include this as an 'extra' cpu delta * when we read the global state. * @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * - When we go NO_HZ idle during the window, we can negate our sample * contribution, causing under-accounting. * - * We avoid this by keeping two idle-delta counters and flipping them + * We avoid this by keeping two NO_HZ-delta counters and flipping them * when the window starts, thus separating old and new NO_HZ load. * * The only trick is the slight shift in index flip for read vs write. @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * r:0 0 1 1 0 0 1 1 0 * w:0 1 1 0 0 1 1 0 0 * - * This ensures we'll fold the old idle contribution in this window while + * This ensures we'll fold the old NO_HZ contribution in this window while * accumlating the new one. * - * - When we wake up from NO_HZ idle during the window, we push up our + * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which + * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. + * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_idle[2]; +static atomic_long_t calc_load_nohz[2]; static int calc_load_idx; static inline int calc_load_write_idx(void) @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) /* * If the folding window started, make sure we start writing in the - * next idle-delta. + * next NO_HZ-delta. */ if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_enter_idle(void) +void calc_load_nohz_start(void) { struct rq *this_rq = this_rq(); long delta; /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. */ delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); + atomic_long_add(delta, &calc_load_nohz[idx]); } } -void calc_load_exit_idle(void) +void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_fold_idle(void) +static long calc_load_nohz_fold(void) { int idx = calc_load_read_idx(); long delta = 0; - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); + if (atomic_long_read(&calc_load_nohz[idx])) + delta = atomic_long_xchg(&calc_load_nohz[idx], 0); return delta; } @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, /* * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. + * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into + * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold + * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. @@ -330,7 +330,7 @@ static void calc_global_nohz(void) } /* - * Flip the idle index... + * Flip the NO_HZ index... * * Make sure we first write the new time then flip the index, so that * calc_load_write_idx() will see the new time when it reads the new @@ -341,7 +341,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_fold_idle(void) { return 0; } +static inline long calc_load_nohz_fold(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. */ - delta = calc_load_fold_idle(); + delta = calc_load_nohz_fold(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + * In case we went to NO_HZ for multiple LOAD_FREQ intervals + * catch up in bulk. */ calc_global_nohz(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9c2dc64e31d8..b1b58a07e042 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -783,7 +783,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + calc_load_nohz_start(); cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -823,7 +823,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) */ timer_clear_idle(); - calc_load_exit_idle(); + calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.3-14-g43fede From a0db971e4eb69fc84eb3d7ef94f718b483550b4a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:01 +0200 Subject: nohz: Move idle balancer registration to the idle path The idle load balancing registration path assumes that we only stop the tick when the CPU is idle, ignoring the nohz full case. As a result, a nohz full CPU that is running a task may be chosen to perform idle load balancing. Lets make sure that only CPUs in dynticks idle mode can be picked as idle load balancers. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b1b58a07e042..db023e9cbb25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -782,7 +782,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); calc_load_nohz_start(); cpu_load_update_nohz_start(); @@ -923,8 +922,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_expires = expires; } - if (!was_stopped && ts->tick_stopped) + if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } } } -- cgit v1.3-14-g43fede From 387bc8b5536eeb0a92f4b4ab553539eaea2ac0ba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:02 +0200 Subject: sched/fair: Spare idle load balancing on nohz_full CPUs Although idle load balancing obviously only concerns idle CPUs, it can be a disturbance on a busy nohz_full CPU. Indeed a CPU can only get rid of an idle load balancing duty once a tick fires while it runs a task and this can take a while on a nohz_full CPU. We could fix that and escape the idle load balancing duty from the very idle exit path but that would bring unecessary overhead. Lets just not bother and leave that job to housekeeping CPUs (those outside nohz_full range). The nohz_full CPUs simply don't want any disturbance. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a24661ac3d23..694c258b8771 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8683,6 +8683,10 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!is_housekeeping_cpu(cpu)) + return; + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; -- cgit v1.3-14-g43fede From 0165308a2f994939d2e1b36624f5a8f57746bc88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:04 +0200 Subject: genirq/msi: Prevent overwriting domain name Prevent overwriting an already assigned domain name. Remove the extra check for chip->name, because if domain->name is NULL overwriting it with NULL is not a problem. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235443.510684976@linutronix.de --- kernel/irq/msi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index fe4d48ec5bc4..9e3f1857c6bd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -274,7 +274,8 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, fwnode, &msi_domain_ops, info); - if (domain && info->chip && info->chip->name) + + if (domain && !domain->name && info->chip) domain->name = info->chip->name; return domain; -- cgit v1.3-14-g43fede From d59f6617eef0f76e34f7a9993f5645c5ef467e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:05 +0200 Subject: genirq: Allow fwnode to carry name information only In order to provide proper debug interface it's required to have domain names available when the domain is added. Non fwnode based architectures like x86 have no way to do so. It's not possible to use domain ops or host data for this as domain ops might be the same for several instances, but the names have to be unique. Extend the irqchip fwnode to allow transporting the domain name. If no node is supplied, create a 'unknown-N' placeholder. Warn if an invalid node is supplied and treat it like no node. This happens e.g. with i2 devices on x86 which hand in an ACPI type node which has no interface for retrieving the name. [ Folded a fix from Marc to make DT name parsing work ] Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235443.588784933@linutronix.de --- include/linux/irqdomain.h | 31 +++++++++++++- kernel/irq/irqdomain.c | 105 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 122 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9f3616085423..9cf32a2fbe69 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -189,6 +189,9 @@ enum { /* Irq domain implements MSI remapping */ IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), + /* Irq domain name was allocated in __irq_domain_add() */ + IRQ_DOMAIN_NAME_ALLOCATED = (1 << 6), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -203,7 +206,33 @@ static inline struct device_node *irq_domain_get_of_node(struct irq_domain *d) } #ifdef CONFIG_IRQ_DOMAIN -struct fwnode_handle *irq_domain_alloc_fwnode(void *data); +struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, + const char *name, void *data); + +enum { + IRQCHIP_FWNODE_REAL, + IRQCHIP_FWNODE_NAMED, + IRQCHIP_FWNODE_NAMED_ID, +}; + +static inline +struct fwnode_handle *irq_domain_alloc_named_fwnode(const char *name) +{ + return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED, 0, name, NULL); +} + +static inline +struct fwnode_handle *irq_domain_alloc_named_id_fwnode(const char *name, int id) +{ + return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_NAMED_ID, id, name, + NULL); +} + +static inline struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +{ + return __irq_domain_alloc_fwnode(IRQCHIP_FWNODE_REAL, 0, NULL, data); +} + void irq_domain_free_fwnode(struct fwnode_handle *fwnode); struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 70b9da72018b..e1b925bea205 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -26,39 +26,61 @@ static struct irq_domain *irq_default_domain; static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { - struct fwnode_handle fwnode; - char *name; - void *data; + struct fwnode_handle fwnode; + unsigned int type; + char *name; + void *data; }; /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain - * @data: optional user-provided data + * @type: Type of irqchip_fwnode. See linux/irqdomain.h + * @name: Optional user provided domain name + * @id: Optional user provided id if name != NULL + * @data: Optional user-provided data * - * Allocate a struct device_node, and return a poiner to the embedded + * Allocate a struct irqchip_fwid, and return a poiner to the embedded * fwnode_handle (or NULL on failure). + * + * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are + * solely to transport name information to irqdomain creation code. The + * node is not stored. For other types the pointer is kept in the irq + * domain struct. */ -struct fwnode_handle *irq_domain_alloc_fwnode(void *data) +struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, + const char *name, void *data) { struct irqchip_fwid *fwid; - char *name; + char *n; fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); - name = kasprintf(GFP_KERNEL, "irqchip@%p", data); - if (!fwid || !name) { + switch (type) { + case IRQCHIP_FWNODE_NAMED: + n = kasprintf(GFP_KERNEL, "%s", name); + break; + case IRQCHIP_FWNODE_NAMED_ID: + n = kasprintf(GFP_KERNEL, "%s-%d", name, id); + break; + default: + n = kasprintf(GFP_KERNEL, "irqchip@%p", data); + break; + } + + if (!fwid || !n) { kfree(fwid); - kfree(name); + kfree(n); return NULL; } - fwid->name = name; + fwid->type = type; + fwid->name = n; fwid->data = data; fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } -EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); +EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -97,20 +119,75 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, void *host_data) { struct device_node *of_node = to_of_node(fwnode); + struct irqchip_fwid *fwid; struct irq_domain *domain; + static atomic_t unknown_domains; + domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); if (WARN_ON(!domain)) return NULL; + if (fwnode && is_fwnode_irqchip(fwnode)) { + fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + switch (fwid->type) { + case IRQCHIP_FWNODE_NAMED: + case IRQCHIP_FWNODE_NAMED_ID: + domain->name = kstrdup(fwid->name, GFP_KERNEL); + if (!domain->name) { + kfree(domain); + return NULL; + } + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + break; + default: + domain->fwnode = fwnode; + domain->name = fwid->name; + break; + } + } else if (of_node) { + char *name; + + /* + * DT paths contain '/', which debugfs is legitimately + * unhappy about. Replace them with ':', which does + * the trick and is not as offensive as '\'... + */ + name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); + if (!name) { + kfree(domain); + return NULL; + } + + strreplace(name, '/', ':'); + + domain->name = name; + domain->fwnode = fwnode; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + if (!domain->name) { + if (fwnode) { + pr_err("Invalid fwnode type (%d) for irqdomain\n", + fwnode->type); + } + domain->name = kasprintf(GFP_KERNEL, "unknown-%d", + atomic_inc_return(&unknown_domains)); + if (!domain->name) { + kfree(domain); + return NULL; + } + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + of_node_get(of_node); /* Fill structure */ INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); domain->ops = ops; domain->host_data = host_data; - domain->fwnode = fwnode; domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; @@ -152,6 +229,8 @@ void irq_domain_remove(struct irq_domain *domain) pr_debug("Removed domain %s\n", domain->name); of_node_put(irq_domain_get_of_node(domain)); + if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) + kfree(domain->name); kfree(domain); } EXPORT_SYMBOL_GPL(irq_domain_remove); -- cgit v1.3-14-g43fede From 9dc6be3d419398eae9a19cd09b7969ceff8eaf10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:16 +0200 Subject: genirq/irqdomain: Add map counter Add a map counter instead of counting radix tree entries for diagnosis. That also gives correct information for linear domains. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.459397746@linutronix.de --- include/linux/irqdomain.h | 2 ++ kernel/irq/irqdomain.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9cf32a2fbe69..17ccd54d936d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -130,6 +130,7 @@ struct irq_domain_chip_generic; * @host_data: private data pointer for use by owner. Not touched by irq_domain * core code. * @flags: host per irq_domain flags + * @mapcount: The number of mapped interrupts * * Optional elements * @of_node: Pointer to device tree nodes associated with the irq_domain. Used @@ -152,6 +153,7 @@ struct irq_domain { const struct irq_domain_ops *ops; void *host_data; unsigned int flags; + unsigned int mapcount; /* Optional data */ struct fwnode_handle *fwnode; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e1b925bea205..8d5805c655b6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -423,6 +423,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) irq_data->domain = NULL; irq_data->hwirq = 0; + domain->mapcount--; /* Clear reverse map for this hwirq */ if (hwirq < domain->revmap_size) { @@ -474,6 +475,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, domain->name = irq_data->chip->name; } + domain->mapcount++; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { @@ -1081,6 +1083,7 @@ static void irq_domain_insert_irq(int virq) struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; + domain->mapcount++; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = virq; } else { @@ -1110,6 +1113,7 @@ static void irq_domain_remove_irq(int virq) struct irq_domain *domain = data->domain; irq_hw_number_t hwirq = data->hwirq; + domain->mapcount--; if (hwirq < domain->revmap_size) { domain->linear_revmap[hwirq] = 0; } else { -- cgit v1.3-14-g43fede From 087cdfb662ae50e3826e7cd2e54b6519d07b60f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:17 +0200 Subject: genirq/debugfs: Add proper debugfs interface Debugging (hierarchical) interupt domains is tedious as there is no information about the hierarchy and no information about states of interrupts in the various domain levels. Add a debugfs directory 'irq' and subdirectories 'domains' and 'irqs'. The domains directory contains the domain files. The content is information about the domain. If the domain is part of a hierarchy then the parent domains are printed as well. # ls /sys/kernel/debug/irq/domains/ default INTEL-IR-2 INTEL-IR-MSI-2 IO-APIC-IR-2 PCI-MSI DMAR-MSI INTEL-IR-3 INTEL-IR-MSI-3 IO-APIC-IR-3 unknown-1 INTEL-IR-0 INTEL-IR-MSI-0 IO-APIC-IR-0 IO-APIC-IR-4 VECTOR INTEL-IR-1 INTEL-IR-MSI-1 IO-APIC-IR-1 PCI-HT # cat /sys/kernel/debug/irq/domains/VECTOR name: VECTOR size: 0 mapped: 216 flags: 0x00000041 # cat /sys/kernel/debug/irq/domains/IO-APIC-IR-0 name: IO-APIC-IR-0 size: 24 mapped: 19 flags: 0x00000041 parent: INTEL-IR-3 name: INTEL-IR-3 size: 65536 mapped: 167 flags: 0x00000041 parent: VECTOR name: VECTOR size: 0 mapped: 216 flags: 0x00000041 Unfortunately there is no per cpu information about the VECTOR domain (yet). The irqs directory contains detailed information about mapped interrupts. # cat /sys/kernel/debug/irq/irqs/3 handler: handle_edge_irq status: 0x00004000 istate: 0x00000000 ddepth: 1 wdepth: 0 dstate: 0x01018000 IRQD_IRQ_DISABLED IRQD_SINGLE_TARGET IRQD_MOVE_PCNTXT node: 0 affinity: 0-143 effectiv: 0 pending: domain: IO-APIC-IR-0 hwirq: 0x3 chip: IR-IO-APIC flags: 0x10 IRQCHIP_SKIP_SET_WAKE parent: domain: INTEL-IR-3 hwirq: 0x20000 chip: INTEL-IR flags: 0x0 parent: domain: VECTOR hwirq: 0x3 chip: APIC flags: 0x0 This was developed to simplify the debugging of the managed affinity changes. Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Cc: Jens Axboe Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.537566163@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 4 + include/linux/irqdomain.h | 4 + kernel/irq/Kconfig | 11 +++ kernel/irq/Makefile | 1 + kernel/irq/debugfs.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 22 +++++ kernel/irq/irqdesc.c | 1 + kernel/irq/irqdomain.c | 87 ++++++++++++++++++- kernel/irq/manage.c | 1 + 9 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 kernel/irq/debugfs.c (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index c9be57931b58..d425a3a09722 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -46,6 +46,7 @@ struct pt_regs; * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs * @dir: /proc/irq/ procfs entry + * @debugfs_file: dentry for the debugfs file * @name: flow handler name for /proc/interrupts output */ struct irq_desc { @@ -88,6 +89,9 @@ struct irq_desc { #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + struct dentry *debugfs_file; +#endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; struct kobject kobj; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 17ccd54d936d..914b0c31d233 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -139,6 +139,7 @@ struct irq_domain_chip_generic; * setting up one or more generic chips for interrupt controllers * drivers using the generic chip library which uses this pointer. * @parent: Pointer to parent irq_domain to support hierarchy irq_domains + * @debugfs_file: dentry for the domain debugfs file * * Revmap data, used internally by irq_domain * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that @@ -162,6 +163,9 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY struct irq_domain *parent; #endif +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + struct dentry *debugfs_file; +#endif /* reverse map data. The linear map gets appended to the irq_domain */ irq_hw_number_t hwirq_max; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3bbfd6a9c475..8d9498e51585 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -108,4 +108,15 @@ config SPARSE_IRQ If you don't know what to do here, say N. +config GENERIC_IRQ_DEBUGFS + bool "Expose irq internals in debugfs" + depends on DEBUG_FS + default n + ---help--- + + Exposes internal state information through debugfs. Mostly for + developers and debugging of hard to diagnose interrupt problems. + + If you don't know what to do here, say N. + endmenu diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1d3ee3169202..c61fc9c2d1f7 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o obj-$(CONFIG_SMP) += affinity.o +obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c new file mode 100644 index 000000000000..50ee2f6593e8 --- /dev/null +++ b/kernel/irq/debugfs.c @@ -0,0 +1,215 @@ +/* + * Copyright 2017 Thomas Gleixner + * + * This file is licensed under the GPL V2. + */ +#include +#include +#include + +#include "internals.h" + +static struct dentry *irq_dir; + +struct irq_bit_descr { + unsigned int mask; + char *name; +}; +#define BIT_MASK_DESCR(m) { .mask = m, .name = #m } + +static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, + const struct irq_bit_descr *sd, int size) +{ + int i; + + for (i = 0; i < size; i++, sd++) { + if (state & sd->mask) + seq_printf(m, "%*s%s\n", ind + 12, "", sd->name); + } +} + +#ifdef CONFIG_SMP +static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct cpumask *msk; + + msk = irq_data_get_affinity_mask(data); + seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); +#ifdef CONFIG_GENERIC_PENDING_IRQ + msk = desc->pending_mask; + seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk)); +#endif +} +#else +static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { } +#endif + +static const struct irq_bit_descr irqchip_flags[] = { + BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED), + BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED), + BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND), + BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED), + BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE), + BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), + BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), +}; + +static void +irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) +{ + struct irq_chip *chip = data->chip; + + if (!chip) { + seq_printf(m, "chip: None\n"); + return; + } + seq_printf(m, "%*schip: %s\n", ind, "", chip->name); + seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, + ARRAY_SIZE(irqchip_flags)); +} + +static void +irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) +{ + seq_printf(m, "%*sdomain: %s\n", ind, "", + data->domain ? data->domain->name : ""); + seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); + irq_debug_show_chip(m, data, ind + 1); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (!data->parent_data) + return; + seq_printf(m, "%*sparent:\n", ind + 1, ""); + irq_debug_show_data(m, data->parent_data, ind + 4); +#endif +} + +static const struct irq_bit_descr irqdata_states[] = { + BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING), + BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING), + BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH), + BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW), + BIT_MASK_DESCR(IRQD_LEVEL), + + BIT_MASK_DESCR(IRQD_ACTIVATED), + BIT_MASK_DESCR(IRQD_IRQ_STARTED), + BIT_MASK_DESCR(IRQD_IRQ_DISABLED), + BIT_MASK_DESCR(IRQD_IRQ_MASKED), + BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS), + + BIT_MASK_DESCR(IRQD_PER_CPU), + BIT_MASK_DESCR(IRQD_NO_BALANCING), + + BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), + BIT_MASK_DESCR(IRQD_AFFINITY_SET), + BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), + BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), + BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + + BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), + + BIT_MASK_DESCR(IRQD_WAKEUP_STATE), + BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), +}; + +static const struct irq_bit_descr irqdesc_states[] = { + BIT_MASK_DESCR(_IRQ_NOPROBE), + BIT_MASK_DESCR(_IRQ_NOREQUEST), + BIT_MASK_DESCR(_IRQ_NOTHREAD), + BIT_MASK_DESCR(_IRQ_NOAUTOEN), + BIT_MASK_DESCR(_IRQ_NESTED_THREAD), + BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), + BIT_MASK_DESCR(_IRQ_IS_POLLED), + BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), +}; + +static const struct irq_bit_descr irqdesc_istates[] = { + BIT_MASK_DESCR(IRQS_AUTODETECT), + BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED), + BIT_MASK_DESCR(IRQS_POLL_INPROGRESS), + BIT_MASK_DESCR(IRQS_ONESHOT), + BIT_MASK_DESCR(IRQS_REPLAY), + BIT_MASK_DESCR(IRQS_WAITING), + BIT_MASK_DESCR(IRQS_PENDING), + BIT_MASK_DESCR(IRQS_SUSPENDED), +}; + + +static int irq_debug_show(struct seq_file *m, void *p) +{ + struct irq_desc *desc = m->private; + struct irq_data *data; + + raw_spin_lock_irq(&desc->lock); + data = irq_desc_get_irq_data(desc); + seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); + irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, + ARRAY_SIZE(irqdesc_states)); + seq_printf(m, "istate: 0x%08x\n", desc->istate); + irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates, + ARRAY_SIZE(irqdesc_istates)); + seq_printf(m, "ddepth: %u\n", desc->depth); + seq_printf(m, "wdepth: %u\n", desc->wake_depth); + seq_printf(m, "dstate: 0x%08x\n", irqd_get(data)); + irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states, + ARRAY_SIZE(irqdata_states)); + seq_printf(m, "node: %d\n", irq_data_get_node(data)); + irq_debug_show_masks(m, desc); + irq_debug_show_data(m, data, 0); + raw_spin_unlock_irq(&desc->lock); + return 0; +} + +static int irq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_debug_show, inode->i_private); +} + +static const struct file_operations dfs_irq_ops = { + .open = irq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) +{ + char name [10]; + + if (!irq_dir || !desc || desc->debugfs_file) + return; + + sprintf(name, "%d", irq); + desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc, + &dfs_irq_ops); +} + +void irq_remove_debugfs_entry(struct irq_desc *desc) +{ + if (desc->debugfs_file) + debugfs_remove(desc->debugfs_file); +} + +static int __init irq_debugfs_init(void) +{ + struct dentry *root_dir; + int irq; + + root_dir = debugfs_create_dir("irq", NULL); + if (!root_dir) + return -ENOMEM; + + irq_domain_debugfs_init(root_dir); + + irq_dir = debugfs_create_dir("irqs", root_dir); + + irq_lock_sparse(); + for_each_active_irq(irq) + irq_add_debugfs_entry(irq, irq_to_desc(irq)); + irq_unlock_sparse(); + + return 0; +} +__initcall(irq_debugfs_init); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 921a2419720c..094db5bfb83f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -169,6 +169,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) +static inline unsigned int irqd_get(struct irq_data *d) +{ + return __irqd_to_state(d); +} + /* * Manipulation functions for irq_data.state */ @@ -237,3 +242,20 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); +void irq_remove_debugfs_entry(struct irq_desc *desc); +# ifdef CONFIG_IRQ_DOMAIN +void irq_domain_debugfs_init(struct dentry *root); +# else +static inline void irq_domain_debugfs_init(struct dentry *root); +# endif +#else /* CONFIG_GENERIC_IRQ_DEBUGFS */ +static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) +{ +} +static inline void irq_remove_debugfs_entry(struct irq_desc *d) +{ +} +#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 09abce2ea8f0..feade536b6d1 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -394,6 +394,7 @@ static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); + irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8d5805c655b6..75e1f0851c33 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -29,9 +29,17 @@ struct irqchip_fwid { struct fwnode_handle fwnode; unsigned int type; char *name; - void *data; + void *data; }; +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void debugfs_add_domain_dir(struct irq_domain *d); +static void debugfs_remove_domain_dir(struct irq_domain *d); +#else +static inline void debugfs_add_domain_dir(struct irq_domain *d) { } +static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } +#endif + /** * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for * identifying an irq domain @@ -194,6 +202,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); + debugfs_add_domain_dir(domain); list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); @@ -213,6 +222,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_add); void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); + debugfs_remove_domain_dir(domain); WARN_ON(!radix_tree_empty(&domain->revmap_tree)); @@ -1599,3 +1609,78 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) { } #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static struct dentry *domain_dir; + +static void +irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) +{ + seq_printf(m, "%*sname: %s\n", ind, "", d->name); + seq_printf(m, "%*ssize: %u\n", ind + 1, "", + d->revmap_size + d->revmap_direct_max_irq); + seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); + seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + if (!d->parent) + return; + seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name); + irq_domain_debug_show_one(m, d->parent, ind + 4); +#endif +} + +static int irq_domain_debug_show(struct seq_file *m, void *p) +{ + struct irq_domain *d = m->private; + + /* Default domain? Might be NULL */ + if (!d) { + if (!irq_default_domain) + return 0; + d = irq_default_domain; + } + irq_domain_debug_show_one(m, d, 0); + return 0; +} + +static int irq_domain_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_domain_debug_show, inode->i_private); +} + +static const struct file_operations dfs_domain_ops = { + .open = irq_domain_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void debugfs_add_domain_dir(struct irq_domain *d) +{ + if (!d->name || !domain_dir || d->debugfs_file) + return; + d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, + &dfs_domain_ops); +} + +static void debugfs_remove_domain_dir(struct irq_domain *d) +{ + if (d->debugfs_file) + debugfs_remove(d->debugfs_file); +} + +void __init irq_domain_debugfs_init(struct dentry *root) +{ + struct irq_domain *d; + + domain_dir = debugfs_create_dir("domains", root); + if (!domain_dir) + return; + + debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + mutex_lock(&irq_domain_mutex); + list_for_each_entry(d, &irq_domain_list, link) + debugfs_add_domain_dir(d); + mutex_unlock(&irq_domain_mutex); +} +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4c34696ca575..284f4eb1ffbe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1398,6 +1398,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) wake_up_process(new->secondary->thread); register_irq_proc(irq, desc); + irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); free_cpumask_var(mask); -- cgit v1.3-14-g43fede From cdd16365b0bd7c0cd19e2cc768b6bdc8021f32c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:19 +0200 Subject: genirq: Provide irq_fixup_move_pending() If an CPU goes offline, the interrupts are migrated away, but a eventually pending interrupt move, which has not yet been made effective is kept pending even if the outgoing CPU is the sole target of the pending affinity mask. What's worse is, that the pending affinity mask is discarded even if it would contain a valid subset of the online CPUs. Implement a helper function which allows to avoid these issues. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.691345468@linutronix.de --- include/linux/irq.h | 5 +++++ kernel/irq/migration.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7e62e10e5856..d008065e2f4d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -491,9 +491,14 @@ extern void irq_migrate_all_off_this_cpu(void); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); void irq_move_masked_irq(struct irq_data *data); +bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } +static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) +{ + return false; +} #endif extern int no_irq_affinity; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 37ddb7bda651..6ca054a3f91d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,6 +4,36 @@ #include "internals.h" +/** + * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU + * @desc: Interrupt descpriptor to clean up + * @force_clear: If set clear the move pending bit unconditionally. + * If not set, clear it only when the dying CPU is the + * last one in the pending mask. + * + * Returns true if the pending bit was set and the pending mask contains an + * online CPU other than the dying CPU. + */ +bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + + if (!irqd_is_setaffinity_pending(data)) + return false; + + /* + * The outgoing CPU might be the last online target in a pending + * interrupt move. If that's the case clear the pending move bit. + */ + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) { + irqd_clr_move_pending(data); + return false; + } + if (force_clear) + irqd_clr_move_pending(data); + return true; +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); -- cgit v1.3-14-g43fede From cba4235e6031e9318d68186f6d765c531cbea4e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:21 +0200 Subject: genirq: Remove mask argument from setup_affinity() No point to have this alloc/free dance of cpumasks. Provide a static mask for setup_affinity() and protect it proper. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.851571573@linutronix.de --- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 53 ++++++++++++++++++++++---------------------------- kernel/irq/proc.c | 8 +++++--- 3 files changed, 29 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 094db5bfb83f..33ca83816b8c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -109,7 +109,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); +extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 284f4eb1ffbe..e2f20d553d60 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -345,15 +345,18 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) +static int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; - int node = irq_desc_get_node(desc); + int ret, node = irq_desc_get_node(desc); + static DEFINE_RAW_SPINLOCK(mask_lock); + static struct cpumask mask; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!__irq_can_set_affinity(desc)) return 0; + raw_spin_lock(&mask_lock); /* * Preserve the managed affinity setting and a userspace affinity * setup, but make sure that one of the targets is online. @@ -367,43 +370,42 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); } - cpumask_and(mask, cpu_online_mask, set); + cpumask_and(&mask, cpu_online_mask, set); if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); /* make sure at least one of the cpus in nodemask is online */ - if (cpumask_intersects(mask, nodemask)) - cpumask_and(mask, mask, nodemask); + if (cpumask_intersects(&mask, nodemask)) + cpumask_and(&mask, &mask, nodemask); } - irq_do_set_affinity(&desc->irq_data, mask, false); - return 0; + ret = irq_do_set_affinity(&desc->irq_data, &mask, false); + raw_spin_unlock(&mask_lock); + return ret; } #else /* Wrapper for ALPHA specific affinity selector magic */ -static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) +int irq_setup_affinity(struct irq_desc *desc) { - return irq_select_affinity(irq_desc_get_irq(d)); + return irq_select_affinity(irq_desc_get_irq(desc)); } #endif /* - * Called when affinity is set via /proc/irq + * Called when a bogus affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) +int irq_select_affinity_usr(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(desc, mask); + ret = irq_setup_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } - #else -static inline int -setup_affinity(struct irq_desc *desc, struct cpumask *mask) +static inline int setup_affinity(struct irq_desc *desc) { return 0; } @@ -1128,7 +1130,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; - cpumask_var_t mask; if (!desc) return -EINVAL; @@ -1187,11 +1188,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } } - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out_thread; - } - /* * Drivers are often written to work w/o knowledge about the * underlying irq chip implementation, so a request for a @@ -1256,7 +1252,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (thread_mask == ~0UL) { ret = -EBUSY; - goto out_mask; + goto out_unlock; } /* * The thread_mask for the action is or'ed to @@ -1300,7 +1296,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", irq); ret = -EINVAL; - goto out_mask; + goto out_unlock; } if (!shared) { @@ -1308,7 +1304,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mask; + goto out_unlock; } init_waitqueue_head(&desc->wait_for_threads); @@ -1320,7 +1316,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) { irq_release_resources(desc); - goto out_mask; + goto out_unlock; } } @@ -1357,7 +1353,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* Set default affinity mask once everything is setup */ - setup_affinity(desc, mask); + irq_setup_affinity(desc); } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; @@ -1401,8 +1397,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) irq_add_debugfs_entry(irq, desc); new->dir = NULL; register_handler_proc(irq, new); - free_cpumask_var(mask); - return 0; mismatch: @@ -1415,9 +1409,8 @@ mismatch: } ret = -EBUSY; -out_mask: +out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - free_cpumask_var(mask); out_thread: if (new->thread) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c53edad7b459..d35bb8d4c317 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -120,9 +120,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, * one online CPU still has to be targeted. */ if (!cpumask_intersects(new_value, cpu_online_mask)) { - /* Special case for empty set - allow the architecture - code to set default SMP affinity. */ - err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; + /* + * Special case for empty set - allow the architecture code + * to set default SMP affinity. + */ + err = irq_select_affinity_usr(irq) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; -- cgit v1.3-14-g43fede From 43564bd97d0e6182bbd43b51b33254c728832551 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:22 +0200 Subject: genirq: Rename setup_affinity() to irq_setup_affinity() Rename it with a proper irq_ prefix and make it available for other files in the core code. Preparatory patch for moving the irq affinity setup around. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235444.928501004@linutronix.de --- kernel/irq/internals.h | 6 ++++++ kernel/irq/manage.c | 7 +------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 33ca83816b8c..2d7927d9fb57 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -116,6 +116,12 @@ extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); +#ifdef CONFIG_SMP +extern int irq_setup_affinity(struct irq_desc *desc); +#else +static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } +#endif + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e2f20d553d60..907fb791ff63 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. */ -static int irq_setup_affinity(struct irq_desc *desc) +int irq_setup_affinity(struct irq_desc *desc) { struct cpumask *set = irq_default_affinity; int ret, node = irq_desc_get_node(desc); @@ -404,11 +404,6 @@ int irq_select_affinity_usr(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -#else -static inline int setup_affinity(struct irq_desc *desc) -{ - return 0; -} #endif /** -- cgit v1.3-14-g43fede From 2e051552df69af6d134c2592d0d6f1ac80f01190 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:23 +0200 Subject: genirq: Move initial affinity setup to irq_startup() The startup vs. setaffinity ordering of interrupts depends on the IRQF_NOAUTOEN flag. Chained interrupts are not getting any affinity assignment at all. A regular interrupt is started up and then the affinity is set. A IRQF_NOAUTOEN marked interrupt is not started up, but the affinity is set nevertheless. Move the affinity setup to startup_irq() so the ordering is always the same and chained interrupts get the proper default affinity assigned as well. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.020534783@linutronix.de --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 15 ++++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bc1331f84fb5..e290d73b88e2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -213,6 +213,8 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_enable(desc); } irq_state_set_started(desc); + /* Set default affinity mask once everything is setup */ + irq_setup_affinity(desc); } if (resend) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 907fb791ff63..1e283073cecc 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1327,6 +1327,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; + /* Exclude IRQ from balancing if requested */ + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } + if (irq_settings_can_autoenable(desc)) { irq_startup(desc, true); } else { @@ -1341,15 +1347,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->depth = 1; } - /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) { - irq_settings_set_no_balancing(desc); - irqd_set(&desc->irq_data, IRQD_NO_BALANCING); - } - - /* Set default affinity mask once everything is setup */ - irq_setup_affinity(desc); - } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); -- cgit v1.3-14-g43fede From 137221df69c6f8a7002f82dc3d95052d34f5667e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2017 01:37:24 +0200 Subject: genirq: Move pending helpers to internal.h So that the affinity code can reuse them. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170619235445.109426284@linutronix.de --- kernel/irq/internals.h | 38 ++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 28 ---------------------------- 2 files changed, 38 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 2d7927d9fb57..20b197f0a7b5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -249,6 +249,44 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, void __iomem *reg_base, irq_flow_handler_t handler) { } #endif /* CONFIG_GENERIC_IRQ_CHIP */ +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return irqd_can_move_in_process_context(data); +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return irqd_is_setaffinity_pending(data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else /* CONFIG_GENERIC_PENDING_IRQ */ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return true; +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return false; +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ +} +#endif /* CONFIG_GENERIC_PENDING_IRQ */ + #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); void irq_remove_debugfs_entry(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1e283073cecc..7dcf19397c39 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -168,34 +168,6 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } -#ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_data *data) -{ - return irqd_can_move_in_process_context(data); -} -static inline bool irq_move_pending(struct irq_data *data) -{ - return irqd_is_setaffinity_pending(data); -} -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) -{ - cpumask_copy(desc->pending_mask, mask); -} -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc *desc) -{ - cpumask_copy(mask, desc->pending_mask); -} -#else -static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_data *data) { return false; } -static inline void -irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } -static inline void -irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } -#endif - int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { -- cgit v1.3-14-g43fede From 0dd945ff4647a1f29c6ae8f4f9a69c8f37c994cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:25 +0200 Subject: genirq/cpuhotplug: Remove irq disabling logic This is called from stop_machine() with interrupts disabled. No point in disabling them some more. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.198042748@linutronix.de --- kernel/irq/cpuhotplug.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..705139831590 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -59,11 +59,8 @@ static bool migrate_one_irq(struct irq_desc *desc) */ void irq_migrate_all_off_this_cpu(void) { - unsigned int irq; struct irq_desc *desc; - unsigned long flags; - - local_irq_save(flags); + unsigned int irq; for_each_active_irq(irq) { bool affinity_broken; @@ -73,10 +70,9 @@ void irq_migrate_all_off_this_cpu(void) affinity_broken = migrate_one_irq(desc); raw_spin_unlock(&desc->lock); - if (affinity_broken) - pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", + if (affinity_broken) { + pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n", irq, smp_processor_id()); + } } - - local_irq_restore(flags); } -- cgit v1.3-14-g43fede From 735c09524d3e7c92315e8e2699a1b9acb4fb415c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:26 +0200 Subject: genirq/cpuhotplug: Dont claim success on error In case the affinity of an interrupt was broken, a printk is emitted. But if the affinity cannot be set at all due to a missing irq_set_affinity() callback or due to a failing callback, the message is still printed preceeded by a warning/error. That makes no sense whatsoever. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.274852976@linutronix.de --- kernel/irq/cpuhotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 705139831590..9c5521b247d5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -37,11 +37,14 @@ static bool migrate_one_irq(struct irq_desc *desc) c = irq_data_get_irq_chip(d); if (!c->irq_set_affinity) { pr_debug("IRQ%u: unable to set affinity\n", d->irq); + ret = false; } else { int r = irq_do_set_affinity(d, affinity, false); - if (r) + if (r) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, r); + ret = false; + } } return ret; -- cgit v1.3-14-g43fede From e8a7035039306c90bcc99129ffc18e0be052bbb9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:27 +0200 Subject: genirq/cpuhotplug: Reorder check logic Move the checks for a valid irq chip and the irq_set_affinity() callback right in front of the whole migration logic. No point in doing a gazillion of other things when the interrupt cannot be migrated at all. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.354181630@linutronix.de --- kernel/irq/cpuhotplug.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 9c5521b247d5..41fe1e04d5d9 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -17,9 +17,20 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(d); const struct cpumask *affinity = d->common->affinity; - struct irq_chip *c; - bool ret = false; + bool brokeaff = false; + int err; + + /* + * IRQ chip might be already torn down, but the irq descriptor is + * still in the radix tree. Also if the chip has no affinity setter, + * nothing can be done here. + */ + if (!chip || !chip->irq_set_affinity) { + pr_debug("IRQ %u: Unable to migrate away\n", d->irq); + return false; + } /* * If this is a per-CPU interrupt, or the affinity does not @@ -31,23 +42,16 @@ static bool migrate_one_irq(struct irq_desc *desc) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; - ret = true; + brokeaff = true; } - c = irq_data_get_irq_chip(d); - if (!c->irq_set_affinity) { - pr_debug("IRQ%u: unable to set affinity\n", d->irq); - ret = false; - } else { - int r = irq_do_set_affinity(d, affinity, false); - if (r) { - pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", - d->irq, r); - ret = false; - } + err = irq_do_set_affinity(d, affinity, false); + if (err) { + pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", + d->irq, err); + return false; } - - return ret; + return brokeaff; } /** -- cgit v1.3-14-g43fede From 91f26cb4cd3c22bd656ab46c49329aacaaab5504 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:28 +0200 Subject: genirq/cpuhotplug: Do not migrated shutdown irqs Interrupts, which are shut down are tried to be migrated as well. That's pointless because the interrupt cannot fire and the next startup will move it to the proper place anyway. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.447550992@linutronix.de --- kernel/irq/cpuhotplug.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 41fe1e04d5d9..09b20e127aee 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -33,10 +33,15 @@ static bool migrate_one_irq(struct irq_desc *desc) } /* - * If this is a per-CPU interrupt, or the affinity does not - * include this CPU, then we have nothing to do. + * No move required, if: + * - Interrupt is per cpu + * - Interrupt is not started + * - Affinity mask does not include this CPU. + * + * Note: Do not check desc->action as this might be a chained + * interrupt. */ - if (irqd_is_per_cpu(d) || + if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) return false; -- cgit v1.3-14-g43fede From f0383c24b4855f6a4b5a358c7b2d2c16e0437e9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:29 +0200 Subject: genirq/cpuhotplug: Add support for cleaning up move in progress In order to move x86 to the generic hotplug migration code, add support for cleaning up move in progress bits. On architectures which have this x86 specific (mis)feature not enabled, this is optimized out by the compiler. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.525817311@linutronix.de --- arch/x86/include/asm/irq.h | 1 - include/linux/irq.h | 2 ++ kernel/irq/cpuhotplug.c | 28 ++++++++++++++++++++++++++-- kernel/irq/internals.h | 10 +++++++++- 4 files changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 16d3fa211962..668cca540025 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -29,7 +29,6 @@ struct irq_desc; #include extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); -extern void irq_force_complete_move(struct irq_desc *desc); #endif #ifdef CONFIG_HAVE_KVM diff --git a/include/linux/irq.h b/include/linux/irq.h index d008065e2f4d..299271a4953c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -491,10 +491,12 @@ extern void irq_migrate_all_off_this_cpu(void); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); void irq_move_masked_irq(struct irq_data *data); +void irq_force_complete_move(struct irq_desc *desc); bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } +static inline void irq_force_complete_move(struct irq_desc *desc) { } static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 09b20e127aee..4be4bd669d81 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,7 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *chip = irq_data_get_irq_chip(d); - const struct cpumask *affinity = d->common->affinity; + const struct cpumask *affinity; bool brokeaff = false; int err; @@ -41,9 +41,33 @@ static bool migrate_one_irq(struct irq_desc *desc) * Note: Do not check desc->action as this might be a chained * interrupt. */ + affinity = irq_data_get_affinity_mask(d); if (irqd_is_per_cpu(d) || !irqd_is_started(d) || - !cpumask_test_cpu(smp_processor_id(), affinity)) + !cpumask_test_cpu(smp_processor_id(), affinity)) { + /* + * If an irq move is pending, abort it if the dying CPU is + * the sole target. + */ + irq_fixup_move_pending(desc, false); return false; + } + + /* + * Complete an eventually pending irq move cleanup. If this + * interrupt was moved in hard irq context, then the vectors need + * to be cleaned up. It can't wait until this interrupt actually + * happens and this CPU was involved. + */ + irq_force_complete_move(desc); + + /* + * If there is a setaffinity pending, then try to reuse the pending + * mask, so the last change of the affinity does not get lost. If + * there is no move pending or the pending mask does not contain + * any online CPU, use the current affinity mask. + */ + if (irq_fixup_move_pending(desc, true)) + affinity = irq_desc_get_pending_mask(desc); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 20b197f0a7b5..fd4fa8382b8f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -268,6 +268,10 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { cpumask_copy(mask, desc->pending_mask); } +static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) +{ + return desc->pending_mask; +} #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -285,7 +289,11 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } -#endif /* CONFIG_GENERIC_PENDING_IRQ */ +static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) +{ + return NULL; +} +#endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); -- cgit v1.3-14-g43fede From 47a06d3a783217acae02976f15ca07ddc1ac024f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:30 +0200 Subject: genirq/cpuhotplug: Add support for conditional masking Interrupts which cannot be migrated in process context, need to be masked before the affinity is changed forcefully. Add support for that. Will be compiled out for architectures which do not have this x86 specific issue. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.604565591@linutronix.de --- kernel/irq/cpuhotplug.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 4be4bd669d81..6f46587a9ce5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -18,6 +18,7 @@ static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); struct irq_chip *chip = irq_data_get_irq_chip(d); + bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d); const struct cpumask *affinity; bool brokeaff = false; int err; @@ -69,6 +70,10 @@ static bool migrate_one_irq(struct irq_desc *desc) if (irq_fixup_move_pending(desc, true)) affinity = irq_desc_get_pending_mask(desc); + /* Mask the chip for interrupts which cannot move in process context */ + if (maskchip && chip->irq_mask) + chip->irq_mask(d); + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { affinity = cpu_online_mask; brokeaff = true; @@ -78,8 +83,12 @@ static bool migrate_one_irq(struct irq_desc *desc) if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); - return false; + brokeaff = false; } + + if (maskchip && chip->irq_unmask) + chip->irq_unmask(d); + return brokeaff; } -- cgit v1.3-14-g43fede From 77f85e66aa8be563ae5804eebf74a78ec6ef5555 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:31 +0200 Subject: genirq/cpuhotplug: Set force affinity flag on hotplug migration Set the force migration flag when migrating interrupts away from an outgoing CPU. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.681874648@linutronix.de --- kernel/irq/cpuhotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6f46587a9ce5..e09cb91a7c8b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -79,7 +79,7 @@ static bool migrate_one_irq(struct irq_desc *desc) brokeaff = true; } - err = irq_do_set_affinity(d, affinity, false); + err = irq_do_set_affinity(d, affinity, true); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.3-14-g43fede From 36d84fb45140f151fa4e145381dbce5e5ffed24d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:34 +0200 Subject: genirq: Move irq_fixup_move_pending() to core Now that x86 uses the generic code, the function declaration and inline stub can move to the core internal header. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235445.928156166@linutronix.de --- include/linux/irq.h | 5 ----- kernel/irq/internals.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 299271a4953c..2b7e5a70d05f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -492,15 +492,10 @@ extern void irq_migrate_all_off_this_cpu(void); void irq_move_irq(struct irq_data *data); void irq_move_masked_irq(struct irq_data *data); void irq_force_complete_move(struct irq_desc *desc); -bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else static inline void irq_move_irq(struct irq_data *data) { } static inline void irq_move_masked_irq(struct irq_data *data) { } static inline void irq_force_complete_move(struct irq_desc *desc) { } -static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) -{ - return false; -} #endif extern int no_irq_affinity; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fd4fa8382b8f..040806f1124c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -272,6 +272,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } +bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -293,6 +294,10 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return NULL; } +static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) +{ + return false; +} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS -- cgit v1.3-14-g43fede From 047dc6331de58da51818582c0db0dbfcb837e614 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:35 +0200 Subject: genirq: Remove pointless arg from show_irq_affinity The third argument of the internal helper function is unused. Remove it. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.004958600@linutronix.de --- kernel/irq/proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d35bb8d4c317..eff7c0c8f9b9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,7 +37,7 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int show_irq_affinity(int type, struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask = desc->irq_common_data.affinity; @@ -80,12 +80,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m, v); + return show_irq_affinity(0, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m, v); + return show_irq_affinity(1, m); } -- cgit v1.3-14-g43fede From 4ab764c336123157690ee0000a1dcf81851c58d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:36 +0200 Subject: genirq: Remove pointless gfp argument All callers hand in GPF_KERNEL. No point to have an extra argument for that. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.082544752@linutronix.de --- kernel/irq/irqdesc.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index feade536b6d1..48d4f0365e52 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -54,14 +54,14 @@ static void __init init_irq_default_affinity(void) #endif #ifdef CONFIG_SMP -static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, - gfp, node)) + GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -86,7 +86,7 @@ static void desc_smp_init(struct irq_desc *desc, int node, #else static inline int -alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } #endif @@ -344,9 +344,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, struct module *owner) { struct irq_desc *desc; - gfp_t gfp = GFP_KERNEL; - desc = kzalloc_node(sizeof(*desc), gfp, node); + desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; /* allocate based on nr_cpu_ids */ @@ -354,7 +353,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, if (!desc->kstat_irqs) goto err_desc; - if (alloc_masks(desc, gfp, node)) + if (alloc_masks(desc, node)) goto err_kstat; raw_spin_lock_init(&desc->lock); @@ -525,7 +524,7 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].kstat_irqs = alloc_percpu(unsigned int); - alloc_masks(&desc[i], GFP_KERNEL, node); + alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); desc_set_defaults(i, &desc[i], node, NULL, NULL); -- cgit v1.3-14-g43fede From c1a80386965e9fa3c2f8d1d57966216fe02c9124 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:37 +0200 Subject: genirq/proc: Replace ever repeating type cast The proc file setup repeats the same ugly type cast for the irq number over and over. Do it once and hand in the local void pointer. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.160866358@linutronix.de --- kernel/irq/proc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index eff7c0c8f9b9..cbc4c5e377ec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -326,6 +326,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { static DEFINE_MUTEX(register_lock); + void __maybe_unused *irqp = (void *)(unsigned long) irq; char name [MAX_NAMELEN]; if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) @@ -351,20 +352,19 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ proc_create_data("smp_affinity", 0644, desc->dir, - &irq_affinity_proc_fops, (void *)(long)irq); + &irq_affinity_proc_fops, irqp); /* create /proc/irq//affinity_hint */ proc_create_data("affinity_hint", 0444, desc->dir, - &irq_affinity_hint_proc_fops, (void *)(long)irq); + &irq_affinity_hint_proc_fops, irqp); /* create /proc/irq//smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, - &irq_affinity_list_proc_fops, (void *)(long)irq); + &irq_affinity_list_proc_fops, irqp); proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, (void *)(long)irq); + &irq_node_proc_fops, irqp); #endif - proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); -- cgit v1.3-14-g43fede From 0d3f54257dc300f2db480d6a46b34bdb87f18c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:38 +0200 Subject: genirq: Introduce effective affinity mask There is currently no way to evaluate the effective affinity mask of a given interrupt. Many irq chips allow only a single target CPU or a subset of CPUs in the affinity mask. Updating the mask at the time of setting the affinity to the subset would be counterproductive because information for cpu hotplug about assigned interrupt affinities gets lost. On CPU hotplug it's also pointless to force migrate an interrupt, which is not targeted at the CPU effectively. But currently the information is not available. Provide a seperate mask to be updated by the irq_chip->irq_set_affinity() implementations. Implement the read only proc files so the user can see the effective mask as well w/o trying to deduce it from /proc/interrupts. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.247834245@linutronix.de --- include/linux/irq.h | 29 +++++++++++++++++ kernel/irq/Kconfig | 4 +++ kernel/irq/debugfs.c | 4 +++ kernel/irq/irqdesc.c | 14 ++++++++ kernel/irq/proc.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 134 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 2b7e5a70d05f..4087ef268ba9 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,6 +137,9 @@ struct irq_domain; * @affinity: IRQ affinity on SMP. If this is an IPI * related irq, then this is the mask of the * CPUs to which an IPI can be sent. + * @effective_affinity: The effective IRQ affinity on SMP as some irq + * chips do not allow multi CPU destinations. + * A subset of @affinity. * @msi_desc: MSI descriptor * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ @@ -148,6 +151,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + cpumask_var_t effective_affinity; +#endif #ifdef CONFIG_GENERIC_IRQ_IPI unsigned int ipi_offset; #endif @@ -737,6 +743,29 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) return d->common->affinity; } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->effective_affinity; +} +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ + cpumask_copy(d->common->effective_affinity, m); +} +#else +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ +} +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->affinity; +} +#endif + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8d9498e51585..fcbb1d6d51cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Supports effective affinity mask +config GENERIC_IRQ_EFFECTIVE_AFF_MASK + bool + # Facility to allocate a hardware interrupt. This is legacy support # and should not be used in new code. Use irq domains instead. config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 50ee2f6593e8..edbef252d0c4 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -36,6 +36,10 @@ static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) msk = irq_data_get_affinity_mask(data); seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + msk = irq_data_get_effective_affinity_mask(data); + seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk)); +#endif #ifdef CONFIG_GENERIC_PENDING_IRQ msk = desc->pending_mask; seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk)); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 48d4f0365e52..35a95fadcfda 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -60,8 +60,19 @@ static int alloc_masks(struct irq_desc *desc, int node) GFP_KERNEL, node)) return -ENOMEM; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, + GFP_KERNEL, node)) { + free_cpumask_var(desc->irq_common_data.affinity); + return -ENOMEM; + } +#endif + #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -324,6 +335,9 @@ static void free_masks(struct irq_desc *desc) free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index cbc4c5e377ec..7f9642a1e267 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP +enum { + AFFINITY, + AFFINITY_LIST, + EFFECTIVE, + EFFECTIVE_LIST, +}; + static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_common_data.affinity; + const struct cpumask *mask; + switch (type) { + case AFFINITY: + case AFFINITY_LIST: + mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; + if (irqd_is_setaffinity_pending(&desc->irq_data)) + mask = desc->pending_mask; #endif - if (type) + break; + case EFFECTIVE: + case EFFECTIVE_LIST: +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = desc->irq_common_data.effective_affinity; + break; +#else + return -EINVAL; +#endif + }; + + switch (type) { + case AFFINITY_LIST: + case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); - else + break; + case AFFINITY: + case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); + break; + } return 0; } @@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m); + return show_irq_affinity(AFFINITY, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m); + return show_irq_affinity(AFFINITY_LIST, m); } @@ -185,6 +213,44 @@ static const struct file_operations irq_affinity_list_proc_fops = { .write = irq_affinity_list_proc_write, }; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static int irq_effective_aff_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE, m); +} + +static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE_LIST, m); +} + +static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); +} + +static int irq_effective_aff_list_proc_open(struct inode *inode, + struct file *file) +{ + return single_open(file, irq_effective_aff_list_proc_show, + PDE_DATA(inode)); +} + +static const struct file_operations irq_effective_aff_proc_fops = { + .open = irq_effective_aff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations irq_effective_aff_list_proc_fops = { + .open = irq_effective_aff_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); @@ -364,6 +430,12 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, irqp); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + proc_create_data("effective_affinity", 0444, desc->dir, + &irq_effective_aff_proc_fops, irqp); + proc_create_data("effective_affinity_list", 0444, desc->dir, + &irq_effective_aff_list_proc_fops, irqp); +# endif #endif proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); @@ -383,6 +455,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + remove_proc_entry("effective_affinity", desc->dir); + remove_proc_entry("effective_affinity_list", desc->dir); +# endif #endif remove_proc_entry("spurious", desc->dir); -- cgit v1.3-14-g43fede From 415fcf1a2293046e0c1f4ab8558a87bad66652b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:39 +0200 Subject: genirq/cpuhotplug: Use effective affinity mask If the architecture supports the effective affinity mask, migrating interrupts away which are not targeted by the effective mask is pointless. They can stay in the user or system supplied affinity mask, but won't be targetted at any given point as the affinity setter functions need to validate against the online cpu mask anyway. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.328488490@linutronix.de --- kernel/irq/cpuhotplug.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index e09cb91a7c8b..0b093db3336b 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -14,6 +14,14 @@ #include "internals.h" +/* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */ +static inline bool irq_needs_fixup(struct irq_data *d) +{ + const struct cpumask *m = irq_data_get_effective_affinity_mask(d); + + return cpumask_test_cpu(smp_processor_id(), m); +} + static bool migrate_one_irq(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -42,9 +50,7 @@ static bool migrate_one_irq(struct irq_desc *desc) * Note: Do not check desc->action as this might be a chained * interrupt. */ - affinity = irq_data_get_affinity_mask(d); - if (irqd_is_per_cpu(d) || !irqd_is_started(d) || - !cpumask_test_cpu(smp_processor_id(), affinity)) { + if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) { /* * If an irq move is pending, abort it if the dying CPU is * the sole target. @@ -69,6 +75,8 @@ static bool migrate_one_irq(struct irq_desc *desc) */ if (irq_fixup_move_pending(desc, true)) affinity = irq_desc_get_pending_mask(desc); + else + affinity = irq_data_get_affinity_mask(d); /* Mask the chip for interrupts which cannot move in process context */ if (maskchip && chip->irq_mask) -- cgit v1.3-14-g43fede From 54fdf6a0875ca380647ac1cc9b5b8f2dbbbfa131 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:47 +0200 Subject: genirq: Introduce IRQD_MANAGED_SHUTDOWN Affinity managed interrupts should keep their assigned affinity accross CPU hotplug. To avoid magic hackery in device drivers, the core code shall manage them transparently. This will set these interrupts into a managed shutdown state when the last CPU of the assigned affinity mask goes offline. The interrupt will be restarted when one of the CPUs in the assigned affinity mask comes back online. Introduce the necessary state flag and the accessor functions. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.954523476@linutronix.de --- include/linux/irq.h | 8 ++++++++ kernel/irq/internals.h | 10 ++++++++++ 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 4087ef268ba9..0e37276c5315 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -207,6 +207,8 @@ struct irq_data { * IRQD_FORWARDED_TO_VCPU - The interrupt is forwarded to a VCPU * IRQD_AFFINITY_MANAGED - Affinity is auto-managed by the kernel * IRQD_IRQ_STARTED - Startup state of the interrupt + * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity + * mask. Applies only to affinity managed irqs. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -225,6 +227,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), IRQD_AFFINITY_MANAGED = (1 << 21), IRQD_IRQ_STARTED = (1 << 22), + IRQD_MANAGED_SHUTDOWN = (1 << 23), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -343,6 +346,11 @@ static inline bool irqd_is_started(struct irq_data *d) return __irqd_to_state(d) & IRQD_IRQ_STARTED; } +static inline bool irqd_is_managed_shutdown(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 040806f1124c..ca4666b4cd39 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -193,6 +193,16 @@ static inline void irqd_clr_move_pending(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; } +static inline void irqd_set_managed_shutdown(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN; +} + +static inline void irqd_clr_managed_shutdown(struct irq_data *d) +{ + __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN; +} + static inline void irqd_clear(struct irq_data *d, unsigned int mask) { __irqd_to_state(d) &= ~mask; -- cgit v1.3-14-g43fede From 708d174b6c32bffc5d73793bc7a267bcafeb6558 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:48 +0200 Subject: genirq: Split out irq_startup() code Split out the inner workings of irq_startup() so it can be reused to handle managed interrupts gracefully. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.033235144@linutronix.de --- kernel/irq/chip.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e290d73b88e2..1163089aa245 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -195,6 +195,23 @@ static void irq_state_set_started(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } +static int __irq_startup(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + int ret = 0; + + irq_domain_activate_irq(d); + if (d->chip->irq_startup) { + ret = d->chip->irq_startup(d); + irq_state_clr_disabled(desc); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + irq_state_set_started(desc); + return ret; +} + int irq_startup(struct irq_desc *desc, bool resend) { int ret = 0; @@ -204,19 +221,9 @@ int irq_startup(struct irq_desc *desc, bool resend) if (irqd_is_started(&desc->irq_data)) { irq_enable(desc); } else { - irq_domain_activate_irq(&desc->irq_data); - if (desc->irq_data.chip->irq_startup) { - ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - irq_state_clr_disabled(desc); - irq_state_clr_masked(desc); - } else { - irq_enable(desc); - } - irq_state_set_started(desc); - /* Set default affinity mask once everything is setup */ + ret = __irq_startup(desc); irq_setup_affinity(desc); } - if (resend) check_irq_resend(desc); -- cgit v1.3-14-g43fede From 4cde9c6b826834b861a2b58653ab33150f562064 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:49 +0200 Subject: genirq: Add force argument to irq_startup() In order to handle managed interrupts gracefully on irq_startup() so they won't lose their assigned affinity, it's necessary to allow startups which keep the interrupts in managed shutdown state, if none of the assigend CPUs is online. This allows drivers to request interrupts w/o the CPUs being online, which avoid online/offline churn in drivers. Add a force argument which can override that decision and let only request_irq() and enable_irq() allow the managed shutdown handling. enable_irq() is required, because the interrupt might be requested with IRQF_NOAUTOEN and enable_irq() invokes irq_startup() which would then wreckage the assignment again. All other callers force startup and potentially break the assigned affinity. No functional change as this only adds the function argument. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.112094565@linutronix.de --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 9 ++++++++- kernel/irq/manage.c | 4 ++-- 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 0119b9d467ae..d30a0dd5cc02 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc, false); + irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, false)) + if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1163089aa245..b7599e952d3b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -212,7 +212,7 @@ static int __irq_startup(struct irq_desc *desc) return ret; } -int irq_startup(struct irq_desc *desc, bool resend) +int irq_startup(struct irq_desc *desc, bool resend, bool force) { int ret = 0; @@ -892,7 +892,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); } } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca4666b4cd39..5fd105e252c3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -66,7 +66,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); extern void __disable_irq(struct irq_desc *desc); extern void __enable_irq(struct irq_desc *desc); -extern int irq_startup(struct irq_desc *desc, bool resend); +#define IRQ_RESEND true +#define IRQ_NORESEND false + +#define IRQ_START_FORCE true +#define IRQ_START_COND false + +extern int irq_startup(struct irq_desc *desc, bool resend, bool force); + extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7dcf19397c39..3577c091ac7b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -509,7 +509,7 @@ void __enable_irq(struct irq_desc *desc) * time. If it was already started up, then irq_startup() * will invoke irq_enable() under the hood. */ - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); break; } default: @@ -1306,7 +1306,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (irq_settings_can_autoenable(desc)) { - irq_startup(desc, true); + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); } else { /* * Shared interrupts do not go well with disabling -- cgit v1.3-14-g43fede From 761ea388e8c4e3ac883a94e16bcc8c51fa419d4f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:50 +0200 Subject: genirq: Handle managed irqs gracefully in irq_startup() Affinity managed interrupts should keep their assigned affinity accross CPU hotplug. To avoid magic hackery in device drivers, the core code shall manage them transparently and set these interrupts into a managed shutdown state when the last CPU of the assigned affinity mask goes offline. The interrupt will be restarted when one of the CPUs in the assigned affinity mask comes back online. Add the necessary logic to irq_startup(). If an interrupt is requested and started up, the code checks whether it is affinity managed and if so, it checks whether a CPU in the interrupts affinity mask is online. If not, it puts the interrupt into managed shutdown state. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.189851170@linutronix.de --- include/linux/irq.h | 2 +- kernel/irq/chip.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 0e37276c5315..807042b46af1 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -346,7 +346,7 @@ static inline bool irqd_is_started(struct irq_data *d) return __irqd_to_state(d) & IRQD_IRQ_STARTED; } -static inline bool irqd_is_managed_shutdown(struct irq_data *d) +static inline bool irqd_is_managed_and_shutdown(struct irq_data *d) { return __irqd_to_state(d) & IRQD_MANAGED_SHUTDOWN; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7599e952d3b..fc89eeb8a6b4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -195,6 +195,52 @@ static void irq_state_set_started(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } +enum { + IRQ_STARTUP_NORMAL, + IRQ_STARTUP_MANAGED, + IRQ_STARTUP_ABORT, +}; + +#ifdef CONFIG_SMP +static int +__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + + if (!irqd_affinity_is_managed(d)) + return IRQ_STARTUP_NORMAL; + + irqd_clr_managed_shutdown(d); + + if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { + /* + * Catch code which fiddles with enable_irq() on a managed + * and potentially shutdown IRQ. Chained interrupt + * installment or irq auto probing should not happen on + * managed irqs either. Emit a warning, break the affinity + * and start it up as a normal interrupt. + */ + if (WARN_ON_ONCE(force)) + return IRQ_STARTUP_NORMAL; + /* + * The interrupt was requested, but there is no online CPU + * in it's affinity mask. Put it into managed shutdown + * state and let the cpu hotplug mechanism start it up once + * a CPU in the mask becomes available. + */ + irqd_set_managed_shutdown(d); + return IRQ_STARTUP_ABORT; + } + return IRQ_STARTUP_MANAGED; +} +#else +static int +__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) +{ + return IRQ_STARTUP_NORMAL; +} +#endif + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -214,15 +260,27 @@ static int __irq_startup(struct irq_desc *desc) int irq_startup(struct irq_desc *desc, bool resend, bool force) { + struct irq_data *d = irq_desc_get_irq_data(desc); + struct cpumask *aff = irq_data_get_affinity_mask(d); int ret = 0; desc->depth = 0; - if (irqd_is_started(&desc->irq_data)) { + if (irqd_is_started(d)) { irq_enable(desc); } else { - ret = __irq_startup(desc); - irq_setup_affinity(desc); + switch (__irq_startup_managed(desc, aff, force)) { + case IRQ_STARTUP_NORMAL: + ret = __irq_startup(desc); + irq_setup_affinity(desc); + break; + case IRQ_STARTUP_MANAGED: + ret = __irq_startup(desc); + irq_set_affinity_locked(d, aff, false); + break; + case IRQ_STARTUP_ABORT: + return 0; + } } if (resend) check_irq_resend(desc); -- cgit v1.3-14-g43fede From c5cb83bb337c25caae995d992d1cdf9b317f83de Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:51 +0200 Subject: genirq/cpuhotplug: Handle managed IRQs on CPU hotplug If a CPU goes offline, interrupts affine to the CPU are moved away. If the outgoing CPU is the last CPU in the affinity mask the migration code breaks the affinity and sets it it all online cpus. This is a problem for affinity managed interrupts as CPU hotplug is often used for power management purposes. If the affinity is broken, the interrupt is not longer affine to the CPUs to which it was allocated. The affinity spreading allows to lay out multi queue devices in a way that they are assigned to a single CPU or a group of CPUs. If the last CPU goes offline, then the queue is not longer used, so the interrupt can be shutdown gracefully and parked until one of the assigned CPUs comes online again. Add a graceful shutdown mechanism into the irq affinity breaking code path, mark the irq as MANAGED_SHUTDOWN and leave the affinity mask unmodified. In the online path, scan the active interrupts for managed interrupts and if the interrupt is functional and the newly online CPU is part of the affinity mask, restart the interrupt if it is marked MANAGED_SHUTDOWN or if the interrupts is started up, try to add the CPU back to the effective affinity mask. Originally-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170619235447.273417334@linutronix.de --- include/linux/cpuhotplug.h | 1 + include/linux/irq.h | 5 +++++ kernel/cpu.c | 5 +++++ kernel/irq/cpuhotplug.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0f2a80377520..c15f22c54535 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -124,6 +124,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, + CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, diff --git a/include/linux/irq.h b/include/linux/irq.h index 807042b46af1..19cea6326599 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -500,7 +500,12 @@ extern int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask, bool force); extern int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info); +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_IRQ_MIGRATION) extern void irq_migrate_all_off_this_cpu(void); +extern int irq_affinity_online_cpu(unsigned int cpu); +#else +# define irq_affinity_online_cpu NULL +#endif #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); diff --git a/kernel/cpu.c b/kernel/cpu.c index cb5103413bd8..b86b32ebb3b2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1252,6 +1252,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = smpboot_unpark_threads, .teardown.single = NULL, }, + [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { + .name = "irq/affinity:online", + .startup.single = irq_affinity_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_PERF_ONLINE] = { .name = "perf:online", .startup.single = perf_event_init_cpu, diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 0b093db3336b..b7964e72ded7 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -83,6 +83,15 @@ static bool migrate_one_irq(struct irq_desc *desc) chip->irq_mask(d); if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + /* + * If the interrupt is managed, then shut it down and leave + * the affinity untouched. + */ + if (irqd_affinity_is_managed(d)) { + irqd_set_managed_shutdown(d); + irq_shutdown(desc); + return false; + } affinity = cpu_online_mask; brokeaff = true; } @@ -129,3 +138,39 @@ void irq_migrate_all_off_this_cpu(void) } } } + +static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + const struct cpumask *affinity = irq_data_get_affinity_mask(data); + + if (!irqd_affinity_is_managed(data) || !desc->action || + !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) + return; + + if (irqd_is_managed_and_shutdown(data)) + irq_startup(desc, IRQ_RESEND, IRQ_START_COND); + else + irq_set_affinity_locked(data, affinity, false); +} + +/** + * irq_affinity_online_cpu - Restore affinity for managed interrupts + * @cpu: Upcoming CPU for which interrupts should be restored + */ +int irq_affinity_online_cpu(unsigned int cpu) +{ + struct irq_desc *desc; + unsigned int irq; + + irq_lock_sparse(); + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + raw_spin_lock_irq(&desc->lock); + irq_restore_affinity_of_irq(desc, cpu); + raw_spin_unlock_irq(&desc->lock); + } + irq_unlock_sparse(); + + return 0; +} -- cgit v1.3-14-g43fede From d52dd44175bd27ad9d8e34a994fb80877c1f6d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:52 +0200 Subject: genirq: Introduce IRQD_SINGLE_TARGET flag Many interrupt chips allow only a single CPU as interrupt target. The core code has no knowledge about that. That's unfortunate as it could avoid trying to readd a newly online CPU to the effective affinity mask. Add the status flag and the necessary accessors. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.352343969@linutronix.de --- include/linux/irq.h | 16 ++++++++++++++++ kernel/irq/debugfs.c | 1 + 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 19cea6326599..00db35b61e9e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -209,6 +209,7 @@ struct irq_data { * IRQD_IRQ_STARTED - Startup state of the interrupt * IRQD_MANAGED_SHUTDOWN - Interrupt was shutdown due to empty affinity * mask. Applies only to affinity managed irqs. + * IRQD_SINGLE_TARGET - IRQ allows only a single affinity target */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -228,6 +229,7 @@ enum { IRQD_AFFINITY_MANAGED = (1 << 21), IRQD_IRQ_STARTED = (1 << 22), IRQD_MANAGED_SHUTDOWN = (1 << 23), + IRQD_SINGLE_TARGET = (1 << 24), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -276,6 +278,20 @@ static inline bool irqd_is_level_type(struct irq_data *d) return __irqd_to_state(d) & IRQD_LEVEL; } +/* + * Must only be called of irqchip.irq_set_affinity() or low level + * hieararchy domain allocation functions. + */ +static inline void irqd_set_single_target(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_SINGLE_TARGET; +} + +static inline bool irqd_is_single_target(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_SINGLE_TARGET; +} + static inline bool irqd_is_wakeup_set(struct irq_data *d) { return __irqd_to_state(d) & IRQD_WAKEUP_STATE; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index edbef252d0c4..dbd6e78db213 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -105,6 +105,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_PER_CPU), BIT_MASK_DESCR(IRQD_NO_BALANCING), + BIT_MASK_DESCR(IRQD_SINGLE_TARGET), BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), -- cgit v1.3-14-g43fede From 8f31a9845db348f5781df47ce04c79e4cfe90016 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:53 +0200 Subject: genirq/cpuhotplug: Avoid irq affinity setting for single targets Avoid trying to add a newly online CPU to the effective affinity mask of an started up interrupt. That interrupt will either stay on the already online CPU or move around for no value. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235447.431321047@linutronix.de --- kernel/irq/cpuhotplug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index b7964e72ded7..aee8f7ec40af 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -148,9 +148,17 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - if (irqd_is_managed_and_shutdown(data)) + if (irqd_is_managed_and_shutdown(data)) { irq_startup(desc, IRQ_RESEND, IRQ_START_COND); - else + return; + } + + /* + * If the interrupt can only be directed to a single target + * CPU then it is already assigned to a CPU in the affinity + * mask. No point in trying to move it around. + */ + if (!irqd_is_single_target(data)) irq_set_affinity_locked(data, affinity, false); } -- cgit v1.3-14-g43fede From 9a0ef98e186d86fb3c1ff3ec267a76f067005f74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2017 01:37:55 +0200 Subject: genirq/affinity: Assign vectors to all present CPUs Currently the irq vector spread algorithm is restricted to online CPUs, which ties the IRQ mapping to the currently online devices and doesn't deal nicely with the fact that CPUs could come and go rapidly due to e.g. power management. Instead assign vectors to all present CPUs to avoid this churn. Build a map of all possible CPUs for a given node, as the architectures only provide a map of all onlines CPUs. Do this dynamically on each call for the vector assingments, which is a bit suboptimal and could be optimized in the future by provinding a mapping from the arch code. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: Sagi Grimberg Cc: Marc Zyngier Cc: Michael Ellerman Cc: linux-nvme@lists.infradead.org Cc: Keith Busch Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170603140403.27379-5-hch@lst.de --- kernel/irq/affinity.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..d2747f9c5707 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -1,4 +1,7 @@ - +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ #include #include #include @@ -35,13 +38,54 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) +static cpumask_var_t *alloc_node_to_present_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node = 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >= 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_present_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node = 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_present_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_present_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for_each_online_node(n) { - if (cpumask_intersects(mask, cpumask_of_node(n))) { + for_each_node(n) { + if (cpumask_intersects(mask, node_to_present_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -64,7 +108,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk; + cpumask_var_t nmsk, *node_to_present_cpumask; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; @@ -73,13 +117,19 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; + node_to_present_cpumask = alloc_node_to_present_cpumask(); + if (!node_to_present_cpumask) + goto out; + /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); /* Stabilize the cpumasks */ get_online_cpus(); - nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); + build_node_to_present_cpumask(node_to_present_cpumask); + nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + &nodemsk); /* * If the number of nodes in the mask is greater than or equal the @@ -87,7 +137,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) */ if (affv <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_copy(masks + curvec, cpumask_of_node(n)); + cpumask_copy(masks + curvec, + node_to_present_cpumask[n]); if (++curvec == last_affv) break; } @@ -101,7 +152,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); + cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -133,6 +184,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); + free_node_to_present_cpumask(node_to_present_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -147,12 +199,10 @@ int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { int resv = affd->pre_vectors + affd->post_vectors; int vecs = maxvec - resv; - int cpus; + int ret; - /* Stabilize the cpumasks */ get_online_cpus(); - cpus = cpumask_weight(cpu_online_mask); + ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; put_online_cpus(); - - return min(cpus, vecs) + resv; + return ret; } -- cgit v1.3-14-g43fede From 61d0a000b7746665c7cfcff766532f6f2a922a61 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 22 Jun 2017 11:34:57 +0100 Subject: genirq/irqdomain: Add irq_domain_update_bus_token helper We can have irq domains that are identified by the same fwnode (because they are serviced by the same HW), and yet have different functionnality (because they serve different busses, for example). This is what we use the bus_token field. Since we don't use this field when generating the domain name, all the aliasing domains will get the same name, and the debugfs file creation fails. Also, bus_token is updated by individual drivers, and the core code is unaware of that update. In order to sort this mess, let's introduce a helper that takes care of updating bus_token, and regenerate the debugfs file. A separate patch will update all the individual users. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 3 +++ kernel/irq/irqdomain.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 914b0c31d233..222f47af12f4 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -273,6 +273,9 @@ static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode) return fwnode && fwnode->type == FWNODE_IRQCHIP; } +extern void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token); + static inline struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 75e1f0851c33..f6adeaeb4c16 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -245,6 +245,37 @@ void irq_domain_remove(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_domain_remove); +void irq_domain_update_bus_token(struct irq_domain *domain, + enum irq_domain_bus_token bus_token) +{ + char *name; + + if (domain->bus_token == bus_token) + return; + + mutex_lock(&irq_domain_mutex); + + domain->bus_token = bus_token; + + name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); + if (!name) { + mutex_unlock(&irq_domain_mutex); + return; + } + + debugfs_remove_domain_dir(domain); + + if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) + kfree(domain->name); + else + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + + domain->name = name; + debugfs_add_domain_dir(domain); + + mutex_unlock(&irq_domain_mutex); +} + /** * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs * @of_node: pointer to interrupt controller's device tree node. -- cgit v1.3-14-g43fede From 6a6544e520abecd484ab8b67fb50d1fc003f3275 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 20 Jun 2017 22:17:44 +0100 Subject: genirq/irqdomain: Remove auto-recursive hierarchy support It did seem like a good idea at the time, but it never really caught on, and auto-recursive domains remain unused 3 years after having been introduced. Oh well, time for a late spring cleanup. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 9 +++----- kernel/irq/irqdomain.c | 55 +++++++++++------------------------------------ kernel/irq/msi.c | 2 +- 3 files changed, 17 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 222f47af12f4..cac77a5c5555 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -180,8 +180,8 @@ enum { /* Irq domain is hierarchical */ IRQ_DOMAIN_FLAG_HIERARCHY = (1 << 0), - /* Core calls alloc/free recursive through the domain hierarchy. */ - IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), + /* Irq domain name was allocated in __irq_domain_add() */ + IRQ_DOMAIN_NAME_ALLOCATED = (1 << 6), /* Irq domain is an IPI domain with virq per cpu */ IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), @@ -195,9 +195,6 @@ enum { /* Irq domain implements MSI remapping */ IRQ_DOMAIN_FLAG_MSI_REMAP = (1 << 5), - /* Irq domain name was allocated in __irq_domain_add() */ - IRQ_DOMAIN_NAME_ALLOCATED = (1 << 6), - /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -448,7 +445,7 @@ static inline int irq_domain_alloc_irqs(struct irq_domain *domain, NULL); } -extern int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, +extern int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg); extern int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f6adeaeb4c16..14fe862aa2e3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1342,43 +1342,18 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static bool irq_domain_is_auto_recursive(struct irq_domain *domain) -{ - return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; -} - -static void irq_domain_free_irqs_recursive(struct irq_domain *domain, +static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { domain->ops->free(domain, irq_base, nr_irqs); - if (irq_domain_is_auto_recursive(domain)) { - BUG_ON(!domain->parent); - irq_domain_free_irqs_recursive(domain->parent, irq_base, - nr_irqs); - } } -int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, +int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { - int ret = 0; - struct irq_domain *parent = domain->parent; - bool recursive = irq_domain_is_auto_recursive(domain); - - BUG_ON(recursive && !parent); - if (recursive) - ret = irq_domain_alloc_irqs_recursive(parent, irq_base, - nr_irqs, arg); - if (ret < 0) - return ret; - - ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); - if (ret < 0 && recursive) - irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); - - return ret; + return domain->ops->alloc(domain, irq_base, nr_irqs, arg); } /** @@ -1439,7 +1414,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, } mutex_lock(&irq_domain_mutex); - ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); if (ret < 0) { mutex_unlock(&irq_domain_mutex); goto out_free_irq_data; @@ -1474,7 +1449,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) mutex_lock(&irq_domain_mutex); for (i = 0; i < nr_irqs; i++) irq_domain_remove_irq(virq + i); - irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); + irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); mutex_unlock(&irq_domain_mutex); irq_domain_free_irq_data(virq, nr_irqs); @@ -1494,15 +1469,11 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs, void *arg) { - /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ - if (irq_domain_is_auto_recursive(domain)) - return 0; + if (!domain->parent) + return -ENOSYS; - domain = domain->parent; - if (domain) - return irq_domain_alloc_irqs_recursive(domain, irq_base, - nr_irqs, arg); - return -ENOSYS; + return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, + nr_irqs, arg); } EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); @@ -1517,10 +1488,10 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { - /* irq_domain_free_irqs_recursive() will call parent's free */ - if (!irq_domain_is_auto_recursive(domain) && domain->parent) - irq_domain_free_irqs_recursive(domain->parent, irq_base, - nr_irqs); + if (!domain->parent) + return; + + irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); } EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9e3f1857c6bd..48eadf416c24 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -315,7 +315,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(arg, desc); /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (ret) break; -- cgit v1.3-14-g43fede From feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 22 Jun 2017 17:04:55 -0400 Subject: tracing: Show address when function names are not found Currently, when a function is not found in kallsyms, instead of simply showing the function address, it shows nothing at all: # echo ':mod:kvm_intel' > /sys/kernel/tracing/set_ftrace_filter # echo function > /sys/kernel/tracing/set_ftrace_filter # qemu -enable-kvm /home/my-qemu-image # rmmod kvm_intel # cat /sys/kernel/tracing/trace qemu-system-x86-2408 [001] d..2 135.013238: <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: <-__do_cpuid_ent When it should show: qemu-system-x86-2408 [001] d..2 135.013238: 0xffffffffa02a39f0 <-kvm_arch_hardware_enable qemu-system-x86-2408 [001] .... 135.014574: 0xffffffffa02a2ba0 <-kvm_arch_vm_ioctl qemu-system-x86-2408 [001] .... 135.015420: 0xffffffffa029e4e0 <-kvm_vm_ioctl_check_extension qemu-system-x86-2408 [001] .... 135.045411: 0xffffffffa02a1380 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e160 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e180 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045412: 0xffffffffa029e520 <-__do_cpuid_ent qemu-system-x86-2408 [001] ...1 135.045413: 0xffffffffa02a13b0 <-__do_cpuid_ent qemu-system-x86-2408 [001] .... 135.045413: 0xffffffffa02a1380 <-__do_cpuid_ent instead. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 08f9bab8089e..01ff99969ca7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name) static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; kallsyms_lookup(address, NULL, NULL, NULL, str); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { -#ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_KALLSYMS const char *name; sprint_symbol(str, address); name = kretprobed(str); - trace_seq_printf(s, fmt, name); + if (name && strlen(name)) { + trace_seq_printf(s, fmt, name); + return; + } #endif + snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); + trace_seq_printf(s, fmt, str); } #ifndef CONFIG_64BIT -- cgit v1.3-14-g43fede From e1d4eeec5aaa28d25f249c0195b0e1d9b9feb7bd Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 14 Jun 2017 13:19:23 -0400 Subject: sched/cpuset: Only offer CONFIG_CPUSETS if SMP is enabled Make CONFIG_CPUSETS=y depend on SMP as this feature makes no sense on UP. This allows for configuring out cpuset_cpumask_can_shrink() and task_can_attach() entirely, which shrinks the kernel a bit. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170614171926.8345-2-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- init/Kconfig | 1 + kernel/sched/core.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 1d3475fc9496..c359038ebeed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1156,6 +1156,7 @@ config CGROUP_HUGETLB config CPUSETS bool "Cpuset controller" + depends on SMP help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 62166da1c359..7faf4b322b63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5451,6 +5451,8 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +#ifdef CONFIG_SMP + int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -5494,7 +5496,6 @@ int task_can_attach(struct task_struct *p, goto out; } -#ifdef CONFIG_SMP if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, cs_cpus_allowed)) { unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, @@ -5524,13 +5525,11 @@ int task_can_attach(struct task_struct *p, rcu_read_unlock_sched(); } -#endif + out: return ret; } -#ifdef CONFIG_SMP - bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING -- cgit v1.3-14-g43fede From 06a76fe08d4daaeea01ca0f175ad29f40c781ece Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 21 Jun 2017 14:22:01 -0400 Subject: sched/deadline: Move DL related code from sched/core.c to sched/deadline.c This helps making sched/core.c smaller and hopefully easier to understand and maintain. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170621182203.30626-2-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 343 +---------------------------------------------- kernel/sched/deadline.c | 344 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 17 ++- 3 files changed, 364 insertions(+), 340 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7faf4b322b63..54e1b0700af3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2139,25 +2139,6 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } -/* - * This function clears the sched_dl_entity static params. - */ -void __dl_clear_params(struct task_struct *p) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = 0; - dl_se->dl_deadline = 0; - dl_se->dl_period = 0; - dl_se->flags = 0; - dl_se->dl_bw = 0; - dl_se->dl_density = 0; - - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - dl_se->dl_non_contending = 0; -} - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -2438,101 +2419,6 @@ unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << BW_SHIFT, period); } -#ifdef CONFIG_SMP -inline struct dl_bw *dl_bw_of(int i) -{ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - return &cpu_rq(i)->rd->dl_bw; -} - -inline int dl_bw_cpus(int i) -{ - struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; - - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; -} -#else -inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -inline int dl_bw_cpus(int i) -{ - return 1; -} -#endif - -/* - * We must be sure that accepting a new task (or allowing changing the - * parameters of an existing one) is consistent with the bandwidth - * constraints. If yes, this function also accordingly updates the currently - * allocated bandwidth to reflect the new situation. - * - * This function is called while holding p's rq->lock. - */ -static int dl_overflow(struct task_struct *p, int policy, - const struct sched_attr *attr) -{ - - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period ?: attr->sched_deadline; - u64 runtime = attr->sched_runtime; - u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; - - /* !deadline task may carry old deadline bandwidth */ - if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) - return 0; - - /* - * Either if a task, enters, leave, or stays -deadline but changes - * its parameters, we may need to update accordingly the total - * allocated bandwidth of the container. - */ - raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); - if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { - if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); - __dl_add(dl_b, new_bw, cpus); - err = 0; - } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { - /* - * XXX this is slightly incorrect: when the task - * utilization decreases, we should delay the total - * utilization change until the task's 0-lag point. - * But this would require to set the task's "inactive - * timer" when the task is not inactive. - */ - __dl_clear(dl_b, p->dl.dl_bw, cpus); - __dl_add(dl_b, new_bw, cpus); - dl_change_utilization(p, new_bw); - err = 0; - } else if (!dl_policy(policy) && task_has_dl_policy(p)) { - /* - * Do not decrease the total deadline utilization here, - * switched_from_dl() will take care to do it at the correct - * (0-lag) time. - */ - err = 0; - } - raw_spin_unlock(&dl_b->lock); - - return err; -} - -extern void init_dl_bw(struct dl_bw *dl_b); - /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -4014,27 +3900,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* - * This function initializes the sched_dl_entity of a newly becoming - * SCHED_DEADLINE task. - * - * Only the static values are considered here, the actual runtime and the - * absolute deadline will be properly calculated when the task is enqueued - * for the first time with its new policy. - */ -static void -__setparam_dl(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - dl_se->dl_runtime = attr->sched_runtime; - dl_se->dl_deadline = attr->sched_deadline; - dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; - dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); -} - /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -4088,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &fair_sched_class; } -static void -__getparam_dl(struct task_struct *p, struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; - attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; -} - -/* - * This function validates the new parameters of a -deadline task. - * We ask for the deadline not being zero, and greater or equal - * than the runtime, as well as the period of being zero or - * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution of 1us (we - * check sched_runtime only since it is always the smaller one) and - * below 2^63 ns (we have to check both sched_deadline and - * sched_period, as the latter can be zero). - */ -static bool -__checkparam_dl(const struct sched_attr *attr) -{ - /* deadline != 0 */ - if (attr->sched_deadline == 0) - return false; - - /* - * Since we truncate DL_SCALE bits, make sure we're at least - * that big. - */ - if (attr->sched_runtime < (1ULL << DL_SCALE)) - return false; - - /* - * Since we use the MSB for wrap-around and sign issues, make - * sure it's not set (mind that period can be equal to zero). - */ - if (attr->sched_deadline & (1ULL << 63) || - attr->sched_period & (1ULL << 63)) - return false; - - /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || - attr->sched_deadline < attr->sched_runtime) - return false; - - return true; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -4157,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p) return match; } -static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -{ - struct sched_dl_entity *dl_se = &p->dl; - - if (dl_se->dl_runtime != attr->sched_runtime || - dl_se->dl_deadline != attr->sched_deadline || - dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) - return true; - - return false; -} - static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) @@ -4350,7 +4149,7 @@ change: * of a SCHED_DEADLINE task) we need to check if enough bandwidth * is available. */ - if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { task_rq_unlock(rq, p, &rf); return -EBUSY; } @@ -5456,23 +5255,12 @@ void init_idle(struct task_struct *idle, int cpu) int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; - struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; if (!cpumask_weight(cur)) return ret; - rcu_read_lock_sched(); - cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - - raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) - ret = 0; - raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); - rcu_read_unlock_sched(); + ret = dl_cpuset_cpumask_can_shrink(cur, trial); return ret; } @@ -5497,34 +5285,8 @@ int task_can_attach(struct task_struct *p, } if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, - cs_cpus_allowed)) { - unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, - cs_cpus_allowed); - struct dl_bw *dl_b; - bool overflow; - int cpus; - unsigned long flags; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) - ret = -EBUSY; - else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - } + cs_cpus_allowed)) + ret = dl_task_can_attach(p, cs_cpus_allowed); out: return ret; @@ -5792,23 +5554,8 @@ static void cpuset_cpu_active(void) static int cpuset_cpu_inactive(unsigned int cpu) { - unsigned long flags; - struct dl_bw *dl_b; - bool overflow; - int cpus; - if (!cpuhp_tasks_frozen) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) + if (dl_cpu_busy(cpu)) return -EBUSY; cpuset_update_active_cpus(); } else { @@ -6711,84 +6458,6 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_validate(void) -{ - u64 runtime = global_rt_runtime(); - u64 period = global_rt_period(); - u64 new_bw = to_ratio(period, runtime); - struct dl_bw *dl_b; - int cpu, ret = 0; - unsigned long flags; - - /* - * Here we want to check the bandwidth not being set to some - * value smaller than the currently allocated bandwidth in - * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) - ret = -EBUSY; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (ret) - break; - } - - return ret; -} - -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) -{ - if (global_rt_runtime() == RUNTIME_INF) { - dl_rq->bw_ratio = 1 << RATIO_SHIFT; - dl_rq->extra_bw = 1 << BW_SHIFT; - } else { - dl_rq->bw_ratio = to_ratio(global_rt_runtime(), - global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); - dl_rq->extra_bw = to_ratio(global_rt_period(), - global_rt_runtime()); - } -} - -static void sched_dl_do_global(void) -{ - u64 new_bw = -1; - struct dl_bw *dl_b; - int cpu; - unsigned long flags; - - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - - if (global_rt_runtime() != RUNTIME_INF) - new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - - /* - * FIXME: As above... - */ - for_each_possible_cpu(cpu) { - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - dl_b->bw = new_bw; - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); - } -} - static int sched_rt_global_validate(void) { if (sysctl_sched_rt_period <= 0) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e12f85975857..a84299f44b5d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ #include "sched.h" #include +#include struct dl_bandwidth def_dl_bandwidth; @@ -43,6 +44,38 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_SMP +static inline struct dl_bw *dl_bw_of(int i) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +static inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + static inline void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { @@ -2318,6 +2351,317 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; +int sched_dl_global_validate(void) +{ + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; + int cpu, ret = 0; + unsigned long flags; + + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (ret) + break; + } + + return ret; +} + +void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +{ + if (global_rt_runtime() == RUNTIME_INF) { + dl_rq->bw_ratio = 1 << RATIO_SHIFT; + dl_rq->extra_bw = 1 << BW_SHIFT; + } else { + dl_rq->bw_ratio = to_ratio(global_rt_runtime(), + global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); + dl_rq->extra_bw = to_ratio(global_rt_period(), + global_rt_runtime()); + } +} + +void sched_dl_do_global(void) +{ + u64 new_bw = -1; + struct dl_bw *dl_b; + int cpu; + unsigned long flags; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + dl_b->bw = new_bw; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + } +} + +/* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + /* !deadline task may carry old deadline bandwidth */ + if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + if (hrtimer_active(&p->dl.inactive_timer)) + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + /* + * XXX this is slightly incorrect: when the task + * utilization decreases, we should delay the total + * utilization change until the task's 0-lag point. + * But this would require to set the task's "inactive + * timer" when the task is not inactive. + */ + __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + dl_change_utilization(p, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + /* + * Do not decrease the total deadline utilization here, + * switched_from_dl() will take care to do it at the correct + * (0-lag) time. + */ + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ +void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); +} + +void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). + */ +bool __checkparam_dl(const struct sched_attr *attr) +{ + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; +} + +/* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + dl_se->dl_density = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; + dl_se->dl_non_contending = 0; +} + +bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + +#ifdef CONFIG_SMP +int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) +{ + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus, ret; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, cpus); + ret = 0; + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + return ret; +} + +bool dl_cpu_busy(unsigned int cpu) +{ + unsigned long flags; + struct dl_bw *dl_b; + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + return overflow; +} +#endif + #ifdef CONFIG_SCHED_DEBUG extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e0329d10bdb8..d4eb3f67529d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -218,9 +218,6 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -extern struct dl_bw *dl_bw_of(int i); -extern int dl_bw_cpus(int i); - struct dl_bw { raw_spinlock_t lock; u64 bw, total_bw; @@ -251,6 +248,20 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) void dl_change_utilization(struct task_struct *p, u64 new_bw); extern void init_dl_bw(struct dl_bw *dl_b); +extern int sched_dl_global_validate(void); +extern void sched_dl_do_global(void); +extern int sched_dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr); +extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern bool __checkparam_dl(const struct sched_attr *attr); +extern void __dl_clear_params(struct task_struct *p); +extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); +extern int dl_task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); +extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern bool dl_cpu_busy(unsigned int cpu); #ifdef CONFIG_CGROUP_SCHED -- cgit v1.3-14-g43fede From 8887cd99038bf242fb47f2d07fa0cf9371efa643 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 21 Jun 2017 14:22:02 -0400 Subject: sched/rt: Move RT related code from sched/core.c to sched/rt.c This helps making sched/core.c smaller and hopefully easier to understand and maintain. Signed-off-by: Nicolas Pitre Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170621182203.30626-3-nicolas.pitre@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 315 --------------------------------------------------- kernel/sched/rt.c | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 5 + 3 files changed, 315 insertions(+), 315 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 54e1b0700af3..5186797908dc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6224,321 +6224,6 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, tsk, &rf); } -#endif /* CONFIG_CGROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) -{ - struct task_struct *g, *p; - - /* - * Autogroups do not have RT tasks; see autogroup_create(). - */ - if (task_group_is_autogroup(tg)) - return 0; - - for_each_process_thread(g, p) { - if (rt_task(p) && task_group(p) == tg) - return 1; - } - - return 0; -} - -struct rt_schedulable_data { - struct task_group *tg; - u64 rt_period; - u64 rt_runtime; -}; - -static int tg_rt_schedulable(struct task_group *tg, void *data) -{ - struct rt_schedulable_data *d = data; - struct task_group *child; - unsigned long total, sum = 0; - u64 period, runtime; - - period = ktime_to_ns(tg->rt_bandwidth.rt_period); - runtime = tg->rt_bandwidth.rt_runtime; - - if (tg == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - /* - * Cannot have more runtime than the period. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - - /* - * Ensure we don't starve existing RT tasks. - */ - if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) - return -EBUSY; - - total = to_ratio(period, runtime); - - /* - * Nobody can have more than the global setting allows. - */ - if (total > to_ratio(global_rt_period(), global_rt_runtime())) - return -EINVAL; - - /* - * The sum of our children's runtime should not exceed our own. - */ - list_for_each_entry_rcu(child, &tg->children, siblings) { - period = ktime_to_ns(child->rt_bandwidth.rt_period); - runtime = child->rt_bandwidth.rt_runtime; - - if (child == d->tg) { - period = d->rt_period; - runtime = d->rt_runtime; - } - - sum += to_ratio(period, runtime); - } - - if (sum > total) - return -EINVAL; - - return 0; -} - -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - int ret; - - struct rt_schedulable_data data = { - .tg = tg, - .rt_period = period, - .rt_runtime = runtime, - }; - - rcu_read_lock(); - ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); - rcu_read_unlock(); - - return ret; -} - -static int tg_set_rt_bandwidth(struct task_group *tg, - u64 rt_period, u64 rt_runtime) -{ - int i, err = 0; - - /* - * Disallowing the root group RT runtime is BAD, it would disallow the - * kernel creating (and or operating) RT threads. - */ - if (tg == &root_task_group && rt_runtime == 0) - return -EINVAL; - - /* No period doesn't make any sense. */ - if (rt_period == 0) - return -EINVAL; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - err = __rt_schedulable(tg, rt_period, rt_runtime); - if (err) - goto unlock; - - raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); - tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); - tg->rt_bandwidth.rt_runtime = rt_runtime; - - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = tg->rt_rq[i]; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = rt_runtime; - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); -unlock: - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return err; -} - -static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) -{ - u64 rt_runtime, rt_period; - - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us < 0) - rt_runtime = RUNTIME_INF; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_runtime(struct task_group *tg) -{ - u64 rt_runtime_us; - - if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) - return -1; - - rt_runtime_us = tg->rt_bandwidth.rt_runtime; - do_div(rt_runtime_us, NSEC_PER_USEC); - return rt_runtime_us; -} - -static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) -{ - u64 rt_runtime, rt_period; - - rt_period = rt_period_us * NSEC_PER_USEC; - rt_runtime = tg->rt_bandwidth.rt_runtime; - - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); -} - -static long sched_group_rt_period(struct task_group *tg) -{ - u64 rt_period_us; - - rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); - do_div(rt_period_us, NSEC_PER_USEC); - return rt_period_us; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -#ifdef CONFIG_RT_GROUP_SCHED -static int sched_rt_global_constraints(void) -{ - int ret = 0; - - mutex_lock(&rt_constraints_mutex); - read_lock(&tasklist_lock); - ret = __rt_schedulable(NULL, 0, 0); - read_unlock(&tasklist_lock); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} - -static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) -{ - /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) - return 0; - - return 1; -} - -#else /* !CONFIG_RT_GROUP_SCHED */ -static int sched_rt_global_constraints(void) -{ - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - - return 0; -} -#endif /* CONFIG_RT_GROUP_SCHED */ - -static int sched_rt_global_validate(void) -{ - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && - (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) - return -EINVAL; - - return 0; -} - -static void sched_rt_do_global(void) -{ - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); -} - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int old_period, old_runtime; - static DEFINE_MUTEX(mutex); - int ret; - - mutex_lock(&mutex); - old_period = sysctl_sched_rt_period; - old_runtime = sysctl_sched_rt_runtime; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - - if (!ret && write) { - ret = sched_rt_global_validate(); - if (ret) - goto undo; - - ret = sched_dl_global_validate(); - if (ret) - goto undo; - - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); - sched_dl_do_global(); - } - if (0) { -undo: - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } - mutex_unlock(&mutex); - - return ret; -} - -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - int ret; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* - * Make sure that internally we keep jiffies. - * Also, writing zero resets the timeslice to default: - */ - if (!ret && write) { - sched_rr_timeslice = - sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : - msecs_to_jiffies(sysctl_sched_rr_timeslice); - } - mutex_unlock(&mutex); - return ret; -} - -#ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 581d5c7a5264..45caf937ef90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2449,6 +2449,316 @@ const struct sched_class rt_sched_class = { .update_curr = update_curr_rt, }; +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) + return 1; + } + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + int ret; + + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + + ret = sched_dl_global_validate(); + if (ret) + goto undo; + + ret = sched_rt_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } + mutex_unlock(&mutex); + + return ret; +} + +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* + * Make sure that internally we keep jiffies. + * Also, writing zero resets the timeslice to default: + */ + if (!ret && write) { + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d4eb3f67529d..eeef1a3086d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -383,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); +extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); +extern long sched_group_rt_runtime(struct task_group *tg); +extern long sched_group_rt_period(struct task_group *tg); +extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_online_group(struct task_group *tg, -- cgit v1.3-14-g43fede From 239946314e57711d7da546b67964d0b387a3ee42 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 22 Jun 2017 15:07:39 -0700 Subject: bpf: possibly avoid extra masking for narrower load in verifier Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") permits narrower load for certain ctx fields. The commit however will already generate a masking even if the prog-specific ctx conversion produces the result with narrower size. For example, for __sk_buff->protocol, the ctx conversion loads the data into register with 2-byte load. A narrower 2-byte load should not generate masking. For __sk_buff->vlan_present, the conversion function set the result as either 0 or 1, essentially a byte. The narrower 2-byte or 1-byte load should not generate masking. To avoid unnecessary masking, prog-specific *_is_valid_access now passes converted_op_size back to verifier, which indicates the valid data width after perceived future conversion. Based on this information, verifier is able to avoid unnecessary marking. Since we want more information back from prog-specific *_is_valid_access checking, all of them are packed into one data structure for more clarity. Acked-by: Daniel Borkmann Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/bpf.h | 11 +++++- include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 29 ++++++++++---- kernel/trace/bpf_trace.c | 17 +++++--- net/core/filter.c | 92 +++++++++++++++++++++++++------------------- 5 files changed, 97 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1bcbf0a71f75..deca4e7f2845 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -149,6 +149,15 @@ enum bpf_reg_type { struct bpf_prog; +/* The information passed from prog-specific *_is_valid_access + * back to the verifier. + */ +struct bpf_insn_access_aux { + enum bpf_reg_type reg_type; + int ctx_field_size; + int converted_op_size; +}; + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -157,7 +166,7 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size); + struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 189741c0da85..621076f56251 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -73,7 +73,8 @@ struct bpf_insn_aux_data { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; - int ctx_field_size; /* the ctx field size for load/store insns, maybe 0 */ + int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ + int converted_op_size; /* the valid value width after perceived conversion */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 44b97d958fb7..74ea96ea391b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -761,22 +761,34 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - int ctx_field_size = 0; + struct bpf_insn_access_aux info = { .reg_type = *reg_type }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) return 0; if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) { - /* a non zero ctx_field_size indicates: + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + /* a non zero info.ctx_field_size indicates: * . For this field, the prog type specific ctx conversion algorithm * only supports whole field access. * . This ctx access is a candiate for later verifier transformation * to load the whole field and then apply a mask to get correct result. + * a non zero info.converted_op_size indicates perceived actual converted + * value width in convert_ctx_access. */ - if (ctx_field_size) - env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size; + if ((info.ctx_field_size && !info.converted_op_size) || + (!info.ctx_field_size && info.converted_op_size)) { + verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", + env->prog->type, off, size); + return -EACCES; + } + + if (info.ctx_field_size) { + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; + } + *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3388,7 +3400,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0; + int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3431,7 +3443,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) off = insn->off; size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - is_narrower_load = (type == BPF_READ && size < ctx_field_size); + converted_op_size = env->insn_aux_data[i + delta].converted_op_size; + is_narrower_load = type == BPF_READ && size < ctx_field_size; /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific @@ -3453,7 +3466,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load) { + if (is_narrower_load && size < converted_op_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9d3ec8253131..97c46b440cd6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -479,7 +479,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) return false; @@ -562,7 +562,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; @@ -581,7 +581,7 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, int *ctx_field_size) + struct bpf_insn_access_aux *info) { int sample_period_off; @@ -595,12 +595,17 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - *ctx_field_size = 8; + int allowed; + #ifdef __LITTLE_ENDIAN - return (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; + allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; #else - return ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; + allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; #endif + if (!allowed) + return false; + info->ctx_field_size = 8; + info->converted_op_size = 8; } else { if (size != sizeof(long)) return false; diff --git a/net/core/filter.c b/net/core/filter.c index 60ed6f343a63..4b788007415f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2856,8 +2856,37 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } +static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info) +{ + info->ctx_field_size = 4; + switch (off) { + case offsetof(struct __sk_buff, pkt_type) ... + offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_present) ... + offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1: + info->converted_op_size = 1; + break; + case offsetof(struct __sk_buff, queue_mapping) ... + offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, protocol) ... + offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_tci) ... + offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, vlan_proto) ... + offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, tc_index) ... + offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1: + case offsetof(struct __sk_buff, tc_classid) ... + offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + info->converted_op_size = 2; + break; + default: + info->converted_op_size = 4; + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2875,24 +2904,32 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, break; case offsetof(struct __sk_buff, data) ... offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: + if (size != sizeof(__u32)) + return false; + info->reg_type = PTR_TO_PACKET; + break; case offsetof(struct __sk_buff, data_end) ... offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: if (size != sizeof(__u32)) return false; + info->reg_type = PTR_TO_PACKET_END; break; default: - /* permit narrower load for not cb/data/data_end fields */ - *ctx_field_size = 4; if (type == BPF_WRITE) { if (size != sizeof(__u32)) return false; } else { - if (size != sizeof(__u32)) + int allowed; + + /* permit narrower load for not cb/data/data_end fields */ #ifdef __LITTLE_ENDIAN - return (off & 0x3) == 0 && (size == 1 || size == 2); + allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0; #else - return (off & 0x3) + size == 4 && (size == 1 || size == 2); + allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0; #endif + if (!allowed) + return false; + __set_access_aux_info(off, info); } } @@ -2901,8 +2938,7 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { switch (off) { case offsetof(struct __sk_buff, tc_classid) ... @@ -2924,13 +2960,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { switch (off) { case offsetof(struct __sk_buff, tc_classid) ... @@ -2950,22 +2985,12 @@ static bool lwt_is_valid_access(int off, int size, } } - switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; - break; - } - - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -3028,8 +3053,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -3045,16 +3069,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, } } - switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; - break; - } - - return __is_valid_access(off, size, type, ctx_field_size); + return __is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3071,18 +3086,17 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type, - int *ctx_field_size) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; switch (off) { case offsetof(struct xdp_md, data): - *reg_type = PTR_TO_PACKET; + info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_end): - *reg_type = PTR_TO_PACKET_END; + info->reg_type = PTR_TO_PACKET_END; break; } -- cgit v1.3-14-g43fede From 739294fb03f590401bbd7faa6d31a507e3ffada5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:27 -0400 Subject: sched/numa: Override part of migrate_degrades_locality() when idle balancing Several tests in the NAS benchmark seem to run a lot slower with NUMA balancing enabled, than with NUMA balancing disabled. The slower run time corresponds with increased idle time. Overriding the final test of migrate_degrades_locality (but still doing the other NUMA tests first) seems to improve performance of those benchmarks. Reported-by: Jirka Hladky Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-2-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 694c258b8771..6e0c0524131e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6688,6 +6688,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (env->idle != CPU_NOT_IDLE) + return -1; + if (numa_group) { src_faults = group_faults(p, src_nid); dst_faults = group_faults(p, dst_nid); -- cgit v1.3-14-g43fede From 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:28 -0400 Subject: sched/fair: Simplify wake_affine() for the single socket case Then 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling() will do its thing regardless of the return value of wake_affine(). Just return true and don't look at all the other things. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-3-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e0c0524131e..fe1901686fa5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5419,6 +5419,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); + /* + * Common case: CPUs are in the same socket, and select_idle_sibling() + * will do its thing regardless of what we return: + */ + if (cpus_share_cache(prev_cpu, this_cpu)) + return true; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -6007,11 +6014,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) + if (cpu == prev_cpu) + goto pick_cpu; + + if (wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } if (!sd) { + pick_cpu: if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.3-14-g43fede From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:29 -0400 Subject: sched/numa: Implement NUMA node level wake_affine() Since select_idle_sibling() can place a task anywhere on a socket, comparing loads between individual CPU cores makes no real sense for deciding whether to do an affine wakeup across sockets, either. Instead, compare the load between the sockets in a similar way the load balancer and the numa balancing code do. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe1901686fa5..79ac078caf5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2586,6 +2586,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } } + +/* + * Can a task be moved from prev_cpu to this_cpu without causing a load + * imbalance that would trigger the load balancer? + */ +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + struct numa_stats prev_load, this_load; + s64 this_eff_load, prev_eff_load; + + update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); + update_numa_stats(&this_load, cpu_to_node(this_cpu)); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) { + unsigned long current_load = task_h_load(current); + + if (this_load.load > current_load) + this_load.load -= current_load; + else + this_load.load = 0; + } + + /* + * In low-load situations, where this_cpu's node is idle due to the + * sync cause above having dropped this_load.load to 0, move the task. + * Moving to an idle socket will not create a bad imbalance. + * + * Otherwise check if the nodes are near enough in load to allow this + * task to be woken on this_cpu's node. + */ + if (this_load.load > 0) { + unsigned long task_load = task_h_load(p); + + this_eff_load = 100; + this_eff_load *= prev_load.compute_capacity; + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= this_load.compute_capacity; + + this_eff_load *= this_load.load + task_load; + prev_eff_load *= prev_load.load - task_load; + + return this_eff_load <= prev_eff_load; + } + + return true; +} #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2598,6 +2652,13 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } + +static inline bool numa_wake_affine(struct sched_domain *sd, + struct task_struct *p, int this_cpu, + int prev_cpu, int sync) +{ + return true; +} #endif /* CONFIG_NUMA_BALANCING */ static void @@ -5407,74 +5468,25 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { - s64 this_load, load; - s64 this_eff_load, prev_eff_load; - int idx, this_cpu; - struct task_group *tg; - unsigned long weight; - int balanced; - - idx = sd->wake_idx; - this_cpu = smp_processor_id(); - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); + int this_cpu = smp_processor_id(); + bool affine = false; /* * Common case: CPUs are in the same socket, and select_idle_sibling() * will do its thing regardless of what we return: */ if (cpus_share_cache(prev_cpu, this_cpu)) - return true; - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) { - tg = task_group(current); - weight = current->se.avg.load_avg; - - this_load += effective_load(tg, this_cpu, -weight, -weight); - load += effective_load(tg, prev_cpu, 0, -weight); - } - - tg = task_group(p); - weight = p->se.avg.load_avg; - - /* - * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped this_load to 0, we'll - * always have an imbalance, but there's really nothing you can do - * about that, so that's good too. - * - * Otherwise check if either cpus are near enough in load to allow this - * task to be woken on this_cpu. - */ - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); - - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); - - if (this_load > 0) { - this_eff_load *= this_load + - effective_load(tg, this_cpu, weight, weight); - - prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); - } - - balanced = this_eff_load <= prev_eff_load; + affine = true; + else + affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + if (affine) { + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + } - if (!balanced) - return 0; - - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - - return 1; + return affine; } static inline int task_util(struct task_struct *p); -- cgit v1.3-14-g43fede From 815abf5af45f04f759f12f3172afd15226fd7f71 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 23 Jun 2017 12:55:30 -0400 Subject: sched/fair: Remove effective_load() The effective_load() function was only used by the NUMA balancing code, and not by the regular load balancing code. Now that the NUMA balancing code no longer uses it either, get rid of it. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jhladky@redhat.com Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170623165530.22514-5-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 124 +--------------------------------------------------- 1 file changed, 1 insertion(+), 123 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79ac078caf5d..6f4f155adf5f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1382,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long capacity_of(int cpu); -static long effective_load(struct task_group *tg, int cpu, long wl, long wg); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3045,8 +3044,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * - * Updating tg's load_avg is necessary before update_cfs_share() (which is - * done) and effective_load() (which is not done because it is too costly). + * Updating tg's load_avg is necessary before update_cfs_share(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { @@ -5298,126 +5296,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * effective_load() calculates the load change as seen from the root_task_group - * - * Adding load to a group doesn't make a group heavier, but can cause movement - * of group shares between cpus. Assuming the shares were perfectly aligned one - * can calculate the shift in shares. - * - * Calculate the effective load difference if @wl is added (subtracted) to @tg - * on this @cpu and results in a total addition (subtraction) of @wg to the - * total group weight. - * - * Given a runqueue weight distribution (rw_i) we can compute a shares - * distribution (s_i) using: - * - * s_i = rw_i / \Sum rw_j (1) - * - * Suppose we have 4 CPUs and our @tg is a direct child of the root group and - * has 7 equal weight tasks, distributed as below (rw_i), with the resulting - * shares distribution (s_i): - * - * rw_i = { 2, 4, 1, 0 } - * s_i = { 2/7, 4/7, 1/7, 0 } - * - * As per wake_affine() we're interested in the load of two CPUs (the CPU the - * task used to run on and the CPU the waker is running on), we need to - * compute the effect of waking a task on either CPU and, in case of a sync - * wakeup, compute the effect of the current task going to sleep. - * - * So for a change of @wl to the local @cpu with an overall group weight change - * of @wl we can compute the new shares distribution (s'_i) using: - * - * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) - * - * Suppose we're interested in CPUs 0 and 1, and want to compute the load - * differences in waking a task to CPU 0. The additional task changes the - * weight and shares distributions like: - * - * rw'_i = { 3, 4, 1, 0 } - * s'_i = { 3/8, 4/8, 1/8, 0 } - * - * We can then compute the difference in effective weight by using: - * - * dw_i = S * (s'_i - s_i) (3) - * - * Where 'S' is the group weight as seen by its parent. - * - * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) - * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - - * 4/7) times the weight of the group. - */ -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - struct sched_entity *se = tg->se[cpu]; - - if (!tg->parent) /* the trivial, non-cgroup case */ - return wl; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = se->my_q; - long W, w = cfs_rq_load_avg(cfs_rq); - - tg = cfs_rq->tg; - - /* - * W = @wg + \Sum rw_j - */ - W = wg + atomic_long_read(&tg->load_avg); - - /* Ensure \Sum rw_j >= rw_i */ - W -= cfs_rq->tg_load_avg_contrib; - W += w; - - /* - * w = rw_i + @wl - */ - w += wl; - - /* - * wl = S * s'_i; see (2) - */ - if (W > 0 && w < W) - wl = (w * (long)scale_load_down(tg->shares)) / W; - else - wl = scale_load_down(tg->shares); - - /* - * Per the above, wl is the new se->load.weight value; since - * those are clipped to [MIN_SHARES, ...) do so now. See - * calc_cfs_shares(). - */ - if (wl < MIN_SHARES) - wl = MIN_SHARES; - - /* - * wl = dw_i = S * (s'_i - s_i); see (3) - */ - wl -= se->avg.load_avg; - - /* - * Recursively apply this logic to all parent groups to compute - * the final effective load change on the root group. Since - * only the @tg group gets extra weight, all parent groups can - * only redistribute existing shares. @wl is the shift in shares - * resulting from this level per the above. - */ - wg = 0; - } - - return wl; -} -#else - -static long effective_load(struct task_group *tg, int cpu, long wl, long wg) -{ - return wl; -} - -#endif - static void record_wakee(struct task_struct *p) { /* -- cgit v1.3-14-g43fede From c2ce34c0a0e5187195ecade872be950d2611ba68 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Jun 2017 11:05:59 +0200 Subject: genirq/debugfs: Remove pointless NULL pointer check debugfs_remove() has it's own NULL pointer check. Remove the conditional and make irq_remove_debugfs_entry() an inline helper Reported-by: kbuild test robot Signed-off-by: Thomas Gleixner --- kernel/irq/debugfs.c | 7 ------- kernel/irq/internals.h | 7 ++++++- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index dbd6e78db213..4d384edc0c64 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -3,7 +3,6 @@ * * This file is licensed under the GPL V2. */ -#include #include #include @@ -191,12 +190,6 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) &dfs_irq_ops); } -void irq_remove_debugfs_entry(struct irq_desc *desc) -{ - if (desc->debugfs_file) - debugfs_remove(desc->debugfs_file); -} - static int __init irq_debugfs_init(void) { struct dentry *root_dir; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5fd105e252c3..a573e0771baf 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -318,8 +318,13 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS +#include + void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); -void irq_remove_debugfs_entry(struct irq_desc *desc); +static inline void irq_remove_debugfs_entry(struct irq_desc *desc) +{ + debugfs_remove(desc->debugfs_file); +} # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -- cgit v1.3-14-g43fede From b2d3d61adb7b73cfe5f82404f7a130a76fc64232 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 23 Jun 2017 16:11:07 +0200 Subject: genirq/timings: Add infrastructure to track the interrupt timings The interrupt framework gives a lot of information about each interrupt. It does not keep track of when those interrupts occur though, which is a prerequisite for estimating the next interrupt arrival for power management purposes. Add a mechanism to record the timestamp for each interrupt occurrences in a per-CPU circular buffer to help with the prediction of the next occurrence using a statistical model. Each CPU can store up to IRQ_TIMINGS_SIZE events , the current value of IRQ_TIMINGS_SIZE is 32. Each event is encoded into a single u64, where the high 48 bits are used for the timestamp and the low 16 bits are for the irq number. A static key is introduced so when the irq prediction is switched off at runtime, the overhead is near to zero. It results in most of the code in internals.h for inline reasons and a very few in the new file timings.c. The latter will contain more in the next patch which will provide the statistical model for the next event prediction. Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Acked-by: Nicolas Pitre Cc: Jens Axboe Cc: Hannes Reinecke Cc: Vincent Guittot Cc: "Rafael J . Wysocki" Cc: Peter Zijlstra Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1498227072-5980-1-git-send-email-daniel.lezcano@linaro.org --- include/linux/interrupt.h | 5 +++ kernel/irq/Kconfig | 3 ++ kernel/irq/Makefile | 1 + kernel/irq/handle.c | 2 ++ kernel/irq/internals.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 3 ++ kernel/irq/timings.c | 30 ++++++++++++++++ 7 files changed, 134 insertions(+) create mode 100644 kernel/irq/timings.c (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a6fba4804672..9f617238a2f7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -703,6 +703,11 @@ static inline void init_irq_proc(void) } #endif +#ifdef CONFIG_IRQ_TIMINGS +void irq_timings_enable(void); +void irq_timings_disable(void); +#endif + struct seq_file; int show_interrupts(struct seq_file *p, void *v); int arch_show_interrupts(struct seq_file *p, int prec); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index fcbb1d6d51cb..27c4e774071c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -85,6 +85,9 @@ config GENERIC_MSI_IRQ_DOMAIN config HANDLE_DOMAIN_IRQ bool +config IRQ_TIMINGS + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c61fc9c2d1f7..e4aef7351f2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,6 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-$(CONFIG_IRQ_TIMINGS) += timings.o obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..eb4d3e8945b8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,6 +138,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags unsigned int irq = desc->irq_data.irq; struct irqaction *action; + record_irq_time(desc); + for_each_action_of_desc(desc, action) { irqreturn_t res; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a573e0771baf..b95b74920433 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -57,6 +58,7 @@ enum { IRQS_WAITING = 0x00000080, IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, + IRQS_TIMINGS = 0x00001000, }; #include "debug.h" @@ -255,6 +257,94 @@ static inline void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } #endif +#ifdef CONFIG_IRQ_TIMINGS + +#define IRQ_TIMINGS_SHIFT 5 +#define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT) +#define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1) + +/** + * struct irq_timings - irq timings storing structure + * @values: a circular buffer of u64 encoded values + * @count: the number of elements in the array + */ +struct irq_timings { + u64 values[IRQ_TIMINGS_SIZE]; + int count; +}; + +DECLARE_PER_CPU(struct irq_timings, irq_timings); + +static inline void irq_remove_timings(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_TIMINGS; +} + +static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) +{ + /* + * We don't need the measurement because the idle code already + * knows the next expiry event. + */ + if (act->flags & __IRQF_TIMER) + return; + + desc->istate |= IRQS_TIMINGS; +} + +extern void irq_timings_enable(void); +extern void irq_timings_disable(void); + +DECLARE_STATIC_KEY_FALSE(irq_timing_enabled); + +/* + * The interrupt number and the timestamp are encoded into a single + * u64 variable to optimize the size. + * 48 bit time stamp and 16 bit IRQ number is way sufficient. + * Who cares an IRQ after 78 hours of idle time? + */ +static inline u64 irq_timing_encode(u64 timestamp, int irq) +{ + return (timestamp << 16) | irq; +} + +static inline int irq_timing_decode(u64 value, u64 *timestamp) +{ + *timestamp = value >> 16; + return value & U16_MAX; +} + +/* + * The function record_irq_time is only called in one place in the + * interrupts handler. We want this function always inline so the code + * inside is embedded in the function and the static key branching + * code can act at the higher level. Without the explicit + * __always_inline we can end up with a function call and a small + * overhead in the hotpath for nothing. + */ +static __always_inline void record_irq_time(struct irq_desc *desc) +{ + if (!static_branch_likely(&irq_timing_enabled)) + return; + + if (desc->istate & IRQS_TIMINGS) { + struct irq_timings *timings = this_cpu_ptr(&irq_timings); + + timings->values[timings->count & IRQ_TIMINGS_MASK] = + irq_timing_encode(local_clock(), + irq_desc_get_irq(desc)); + + timings->count++; + } +} +#else +static inline void irq_remove_timings(struct irq_desc *desc) {} +static inline void irq_setup_timings(struct irq_desc *desc, + struct irqaction *act) {}; +static inline void record_irq_time(struct irq_desc *desc) {} +#endif /* CONFIG_IRQ_TIMINGS */ + + #ifdef CONFIG_GENERIC_IRQ_CHIP void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3577c091ac7b..5c11c1730ba5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1348,6 +1348,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_setup_timings(desc, new); + /* * Strictly no need to wake it up, but hung_task complains * when no hard interrupt wakes the thread up. @@ -1474,6 +1476,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); irq_release_resources(desc); + irq_remove_timings(desc); } #ifdef CONFIG_SMP diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c new file mode 100644 index 000000000000..56cf6870fa26 --- /dev/null +++ b/kernel/irq/timings.c @@ -0,0 +1,30 @@ +/* + * linux/kernel/irq/timings.c + * + * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include + +#include "internals.h" + +DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); + +DEFINE_PER_CPU(struct irq_timings, irq_timings); + +void irq_timings_enable(void) +{ + static_branch_enable(&irq_timing_enabled); +} + +void irq_timings_disable(void) +{ + static_branch_disable(&irq_timing_enabled); +} -- cgit v1.3-14-g43fede From e1c921495534002d727b15a76a2f8c20b6b108b5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 23 Jun 2017 16:11:08 +0200 Subject: genirq/timings: Add infrastructure for estimating the next interrupt arrival time An interrupt behaves with a burst of activity with periodic interval of time followed by one or two peaks of longer interval. As the time intervals are periodic, statistically speaking they follow a normal distribution and each interrupts can be tracked individually. Add a mechanism to compute the statistics on all interrupts, except the timers which are deterministic from a prediction point of view, as their expiry time is known. The goal is to extract the periodicity for each interrupt, with the last timestamp and sum them, so the next event can be predicted to a certain extent. Taking the earliest prediction gives the expected wakeup on the system (assuming a timer won't expire before). Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: Nicolas Pitre Cc: Jens Axboe Cc: Hannes Reinecke Cc: Vincent Guittot Cc: "Rafael J . Wysocki" Cc: Peter Zijlstra Cc: Bjorn Helgaas Link: http://lkml.kernel.org/r/1498227072-5980-2-git-send-email-daniel.lezcano@linaro.org --- include/linux/interrupt.h | 1 + kernel/irq/internals.h | 19 +++ kernel/irq/timings.c | 339 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 359 insertions(+) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9f617238a2f7..37f8e354f564 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -706,6 +706,7 @@ static inline void init_irq_proc(void) #ifdef CONFIG_IRQ_TIMINGS void irq_timings_enable(void); void irq_timings_disable(void); +u64 irq_timings_next_event(u64 now); #endif struct seq_file; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b95b74920433..9da14d125df4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -275,13 +275,21 @@ struct irq_timings { DECLARE_PER_CPU(struct irq_timings, irq_timings); +extern void irq_timings_free(int irq); +extern int irq_timings_alloc(int irq); + static inline void irq_remove_timings(struct irq_desc *desc) { desc->istate &= ~IRQS_TIMINGS; + + irq_timings_free(irq_desc_get_irq(desc)); } static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) { + int irq = irq_desc_get_irq(desc); + int ret; + /* * We don't need the measurement because the idle code already * knows the next expiry event. @@ -289,6 +297,17 @@ static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *ac if (act->flags & __IRQF_TIMER) return; + /* + * In case the timing allocation fails, we just want to warn, + * not fail, so letting the system boot anyway. + */ + ret = irq_timings_alloc(irq); + if (ret) { + pr_warn("Failed to allocate irq timing stats for irq%d (%d)", + irq, ret); + return; + } + desc->istate |= IRQS_TIMINGS; } diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 56cf6870fa26..c8c1d073fbf1 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -8,10 +8,16 @@ * published by the Free Software Foundation. * */ +#include #include +#include #include #include +#include #include +#include + +#include #include "internals.h" @@ -19,6 +25,18 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); +struct irqt_stat { + u64 next_evt; + u64 last_ts; + u64 variance; + u32 avg; + u32 nr_samples; + int anomalies; + int valid; +}; + +static DEFINE_IDR(irqt_stats); + void irq_timings_enable(void) { static_branch_enable(&irq_timing_enabled); @@ -28,3 +46,324 @@ void irq_timings_disable(void) { static_branch_disable(&irq_timing_enabled); } + +/** + * irqs_update - update the irq timing statistics with a new timestamp + * + * @irqs: an irqt_stat struct pointer + * @ts: the new timestamp + * + * The statistics are computed online, in other words, the code is + * designed to compute the statistics on a stream of values rather + * than doing multiple passes on the values to compute the average, + * then the variance. The integer division introduces a loss of + * precision but with an acceptable error margin regarding the results + * we would have with the double floating precision: we are dealing + * with nanosec, so big numbers, consequently the mantisse is + * negligeable, especially when converting the time in usec + * afterwards. + * + * The computation happens at idle time. When the CPU is not idle, the + * interrupts' timestamps are stored in the circular buffer, when the + * CPU goes idle and this routine is called, all the buffer's values + * are injected in the statistical model continuying to extend the + * statistics from the previous busy-idle cycle. + * + * The observations showed a device will trigger a burst of periodic + * interrupts followed by one or two peaks of longer time, for + * instance when a SD card device flushes its cache, then the periodic + * intervals occur again. A one second inactivity period resets the + * stats, that gives us the certitude the statistical values won't + * exceed 1x10^9, thus the computation won't overflow. + * + * Basically, the purpose of the algorithm is to watch the periodic + * interrupts and eliminate the peaks. + * + * An interrupt is considered periodically stable if the interval of + * its occurences follow the normal distribution, thus the values + * comply with: + * + * avg - 3 x stddev < value < avg + 3 x stddev + * + * Which can be simplified to: + * + * -3 x stddev < value - avg < 3 x stddev + * + * abs(value - avg) < 3 x stddev + * + * In order to save a costly square root computation, we use the + * variance. For the record, stddev = sqrt(variance). The equation + * above becomes: + * + * abs(value - avg) < 3 x sqrt(variance) + * + * And finally we square it: + * + * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * + * (value - avg) x (value - avg) < 9 x variance + * + * Statistically speaking, any values out of this interval is + * considered as an anomaly and is discarded. However, a normal + * distribution appears when the number of samples is 30 (it is the + * rule of thumb in statistics, cf. "30 samples" on Internet). When + * there are three consecutive anomalies, the statistics are resetted. + * + */ +static void irqs_update(struct irqt_stat *irqs, u64 ts) +{ + u64 old_ts = irqs->last_ts; + u64 variance = 0; + u64 interval; + s64 diff; + + /* + * The timestamps are absolute time values, we need to compute + * the timing interval between two interrupts. + */ + irqs->last_ts = ts; + + /* + * The interval type is u64 in order to deal with the same + * type in our computation, that prevent mindfuck issues with + * overflow, sign and division. + */ + interval = ts - old_ts; + + /* + * The interrupt triggered more than one second apart, that + * ends the sequence as predictible for our purpose. In this + * case, assume we have the beginning of a sequence and the + * timestamp is the first value. As it is impossible to + * predict anything at this point, return. + * + * Note the first timestamp of the sequence will always fall + * in this test because the old_ts is zero. That is what we + * want as we need another timestamp to compute an interval. + */ + if (interval >= NSEC_PER_SEC) { + memset(irqs, 0, sizeof(*irqs)); + irqs->last_ts = ts; + return; + } + + /* + * Pre-compute the delta with the average as the result is + * used several times in this function. + */ + diff = interval - irqs->avg; + + /* + * Increment the number of samples. + */ + irqs->nr_samples++; + + /* + * Online variance divided by the number of elements if there + * is more than one sample. Normally the formula is division + * by nr_samples - 1 but we assume the number of element will be + * more than 32 and dividing by 32 instead of 31 is enough + * precise. + */ + if (likely(irqs->nr_samples > 1)) + variance = irqs->variance >> IRQ_TIMINGS_SHIFT; + + /* + * The rule of thumb in statistics for the normal distribution + * is having at least 30 samples in order to have the model to + * apply. Values outside the interval are considered as an + * anomaly. + */ + if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { + /* + * After three consecutive anomalies, we reset the + * stats as it is no longer stable enough. + */ + if (irqs->anomalies++ >= 3) { + memset(irqs, 0, sizeof(*irqs)); + irqs->last_ts = ts; + return; + } + } else { + /* + * The anomalies must be consecutives, so at this + * point, we reset the anomalies counter. + */ + irqs->anomalies = 0; + } + + /* + * The interrupt is considered stable enough to try to predict + * the next event on it. + */ + irqs->valid = 1; + + /* + * Online average algorithm: + * + * new_average = average + ((value - average) / count) + * + * The variance computation depends on the new average + * to be computed here first. + * + */ + irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); + + /* + * Online variance algorithm: + * + * new_variance = variance + (value - average) x (value - new_average) + * + * Warning: irqs->avg is updated with the line above, hence + * 'interval - irqs->avg' is no longer equal to 'diff' + */ + irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); + + /* + * Update the next event + */ + irqs->next_evt = ts + irqs->avg; +} + +/** + * irq_timings_next_event - Return when the next event is supposed to arrive + * + * During the last busy cycle, the number of interrupts is incremented + * and stored in the irq_timings structure. This information is + * necessary to: + * + * - know if the index in the table wrapped up: + * + * If more than the array size interrupts happened during the + * last busy/idle cycle, the index wrapped up and we have to + * begin with the next element in the array which is the last one + * in the sequence, otherwise it is a the index 0. + * + * - have an indication of the interrupts activity on this CPU + * (eg. irq/sec) + * + * The values are 'consumed' after inserting in the statistical model, + * thus the count is reinitialized. + * + * The array of values **must** be browsed in the time direction, the + * timestamp must increase between an element and the next one. + * + * Returns a nanosec time based estimation of the earliest interrupt, + * U64_MAX otherwise. + */ +u64 irq_timings_next_event(u64 now) +{ + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + struct irqt_stat *irqs; + struct irqt_stat __percpu *s; + u64 ts, next_evt = U64_MAX; + int i, irq = 0; + + /* + * This function must be called with the local irq disabled in + * order to prevent the timings circular buffer to be updated + * while we are reading it. + */ + WARN_ON_ONCE(!irqs_disabled()); + + /* + * Number of elements in the circular buffer: If it happens it + * was flushed before, then the number of elements could be + * smaller than IRQ_TIMINGS_SIZE, so the count is used, + * otherwise the array size is used as we wrapped. The index + * begins from zero when we did not wrap. That could be done + * in a nicer way with the proper circular array structure + * type but with the cost of extra computation in the + * interrupt handler hot path. We choose efficiency. + * + * Inject measured irq/timestamp to the statistical model + * while decrementing the counter because we consume the data + * from our circular buffer. + */ + for (i = irqts->count & IRQ_TIMINGS_MASK, + irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); + irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + + irq = irq_timing_decode(irqts->values[i], &ts); + + s = idr_find(&irqt_stats, irq); + if (s) { + irqs = this_cpu_ptr(s); + irqs_update(irqs, ts); + } + } + + /* + * Look in the list of interrupts' statistics, the earliest + * next event. + */ + idr_for_each_entry(&irqt_stats, s, i) { + + irqs = this_cpu_ptr(s); + + if (!irqs->valid) + continue; + + if (irqs->next_evt <= now) { + irq = i; + next_evt = now; + + /* + * This interrupt mustn't use in the future + * until new events occur and update the + * statistics. + */ + irqs->valid = 0; + break; + } + + if (irqs->next_evt < next_evt) { + irq = i; + next_evt = irqs->next_evt; + } + } + + return next_evt; +} + +void irq_timings_free(int irq) +{ + struct irqt_stat __percpu *s; + + s = idr_find(&irqt_stats, irq); + if (s) { + free_percpu(s); + idr_remove(&irqt_stats, irq); + } +} + +int irq_timings_alloc(int irq) +{ + struct irqt_stat __percpu *s; + int id; + + /* + * Some platforms can have the same private interrupt per cpu, + * so this function may be be called several times with the + * same interrupt number. Just bail out in case the per cpu + * stat structure is already allocated. + */ + s = idr_find(&irqt_stats, irq); + if (s) + return 0; + + s = alloc_percpu(*s); + if (!s) + return -ENOMEM; + + idr_preload(GFP_KERNEL); + id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); + idr_preload_end(); + + if (id < 0) { + free_percpu(s); + return id; + } + + return 0; +} -- cgit v1.3-14-g43fede From f59dd9c886acb3abb188e8e94a99436560976835 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:02 -0700 Subject: time: add get_timespec64 and put_timespec64 Add helper functions to convert between struct timespec64 and struct timespec at userspace boundaries. This is a preparatory patch to use timespec64 as the basic type internally in the kernel as timespec is not y2038 safe on 32 bit systems. The patch helps the cause by containing all data conversions at the userspace boundaries within these functions. Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/compat.h | 2 ++ include/linux/time.h | 5 +++++ kernel/compat.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/time.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 425563c7647b..3eb04016ffa9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -164,6 +164,8 @@ extern int compat_get_timespec(struct timespec *, const void __user *); extern int compat_put_timespec(const struct timespec *, void __user *); extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); +extern int compat_get_timespec64(struct timespec64 *, const void __user *); +extern int compat_put_timespec64(const struct timespec64 *, void __user *); /* * This function convert a timespec if necessary and returns a *user diff --git a/include/linux/time.h b/include/linux/time.h index c0543f5f25de..36afb579495f 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -8,6 +8,11 @@ extern struct timezone sys_tz; +int get_timespec64(struct timespec64 *ts, + const struct timespec __user *uts); +int put_timespec64(const struct timespec64 *ts, + struct timespec __user *uts); + #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) static inline int timespec_equal(const struct timespec *a, diff --git a/kernel/compat.c b/kernel/compat.c index ebd8bdc3fd68..73f26ba44a8a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +static int __compat_get_timespec64(struct timespec64 *ts64, + const struct compat_timespec __user *cts) +{ + struct compat_timespec ts; + int ret; + + ret = copy_from_user(&ts, cts, sizeof(ts)); + if (ret) + return -EFAULT; + + ts64->tv_sec = ts.tv_sec; + ts64->tv_nsec = ts.tv_nsec; + + return 0; +} + +static int __compat_put_timespec64(const struct timespec64 *ts64, + struct compat_timespec __user *cts) +{ + struct compat_timespec ts = { + .tv_sec = ts64->tv_sec, + .tv_nsec = ts64->tv_nsec + }; + return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; +} + +int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_get_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec64); + +int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; + else + return __compat_put_timespec64(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec64); + int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) diff --git a/kernel/time/time.c b/kernel/time/time.c index 7c89e437c4d7..adb9853ca6b0 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -890,3 +890,31 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } + +int get_timespec64(struct timespec64 *ts, + const struct timespec __user *uts) +{ + struct timespec kts; + int ret; + + ret = copy_from_user(&kts, uts, sizeof(kts)); + if (ret) + return -EFAULT; + + ts->tv_sec = kts.tv_sec; + ts->tv_nsec = kts.tv_nsec; + + return 0; +} +EXPORT_SYMBOL_GPL(get_timespec64); + +int put_timespec64(const struct timespec64 *ts, + struct timespec __user *uts) +{ + struct timespec kts = { + .tv_sec = ts->tv_sec, + .tv_nsec = ts->tv_nsec + }; + return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_timespec64); -- cgit v1.3-14-g43fede From d5b7ffbfbdacc29e4db035f90665951668fa9c58 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:03 -0700 Subject: time: introduce {get,put}_itimerspec64 As we change the user space type for the timerfd and posix timer functions to newer data types, we need some form of conversion helpers to avoid duplicating that logic. Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/compat.h | 4 ++++ include/linux/posix-timers.h | 1 - include/linux/time.h | 13 +++++++++++++ kernel/compat.c | 21 +++++++++++++++++++++ kernel/time/time.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 3eb04016ffa9..2ed54020ace0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -166,6 +166,10 @@ extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); extern int compat_get_timespec64(struct timespec64 *, const void __user *); extern int compat_put_timespec64(const struct timespec64 *, void __user *); +extern int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits); +extern int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits); /* * This function convert a timespec if necessary and returns a *user diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 29f1b7f09ced..62839fd04dce 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -113,5 +113,4 @@ void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); void posixtimer_rearm(struct siginfo *info); - #endif diff --git a/include/linux/time.h b/include/linux/time.h index 36afb579495f..f9858d7e6361 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -12,6 +12,10 @@ int get_timespec64(struct timespec64 *ts, const struct timespec __user *uts); int put_timespec64(const struct timespec64 *ts, struct timespec __user *uts); +int get_itimerspec64(struct itimerspec64 *it, + const struct itimerspec __user *uit); +int put_itimerspec64(const struct itimerspec64 *it, + struct itimerspec __user *uit); #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) @@ -275,4 +279,13 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) a->tv_nsec = ns; } +static inline bool itimerspec64_valid(const struct itimerspec64 *its) +{ + if (!timespec64_valid(&(its->it_interval)) || + !timespec64_valid(&(its->it_value))) + return false; + + return true; +} + #endif diff --git a/kernel/compat.c b/kernel/compat.c index 73f26ba44a8a..a350deda503a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -586,6 +586,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } +int get_compat_itimerspec64(struct itimerspec64 *its, + const struct compat_itimerspec __user *uits) +{ + + if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || + __compat_get_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(get_compat_itimerspec64); + +int put_compat_itimerspec64(const struct itimerspec64 *its, + struct compat_itimerspec __user *uits) +{ + if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || + __compat_put_timespec64(&its->it_value, &uits->it_value)) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(put_compat_itimerspec64); + /* * We currently only need the following fields from the sigevent * structure: sigev_value, sigev_signo, sig_notify and (sometimes diff --git a/kernel/time/time.c b/kernel/time/time.c index adb9853ca6b0..44a8c1402133 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -918,3 +918,33 @@ int put_timespec64(const struct timespec64 *ts, return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(put_timespec64); + +int get_itimerspec64(struct itimerspec64 *it, + const struct itimerspec __user *uit) +{ + int ret; + + ret = get_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = get_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(get_itimerspec64); + +int put_itimerspec64(const struct itimerspec64 *it, + struct itimerspec __user *uit) +{ + int ret; + + ret = put_timespec64(&it->it_interval, &uit->it_interval); + if (ret) + return ret; + + ret = put_timespec64(&it->it_value, &uit->it_value); + + return ret; +} +EXPORT_SYMBOL_GPL(put_itimerspec64); -- cgit v1.3-14-g43fede From 63a766a1780f9581e8885bdb64270a594a84f81a Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:04 -0700 Subject: posix-stubs: Conditionally include COMPAT_SYS_NI defines These apis only need to be defined if CONFIG_COMPAT is enabled. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 38f3b20efa29..65878221cbfb 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -41,12 +41,6 @@ SYS_NI(setitimer); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif -COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC @@ -138,6 +132,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, } #ifdef CONFIG_COMPAT +COMPAT_SYS_NI(timer_create); +COMPAT_SYS_NI(clock_adjtime); +COMPAT_SYS_NI(timer_settime); +COMPAT_SYS_NI(timer_gettime); +COMPAT_SYS_NI(getitimer); +COMPAT_SYS_NI(setitimer); + COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { -- cgit v1.3-14-g43fede From d829b8fb2431595422289cfc210f0a955a8bec74 Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Mon, 26 Jun 2017 19:33:33 +0800 Subject: genirq: Set irq masked state when initializing irq_desc The irq default state is set to disabled when allocating irq desc, but the masked state flag is not set. This is inconsistent vs. the state tracking logic which is used to prevent unnecessary calls to hardware level irq chip functions. Set the masked state flag as well. Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: tfiga@chromium.org Cc: briannorris@chromium.org Cc: dianders@chromium.org Link: http://lkml.kernel.org/r/1498476814-12563-1-git-send-email-jeffy.chen@rock-chips.com --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 35a95fadcfda..948b50e78549 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -116,6 +116,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; -- cgit v1.3-14-g43fede From bf22ff45bed664aefb5c4e43029057a199b7070c Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Mon, 26 Jun 2017 19:33:34 +0800 Subject: genirq: Avoid unnecessary low level irq function calls Check irq state in enable/disable/unmask/mask_irq to avoid unnecessary low level irq function calls. This has two advantages: - Conditionals are faster than hardware access - Solves issues with the underlying refcounting of the pinctrl infrastructure Suggested-by: Thomas Gleixner Signed-off-by: Jeffy Chen Signed-off-by: Thomas Gleixner Cc: tfiga@chromium.org Cc: briannorris@chromium.org Cc: dianders@chromium.org Link: http://lkml.kernel.org/r/1498476814-12563-2-git-send-email-jeffy.chen@rock-chips.com --- kernel/irq/chip.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc89eeb8a6b4..2e30d925a40d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -314,22 +314,32 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) - desc->irq_data.chip->irq_enable(&desc->irq_data); - else - desc->irq_data.chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } } static void __irq_disable(struct irq_desc *desc, bool mask) { - irq_state_set_disabled(desc); - if (desc->irq_data.chip->irq_disable) { - desc->irq_data.chip->irq_disable(&desc->irq_data); - irq_state_set_masked(desc); - } else if (mask) { - mask_irq(desc); + if (irqd_irq_disabled(&desc->irq_data)) { + if (mask) + mask_irq(desc); + } else { + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } else if (mask) { + mask_irq(desc); + } } } @@ -378,18 +388,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->irq_data.chip->irq_mask_ack) + if (desc->irq_data.chip->irq_mask_ack) { desc->irq_data.chip->irq_mask_ack(&desc->irq_data); - else { - desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); + } else { + mask_irq(desc); if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - irq_state_set_masked(desc); } void mask_irq(struct irq_desc *desc) { + if (irqd_irq_masked(&desc->irq_data)) + return; + if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); irq_state_set_masked(desc); @@ -398,6 +411,9 @@ void mask_irq(struct irq_desc *desc) void unmask_irq(struct irq_desc *desc) { + if (!irqd_irq_masked(&desc->irq_data)) + return; + if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); @@ -411,10 +427,7 @@ void unmask_threaded_irq(struct irq_desc *desc) if (chip->flags & IRQCHIP_EOI_THREADED) chip->irq_eoi(&desc->irq_data); - if (chip->irq_unmask) { - chip->irq_unmask(&desc->irq_data); - irq_state_clr_masked(desc); - } + unmask_irq(desc); } /* -- cgit v1.3-14-g43fede From 1ba5c08b58a0c21fca222f1bf2fde184aa26103f Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 6 Jun 2017 14:17:39 +0200 Subject: kernel/module.c: suppress warning about unused nowarn variable This patch fix the following warning: kernel/module.c: In function 'add_usage_links': kernel/module.c:1653:6: warning: variable 'nowarn' set but not used [-Wunused-but-set-variable] [jeyu: folded in first patch since it only swapped the function order so that del_usage_links can be called from add_usage_links] Signed-off-by: Corentin Labbe Signed-off-by: Jessica Yu --- kernel/module.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3803449ca219..f546d574f436 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1666,31 +1666,36 @@ static inline void remove_notes_attrs(struct module *mod) } #endif /* CONFIG_KALLSYMS */ -static void add_usage_links(struct module *mod) +static void del_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; - int nowarn; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) { - nowarn = sysfs_create_link(use->target->holders_dir, - &mod->mkobj.kobj, mod->name); - } + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); mutex_unlock(&module_mutex); #endif } -static void del_usage_links(struct module *mod) +static int add_usage_links(struct module *mod) { + int ret = 0; #ifdef CONFIG_MODULE_UNLOAD struct module_use *use; mutex_lock(&module_mutex); - list_for_each_entry(use, &mod->target_list, target_list) - sysfs_remove_link(use->target->holders_dir, mod->name); + list_for_each_entry(use, &mod->target_list, target_list) { + ret = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + if (ret) + break; + } mutex_unlock(&module_mutex); + if (ret) + del_usage_links(mod); #endif + return ret; } static int module_add_modinfo_attrs(struct module *mod) @@ -1801,13 +1806,18 @@ static int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; - add_usage_links(mod); + err = add_usage_links(mod); + if (err) + goto out_unreg_modinfo_attrs; + add_sect_attrs(mod, info); add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; +out_unreg_modinfo_attrs: + module_remove_modinfo_attrs(mod); out_unreg_param: module_param_sysfs_remove(mod); out_unreg_holders: -- cgit v1.3-14-g43fede From 673feb9d76ab3eddde7acfd94b206e321cfc90b9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 15:26:26 -0400 Subject: ftrace: Add :mod: caching infrastructure to trace_array This is the start of the infrastructure work to allow for tracing module functions before it is loaded. Currently the following command: # echo :mod:some-mod > set_ftrace_filter will enable tracing of all functions within the module "some-mod" if it is loaded. What we want, is if the module is not loaded, that line will be saved. When the module is loaded, then the "some-mod" will have that line executed on it, so that the functions within it starts being traced. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 12 +++++ 2 files changed, 148 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e5841dc14b5..1867edec6269 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1293,6 +1293,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) FTRACE_WARN_ON(hash->count); } +static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) +{ + list_del(&ftrace_mod->list); + kfree(ftrace_mod->module); + kfree(ftrace_mod->func); + kfree(ftrace_mod); +} + +static void clear_ftrace_mod_list(struct list_head *head) +{ + struct ftrace_mod_load *p, *n; + + /* stack tracer isn't supported yet */ + if (!head) + return; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(p, n, head, list) + free_ftrace_mod(p); + mutex_unlock(&ftrace_lock); +} + static void free_ftrace_hash(struct ftrace_hash *hash) { if (!hash || hash == EMPTY_HASH) @@ -1346,6 +1368,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return hash; } + +static int ftrace_add_mod(struct trace_array *tr, + const char *func, const char *module, + int enable) +{ + struct ftrace_mod_load *ftrace_mod; + struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; + + ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + if (!ftrace_mod) + return -ENOMEM; + + ftrace_mod->func = kstrdup(func, GFP_KERNEL); + ftrace_mod->module = kstrdup(module, GFP_KERNEL); + ftrace_mod->enable = enable; + + if (!ftrace_mod->func || !ftrace_mod->module) + goto out_free; + + list_add(&ftrace_mod->list, mod_head); + + return 0; + + out_free: + free_ftrace_mod(ftrace_mod); + + return -ENOMEM; +} + static struct ftrace_hash * alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) { @@ -3457,6 +3508,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, { struct ftrace_iterator *iter; struct ftrace_hash *hash; + struct list_head *mod_head; + struct trace_array *tr = ops->private; int ret = 0; ftrace_ops_init(ops); @@ -3478,18 +3531,23 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, mutex_lock(&ops->func_hash->regex_lock); - if (flag & FTRACE_ITER_NOTRACE) + if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - else + mod_head = tr ? &tr->mod_trace : NULL; + } else { hash = ops->func_hash->filter_hash; + mod_head = tr ? &tr->mod_notrace : NULL; + } if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; - if (file->f_flags & O_TRUNC) + if (file->f_flags & O_TRUNC) { iter->hash = alloc_ftrace_hash(size_bits); - else + clear_ftrace_mod_list(mod_head); + } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + } if (!iter->hash) { trace_parser_put(&iter->parser); @@ -3761,6 +3819,68 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return ret; } +static bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + const char this_mod[] = "__this_module"; + const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; + char modname[modname_size + 1]; + unsigned long val; + int n; + + n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); + + if (n > modname_size) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static int cache_mod(struct trace_array *tr, + const char *func, char *module, int enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; + int ret; + + mutex_lock(&ftrace_lock); + + /* We do not cache inverse filters */ + if (func[0] == '!') { + func++; + ret = -EINVAL; + + /* Look to remove this hash */ + list_for_each_entry_safe(ftrace_mod, n, head, list) { + if (strcmp(ftrace_mod->module, module) != 0) + continue; + + /* no func matches all */ + if (!func || strcmp(func, "*") == 0 || + (ftrace_mod->func && + strcmp(ftrace_mod->func, func) == 0)) { + ret = 0; + free_ftrace_mod(ftrace_mod); + continue; + } + } + goto out; + } + + ret = -EINVAL; + /* We only care about modules that have not been loaded yet */ + if (module_exists(module)) + goto out; + + /* Save this string off, and execute it when the module is loaded */ + ret = ftrace_add_mod(tr, func, module, enable); + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -3768,10 +3888,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static int ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *func, char *cmd, char *module, int enable) + char *func_orig, char *cmd, char *module, int enable) { + char *func; int ret; + /* match_records() modifies func, and we need the original */ + func = kstrdup(func_orig, GFP_KERNEL); + if (!func) + return -ENOMEM; + /* * cmd == 'mod' because we only registered this func * for the 'mod' ftrace_func_command. @@ -3780,8 +3906,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, * parameter. */ ret = match_records(hash, func, strlen(func), module); + kfree(func); + if (!ret) - return -EINVAL; + return cache_mod(tr, func_orig, module, enable); if (ret < 0) return ret; return 0; @@ -5570,6 +5698,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { INIT_LIST_HEAD(&tr->func_probes); + INIT_LIST_HEAD(&tr->mod_trace); + INIT_LIST_HEAD(&tr->mod_notrace); } #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 69a3ab3ee4f5..d63550cdbdfa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -263,7 +263,10 @@ struct trace_array { struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; #ifdef CONFIG_DYNAMIC_FTRACE + /* All of these are protected by the ftrace_lock */ struct list_head func_probes; + struct list_head mod_trace; + struct list_head mod_notrace; #endif /* function tracing enabled */ int function_enabled; @@ -761,6 +764,15 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); +struct ftrace_hash; + +struct ftrace_mod_load { + struct list_head list; + char *func; + char *module; + int enable; +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; -- cgit v1.3-14-g43fede From 5985ea8bd5d1b820b909af49fbc2767a990080a6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 23 Jun 2017 16:05:11 -0400 Subject: ftrace: Have the cached module list show in set_ftrace_filter When writing in a module filter into set_ftrace_filter for a module that is not yet loaded, it it cached, and will be executed when the module is loaded (although that is not implemented yet at this commit). Display the list of cached modules to be traced. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 3 +- kernel/trace/ftrace.c | 112 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 102 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1b6992e994e6..9fb9a67dc9d4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -446,7 +446,8 @@ enum { FTRACE_ITER_PRINTALL = (1 << 2), FTRACE_ITER_DO_PROBES = (1 << 3), FTRACE_ITER_PROBE = (1 << 4), - FTRACE_ITER_ENABLED = (1 << 5), + FTRACE_ITER_MOD = (1 << 5), + FTRACE_ITER_ENABLED = (1 << 6), }; void arch_ftrace_update_code(int command); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1867edec6269..bfdbce78064b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3112,6 +3112,7 @@ ftrace_allocate_pages(unsigned long num_to_init) struct ftrace_iterator { loff_t pos; loff_t func_pos; + loff_t mod_pos; struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; @@ -3119,6 +3120,8 @@ struct ftrace_iterator { struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; + struct trace_array *tr; + struct list_head *mod_list; int pidx; int idx; unsigned flags; @@ -3203,13 +3206,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; - if (iter->func_pos > *pos) + if (iter->mod_pos > *pos) return NULL; iter->probe = NULL; iter->probe_entry = NULL; iter->pidx = 0; - for (l = 0; l <= (*pos - iter->func_pos); ) { + for (l = 0; l <= (*pos - iter->mod_pos); ) { p = t_probe_next(m, &l); if (!p) break; @@ -3247,6 +3250,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) return 0; } +static void * +t_mod_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + + (*pos)++; + iter->pos = *pos; + + iter->mod_list = iter->mod_list->next; + + if (iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) { + iter->flags &= ~FTRACE_ITER_MOD; + return NULL; + } + + iter->mod_pos = *pos; + + return iter; +} + +static void *t_mod_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->mod_pos = iter->func_pos; + + /* probes are only available if tr is set */ + if (!iter->tr) + return NULL; + + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_mod_next(m, &l); + if (!p) + break; + } + if (!p) { + iter->flags &= ~FTRACE_ITER_MOD; + return t_probe_start(m, pos); + } + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_MOD; + + return iter; +} + +static int +t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_mod_load *ftrace_mod; + struct trace_array *tr = iter->tr; + + if (WARN_ON_ONCE(!iter->mod_list) || + iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) + return -EIO; + + ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); + + if (ftrace_mod->func) + seq_printf(m, "%s", ftrace_mod->func); + else + seq_putc(m, '*'); + + seq_printf(m, ":mod:%s\n", ftrace_mod->module); + + return 0; +} + static void * t_func_next(struct seq_file *m, loff_t *pos) { @@ -3288,7 +3367,7 @@ static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; - loff_t l = *pos; /* t_hash_start() must use original pos */ + loff_t l = *pos; /* t_probe_start() must use original pos */ void *ret; if (unlikely(ftrace_disabled)) @@ -3297,16 +3376,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_next(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_next(m, pos); + if (iter->flags & FTRACE_ITER_PRINTALL) { /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_probe_start(m, &l); + return t_mod_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_probe_start(m, &l); + return t_mod_start(m, &l); return ret; } @@ -3315,7 +3397,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3344,15 +3426,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_probe_start(m, pos); + return t_mod_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_PROBE) - return t_probe_start(m, pos); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3368,7 +3450,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_probe_start(m, pos); + return t_mod_start(m, pos); return iter; } @@ -3402,6 +3484,9 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_PROBE) return t_probe_show(m, iter); + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_show(m, iter); + if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) seq_puts(m, "#### no functions disabled ####\n"); @@ -3528,17 +3613,20 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; + iter->tr = tr; mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) { hash = ops->func_hash->notrace_hash; - mod_head = tr ? &tr->mod_trace : NULL; + mod_head = tr ? &tr->mod_notrace : NULL; } else { hash = ops->func_hash->filter_hash; - mod_head = tr ? &tr->mod_notrace : NULL; + mod_head = tr ? &tr->mod_trace : NULL; } + iter->mod_list = mod_head; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; -- cgit v1.3-14-g43fede From d7fbf8df7ca0a5c7e85db79f7005f99cb461c525 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 10:57:21 -0400 Subject: ftrace: Implement cached modules tracing on module load If a module is cached in the set_ftrace_filter, and that module is loaded, then enable tracing on that module as if the cached module text was written into set_ftrace_filter just as the module is loaded. # echo ":mod:kvm_intel" > # cat /sys/kernel/tracing/set_ftrace_filter #### all functions enabled #### :mod:kvm_intel # modprobe kvm_intel # cat /sys/kernel/tracing/set_ftrace_filter vmx_get_rflags [kvm_intel] vmx_get_pkru [kvm_intel] vmx_get_interrupt_shadow [kvm_intel] vmx_rdtscp_supported [kvm_intel] vmx_invpcid_supported [kvm_intel] [..] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bfdbce78064b..f1ccf8be9df7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3969,6 +3969,97 @@ static int cache_mod(struct trace_array *tr, return ret; } +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable); + +static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, + char *mod, bool enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct ftrace_hash **orig_hash, *new_hash; + LIST_HEAD(process_mods); + char *func; + int ret; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, + *orig_hash); + if (!new_hash) + return; /* Warn? */ + + mutex_lock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, head, list) { + + if (strcmp(ftrace_mod->module, mod) != 0) + continue; + + if (ftrace_mod->func) + func = kstrdup(ftrace_mod->func, GFP_KERNEL); + else + func = kstrdup("*", GFP_KERNEL); + + if (!func) /* warn? */ + continue; + + list_del(&ftrace_mod->list); + list_add(&ftrace_mod->list, &process_mods); + + /* Use the newly allocated func, as it may be "*" */ + kfree(ftrace_mod->func); + ftrace_mod->func = func; + } + + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { + + func = ftrace_mod->func; + + /* Grabs ftrace_lock, which is why we have this extra step */ + match_records(new_hash, func, strlen(func), mod); + free_ftrace_mod(ftrace_mod); + } + + mutex_lock(&ftrace_lock); + + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + new_hash, enable); + mutex_unlock(&ftrace_lock); + + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(new_hash); +} + +static void process_cached_mods(const char *mod_name) +{ + struct trace_array *tr; + char *mod; + + mod = kstrdup(mod_name, GFP_KERNEL); + if (!mod) + return; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!list_empty(&tr->mod_trace)) + process_mod_list(&tr->mod_trace, tr->ops, mod, true); + if (!list_empty(&tr->mod_notrace)) + process_mod_list(&tr->mod_notrace, tr->ops, mod, false); + } + mutex_unlock(&trace_types_lock); + + kfree(mod); +} + /* * We register the module command as a template to show others how * to register the a command as well. @@ -5682,6 +5773,8 @@ void ftrace_module_enable(struct module *mod) out_unlock: mutex_unlock(&ftrace_lock); + + process_cached_mods(mod->name); } void ftrace_module_init(struct module *mod) -- cgit v1.3-14-g43fede From 8c08f0d5c6fb10ff93ffb1cbf416f4f1c3a52a80 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 26 Jun 2017 11:47:31 -0400 Subject: ftrace: Have cached module filters be an active filter When a module filter is added to set_ftrace_filter, if the module is not loaded, it is cached. This should be considered an active filter, and function tracing should be filtered by this. That is, if a cached module filter is the only filter set, then no function tracing should be happening, as all the functions available will be filtered out. This makes sense, as the reason to add a cached module filter, is to trace the module when you load it. There shouldn't be any other tracing happening until then. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 20 +++++++++++++++----- kernel/trace/trace.h | 7 ++++++- 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9fb9a67dc9d4..5857390ac35a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -120,6 +120,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * this ops will fail to register or set_filter_ip. * PID - Is affected by set_ftrace_pid (allows filtering on those pids) * RCU - Set when the ops can only be called when RCU is watching. + * TRACE_ARRAY - The ops->private points to a trace_array descriptor. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -138,6 +139,7 @@ enum { FTRACE_OPS_FL_IPMODIFY = 1 << 13, FTRACE_OPS_FL_PID = 1 << 14, FTRACE_OPS_FL_RCU = 1 << 15, + FTRACE_OPS_FL_TRACE_ARRAY = 1 << 16, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1ccf8be9df7..914539e3e301 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1410,6 +1410,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) if (!new_hash) return NULL; + if (hash) + new_hash->flags = hash->flags; + /* Empty hash? */ if (ftrace_hash_empty(hash)) return new_hash; @@ -1454,7 +1457,7 @@ __ftrace_hash_move(struct ftrace_hash *src) /* * If the new source is empty, just return the empty_hash. */ - if (!src->count) + if (ftrace_hash_empty(src)) return EMPTY_HASH; /* @@ -1471,6 +1474,8 @@ __ftrace_hash_move(struct ftrace_hash *src) if (!new_hash) return NULL; + new_hash->flags = src->flags; + size = 1 << src->size_bits; for (i = 0; i < size; i++) { hhd = &src->buckets[i]; @@ -1701,7 +1706,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, struct dyn_ftrace *rec; bool update = false; int count = 0; - int all = 0; + int all = false; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) @@ -1722,7 +1727,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, hash = ops->func_hash->filter_hash; other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) - all = 1; + all = true; } else { inc = !inc; hash = ops->func_hash->notrace_hash; @@ -4028,6 +4033,9 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, free_ftrace_mod(ftrace_mod); } + if (enable && list_empty(head)) + new_hash->flags &= ~FTRACE_HASH_FL_MOD; + mutex_lock(&ftrace_lock); ret = ftrace_hash_move_and_update_ops(ops, orig_hash, @@ -5035,9 +5043,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); - if (filter_hash) + if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - else + if (!list_empty(&iter->tr->mod_trace)) + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } else orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d63550cdbdfa..13823951e42b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,10 +773,15 @@ struct ftrace_mod_load { int enable; }; +enum { + FTRACE_HASH_FL_MOD = (1 << 0), +}; + struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; + unsigned long flags; struct rcu_head rcu; }; @@ -785,7 +790,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { - return !hash || !hash->count; + return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); } /* Standard output formatting function used for function return traces */ -- cgit v1.3-14-g43fede From 131b63515932d18a3b1a60db3958f3c0dd5462bc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 23 Feb 2017 09:24:24 -0800 Subject: seccomp: Clean up core dump logic This just cleans up the core dumping logic to avoid the braces around the RET_KILL case. Signed-off-by: Kees Cook --- kernel/seccomp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 65f61077ad50..fce83885b7ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_KILL: - default: { - siginfo_t info; + default: audit_seccomp(this_syscall, SIGSYS, action); /* Dump core only if this is the last remaining thread. */ if (get_nr_threads(current) == 1) { + siginfo_t info; + /* Show the original registers in the dump. */ syscall_rollback(current, task_pt_regs(current)); /* Trigger a manual coredump since do_exit skips it. */ @@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } do_exit(SIGSYS); } - } unreachable(); -- cgit v1.3-14-g43fede From 0b5fa2290637a3235898d18dc0e7a136783f1bd2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 26 Jun 2017 09:24:00 -0700 Subject: seccomp: Switch from atomic_t to recount_t This switches the seccomp usage tracking from atomic_t to refcount_t to gain refcount overflow protections. Cc: Elena Reshetova Cc: David Windsor Cc: Hans Liljestrand Signed-off-by: Kees Cook --- kernel/seccomp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fce83885b7ef..98b59b5db90b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,7 +13,7 @@ * of Berkeley Packet Filters/Linux Socket Filters. */ -#include +#include #include #include #include @@ -56,7 +56,7 @@ * to a task_struct (other than @usage). */ struct seccomp_filter { - atomic_t usage; + refcount_t usage; struct seccomp_filter *prev; struct bpf_prog *prog; }; @@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - atomic_set(&sfilter->usage, 1); + refcount_set(&sfilter->usage, 1); return sfilter; } @@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; /* Reference count is bounded by the number of total processes. */ - atomic_inc(&orig->usage); + refcount_inc(&orig->usage); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ - while (orig && atomic_dec_and_test(&orig->usage)) { + while (orig && refcount_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); -- cgit v1.3-14-g43fede From 49368a47f6dc1b256e3b83813da5c9b0731fe268 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sat, 3 Jun 2017 20:52:32 +1000 Subject: PM / hibernate: Use CONFIG_HAVE_SET_MEMORY for include condition Kbuild reported a build failure when CONFIG_STRICT_KERNEL_RWX was enabled on powerpc. We don't yet have ARCH_HAS_SET_MEMORY and ppc32 saw a build failure. I've only done a basic compile test with a config that has hibernation enabled. Fixes: 50327ddfbc92 (kernel/power/snapshot.c: use set_memory.h header) Reported-by: Christophe Leroy Signed-off-by: Balbir Singh Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fa46606f3356..71730d672290 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,13 +36,13 @@ #include #include #include -#ifdef CONFIG_STRICT_KERNEL_RWX +#ifdef CONFIG_ARCH_HAS_SET_MEMORY #include #endif #include "power.h" -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) static bool hibernate_restore_protection; static bool hibernate_restore_protection_active; @@ -77,7 +77,7 @@ static inline void hibernate_restore_protection_begin(void) {} static inline void hibernate_restore_protection_end(void) {} static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} -#endif /* CONFIG_STRICT_KERNEL_RWX */ +#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); -- cgit v1.3-14-g43fede From eba74c294467d55c697e2199c37dfaf8126fe396 Mon Sep 17 00:00:00 2001 From: BaoJun Luo Date: Tue, 27 Jun 2017 02:10:44 +0200 Subject: PM / hibernate: Drop redundant parameter of swsusp_alloc() The first parameter of swsusp_alloc is not used, so drop it. Signed-off-by: BaoJun Luo [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 71730d672290..b7708e319941 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1929,8 +1929,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, * also be located in the high memory, because of the way in which * copy_data_pages() works. */ -static int swsusp_alloc(struct memory_bitmap *orig_bm, - struct memory_bitmap *copy_bm, +static int swsusp_alloc(struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { if (nr_highmem > 0) { @@ -1976,7 +1975,7 @@ asmlinkage __visible int swsusp_save(void) return -ENOMEM; } - if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { + if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { printk(KERN_ERR "PM: Memory allocation failed\n"); return -ENOMEM; } -- cgit v1.3-14-g43fede From 6a9c981b1e9657ca5866d10aa38b8a4fe1159138 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:02:49 -0400 Subject: ftrace: Remove unused function ftrace_arch_read_dyn_info() ftrace_arch_read_dyn_info() was used so that archs could add its own debug information into the dyn_ftrace_total_info in the tracefs file system. That file is for debugging usage of dynamic ftrace. No arch uses that function anymore, so just get rid of it. This also allows for tracing_read_dyn_info() to be cleaned up a bit. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 19ac2088d10a..14318ce92b13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6737,33 +6737,18 @@ static const struct file_operations tracing_stats_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -int __weak ftrace_arch_read_dyn_info(char *buf, int size) -{ - return 0; -} - static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - static char ftrace_dyn_info_buffer[1024]; - static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; - char *buf = ftrace_dyn_info_buffer; - int size = ARRAY_SIZE(ftrace_dyn_info_buffer); + char buf[64]; /* Not too big for a shallow stack */ int r; - mutex_lock(&dyn_info_mutex); - r = sprintf(buf, "%ld ", *p); - - r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); + r = scnprintf(buf, 63, "%ld", *p); buf[r++] = '\n'; - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - mutex_unlock(&dyn_info_mutex); - - return r; + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } static const struct file_operations tracing_dyn_info_fops = { -- cgit v1.3-14-g43fede From 83dd14933e33a45e9b366c572e15505982b46845 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Jun 2017 11:04:40 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info file The dyn_ftrace_total_info file is used to show how many functions have been converted into nops and can be used by ftrace. The problem is that it does not get decremented when functions are removed (init boot code being freed, and modules being freed). That means the number is very inaccurate everytime functions are removed from the ftrace tables. Decrement it when functions are removed. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 914539e3e301..7509ef9810bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5705,6 +5705,7 @@ void ftrace_release_mod(struct module *mod) if (pg == ftrace_pages) ftrace_pages = next_to_ftrace_page(last_pg); + ftrace_update_tot_cnt -= pg->index; *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); -- cgit v1.3-14-g43fede From d914ba37d7145acb9fd3bb23075c2d56e5a44eb6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 26 Jun 2017 19:01:55 -0700 Subject: tracing: Add support for recording tgid of tasks Inorder to support recording of tgid, the following changes are made: * Introduce a new API (tracing_record_taskinfo) to additionally record the tgid along with the task's comm at the same time. This has has the benefit of not setting trace_cmdline_save before all the information for a task is saved. * Add a new API tracing_record_taskinfo_sched_switch to record task information for 2 tasks at a time (previous and next) and use it from sched_switch probe. * Preserve the old API (tracing_record_cmdline) and create it as a wrapper around the new one so that existing callers aren't affected. * Reuse the existing sched_switch and sched_wakeup probes to record tgid information and add a new option 'record-tgid' to enable recording of tgid When record-tgid option isn't enabled to being with, we take care to make sure that there's isn't memory or runtime overhead. Link: http://lkml.kernel.org/r/20170627020155.5139-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 13 ++++- kernel/trace/trace.c | 105 ++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 7 +++ kernel/trace/trace_events.c | 42 ++++++++++++++- kernel/trace/trace_sched_switch.c | 72 +++++++++++++++++++++----- 5 files changed, 213 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index a556805eff8a..f73cedfa2e0b 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -151,7 +151,15 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); +#define TRACE_RECORD_CMDLINE BIT(0) +#define TRACE_RECORD_TGID BIT(1) + +void tracing_record_taskinfo(struct task_struct *task, int flags); +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags); + +void tracing_record_cmdline(struct task_struct *task); +void tracing_record_tgid(struct task_struct *task); int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); @@ -290,6 +298,7 @@ struct trace_subsystem_dir; enum { EVENT_FILE_FL_ENABLED_BIT, EVENT_FILE_FL_RECORDED_CMD_BIT, + EVENT_FILE_FL_RECORDED_TGID_BIT, EVENT_FILE_FL_FILTERED_BIT, EVENT_FILE_FL_NO_SET_FILTER_BIT, EVENT_FILE_FL_SOFT_MODE_BIT, @@ -303,6 +312,7 @@ enum { * Event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch + * RECORDED_TGID - The tgids should be recorded at sched_switch * FILTERED - The event has a filter attached * NO_SET_FILTER - Set when filter has error and is to be ignored * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED @@ -315,6 +325,7 @@ enum { enum { EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT), EVENT_FILE_FL_RECORDED_CMD = (1 << EVENT_FILE_FL_RECORDED_CMD_BIT), + EVENT_FILE_FL_RECORDED_TGID = (1 << EVENT_FILE_FL_RECORDED_TGID_BIT), EVENT_FILE_FL_FILTERED = (1 << EVENT_FILE_FL_FILTERED_BIT), EVENT_FILE_FL_NO_SET_FILTER = (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT), EVENT_FILE_FL_SOFT_MODE = (1 << EVENT_FILE_FL_SOFT_MODE_BIT), diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 14318ce92b13..ab9db750dd29 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_cmdline_save); +static DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); static __always_inline void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { - __this_cpu_write(trace_cmdline_save, true); + __this_cpu_write(trace_taskinfo_save, true); /* If this is the temp buffer, we need to commit fully */ if (this_cpu_read(trace_buffered_event) == event) { @@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) } } +static int *tgid_map; + #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer { static struct saved_cmdlines_buffer *savedcmd; /* temporary disable recording */ -static atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_taskinfo_disabled __read_mostly; static inline char *get_saved_cmdlines(int idx) { @@ -1990,16 +1992,87 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -void tracing_record_cmdline(struct task_struct *tsk) +int trace_find_tgid(int pid) +{ + if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) + return 0; + + return tgid_map[pid]; +} + +static int trace_save_tgid(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + return 0; + + tgid_map[tsk->pid] = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task - task to record + * @flags - TRACE_RECORD_CMDLINE for recording comm + * - TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) + return; + if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) + return; + if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) return; - if (!__this_cpu_read(trace_cmdline_save)) + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev - previous task during sched_switch + * @next - next task during sched_switch + * @flags - TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + if (tracing_record_taskinfo_skip(flags)) return; - if (trace_save_cmdline(tsk)) - __this_cpu_write(trace_cmdline_save, false); + if ((flags & TRACE_RECORD_CMDLINE) && + (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) + return; + + if ((flags & TRACE_RECORD_TGID) && + (!trace_save_tgid(prev) || !trace_save_tgid(next))) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); } /* @@ -3144,7 +3217,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) #endif if (!iter->snapshot) - atomic_inc(&trace_record_cmdline_disabled); + atomic_inc(&trace_record_taskinfo_disabled); if (*pos != iter->pos) { iter->ent = NULL; @@ -3189,7 +3262,7 @@ static void s_stop(struct seq_file *m, void *p) #endif if (!iter->snapshot) - atomic_dec(&trace_record_cmdline_disabled); + atomic_dec(&trace_record_taskinfo_disabled); trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); @@ -4236,6 +4309,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_RECORD_TGID) { + if (!tgid_map) + tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + GFP_KERNEL); + if (!tgid_map) { + tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + } + if (mask == TRACE_ITER_EVENT_FORK) trace_event_follow_fork(tr, enabled); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13823951e42b..6ade1c55cc3a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -640,6 +640,9 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); +void tracing_start_tgid_record(void); +void tracing_stop_tgid_record(void); + int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -700,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE @@ -1124,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ + C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ @@ -1440,6 +1445,8 @@ struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); +extern void trace_event_enable_tgid_record(bool enable); + extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 83dfd0dbbbfe..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) mutex_unlock(&event_mutex); } +void trace_event_enable_tgid_record(bool enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + mutex_lock(&event_mutex); + do_for_each_event_file(tr, file) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } else { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, + &file->flags); + } + } while_for_each_event_file(); + mutex_unlock(&event_mutex); +} + static int __ftrace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable) { @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, tracing_stop_cmdline_record(); clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + call->class->reg(call, TRACE_REG_UNREGISTER, file); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + bool cmd = false, tgid = false; /* Keep the event disabled, when going to SOFT_MODE. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + cmd = true; tracing_start_cmdline_record(); set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); } + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + tgid = true; + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { - tracing_stop_cmdline_record(); + if (cmd) + tracing_stop_cmdline_record(); + if (tgid) + tracing_stop_tgid_record(); pr_info("event trace: Could not enable event " "%s\n", trace_event_name(call)); break; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -12,27 +12,38 @@ #include "trace.h" -static int sched_ref; +#define RECORD_CMDLINE 1 +#define RECORD_TGID 2 + +static int sched_cmdline_ref; +static int sched_tgid_ref; static DEFINE_MUTEX(sched_register_mutex); static void probe_sched_switch(void *ignore, bool preempt, struct task_struct *prev, struct task_struct *next) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(prev); - tracing_record_cmdline(next); + if (!flags) + return; + tracing_record_taskinfo_sched_switch(prev, next, flags); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee) { - if (unlikely(!sched_ref)) - return; + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); - tracing_record_cmdline(current); + if (!flags) + return; + tracing_record_taskinfo(current, flags); } static int tracing_sched_register(void) @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } -static void tracing_start_sched_switch(void) +static void tracing_start_sched_switch(int ops) { + bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); mutex_lock(&sched_register_mutex); - if (!(sched_ref++)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref++; + break; + + case RECORD_TGID: + sched_tgid_ref++; + break; + } + + if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) tracing_sched_register(); mutex_unlock(&sched_register_mutex); } -static void tracing_stop_sched_switch(void) +static void tracing_stop_sched_switch(int ops) { mutex_lock(&sched_register_mutex); - if (!(--sched_ref)) + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref--; + break; + + case RECORD_TGID: + sched_tgid_ref--; + break; + } + + if (!sched_cmdline_ref && !sched_tgid_ref) tracing_sched_unregister(); mutex_unlock(&sched_register_mutex); } void tracing_start_cmdline_record(void) { - tracing_start_sched_switch(); + tracing_start_sched_switch(RECORD_CMDLINE); } void tracing_stop_cmdline_record(void) { - tracing_stop_sched_switch(); + tracing_stop_sched_switch(RECORD_CMDLINE); +} + +void tracing_start_tgid_record(void) +{ + tracing_start_sched_switch(RECORD_TGID); +} + +void tracing_stop_tgid_record(void) +{ + tracing_stop_sched_switch(RECORD_TGID); } -- cgit v1.3-14-g43fede From 441dae8f2f2975c68101a84bc3f528ec95ecf7c3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 25 Jun 2017 22:38:43 -0700 Subject: tracing: Add support for display of tgid in trace output Earlier patches introduced ability to record the tgid using the 'record-tgid' option. Here we read the tgid and output it if the option is enabled. Link: http://lkml.kernel.org/r/20170626053844.5746-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Tested-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 36 ++++++++++++++++++++++-------------- kernel/trace/trace_output.c | 9 +++++++++ 2 files changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab9db750dd29..c579dea4a0eb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3319,23 +3319,29 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { + bool tgid = flags & TRACE_ITER_RECORD_TGID; + print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" - "# | | | | |\n"); + + seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); } -static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, + unsigned int flags) { - print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n" - "# / _----=> need-resched\n" - "# | / _---=> hardirq/softirq\n" - "# || / _--=> preempt-depth\n" - "# ||| / delay\n" - "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" - "# | | | |||| | |\n"); + bool tgid = flags & TRACE_ITER_RECORD_TGID; + + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); + seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); + seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); } void @@ -3651,9 +3657,11 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + print_func_help_header_irq(iter->trace_buffer, + m, trace_flags); else - print_func_help_header(iter->trace_buffer, m); + print_func_help_header(iter->trace_buffer, m, + trace_flags); } } } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 01ff99969ca7..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -597,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + unsigned int tgid = trace_find_tgid(entry->pid); + + if (!tgid) + trace_seq_printf(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); -- cgit v1.3-14-g43fede From 93437353daeff31bd5b11810daa4d2d509d1a64e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 26 May 2017 14:12:25 -0700 Subject: module: use list_for_each_entry_rcu() on find_module_all() The module list has been using RCU in a lot of other calls for a while now, we just overlooked changing this one over to use RCU. Signed-off-by: Luis R. Rodriguez Signed-off-by: Jessica Yu --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f546d574f436..afc6ede7bcdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -603,7 +603,7 @@ static struct module *find_module_all(const char *name, size_t len, module_assert_mutex_or_preempt(); - list_for_each_entry(mod, &modules, list) { + list_for_each_entry_rcu(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) continue; if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) -- cgit v1.3-14-g43fede From 165d1cc0074b2f938586274776d029b9bce914c4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 23 Jun 2017 12:19:12 -0700 Subject: kmod: reduce atomic operations on kmod_concurrent and simplify When checking if we want to allow a kmod thread to kick off we increment, then read to see if we should enable a thread. If we were over the allowed limit limit we decrement. Splitting the increment far apart from decrement means there could be a time where two increments happen potentially giving a false failure on a thread which should have been allowed. CPU1 CPU2 atomic_inc() atomic_inc() atomic_read() atomic_read() atomic_dec() atomic_dec() In this case a read on CPU1 gets the atomic_inc()'s and we could negate it from getting a kmod thread. We could try to prevent this with a lock or preemption but that is overkill. We can fix by reducing the number of atomic operations. We do this by inverting the logic of of the enabler, instead of incrementing kmod_concurrent as we get new kmod users, define the variable kmod_concurrent_max as the max number of currently allowed kmod users and as we get new kmod users just decrement it if its still positive. This combines the dec and read in one atomic operation. In this case we no longer get the same false failure: CPU1 CPU2 atomic_dec_if_positive() atomic_dec_if_positive() atomic_inc() atomic_inc() The number of threads is computed at init, and since the current computation of kmod_concurrent includes the thread count we can avoid setting kmod_concurrent_max later in boot through an init call by simply sticking to 50 as the kmod_concurrent_max. The assumption here is a system with modules must at least have ~16 MiB of RAM. Suggested-by: Petr Mladek Suggested-by: Dmitry Torokhov Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Signed-off-by: Jessica Yu --- kernel/kmod.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..ff68198fe83b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,8 +45,6 @@ #include -extern int max_threads; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -56,6 +54,20 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); #ifdef CONFIG_MODULES +/* + * Assuming: + * + * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, + * (u64) THREAD_SIZE * 8UL); + * + * If you need less than 50 threads would mean we're dealing with systems + * smaller than 3200 pages. This assuems you are capable of having ~13M memory, + * and this would only be an be an upper limit, after which the OOM killer + * would take effect. Systems like these are very unlikely if modules are + * enabled. + */ +#define MAX_KMOD_CONCURRENT 50 +static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); /* modprobe_path is set via /proc/sys. @@ -127,10 +139,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; - unsigned int max_modprobes; int ret; - static atomic_t kmod_concurrent = ATOMIC_INIT(0); -#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; /* @@ -154,21 +163,7 @@ int __request_module(bool wait, const char *fmt, ...) if (ret) return ret; - /* If modprobe needs a service that is in a module, we get a recursive - * loop. Limit the number of running kmod threads to max_threads/2 or - * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method - * would be to run the parents of this process, counting how many times - * kmod was invoked. That would mean accessing the internals of the - * process tables to get the command line, proc_pid_cmdline is static - * and it is not worth changing the proc code just to handle this case. - * KAO. - * - * "trace the ppid" is simple, but will fail if someone's - * parent exits. I think this is as good as it gets. --RR - */ - max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); - atomic_inc(&kmod_concurrent); - if (atomic_read(&kmod_concurrent) > max_modprobes) { + if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { /* We may be blaming an innocent here, but unlikely */ if (kmod_loop_msg < 5) { printk(KERN_ERR @@ -176,7 +171,6 @@ int __request_module(bool wait, const char *fmt, ...) module_name); kmod_loop_msg++; } - atomic_dec(&kmod_concurrent); return -ENOMEM; } @@ -184,10 +178,12 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); - atomic_dec(&kmod_concurrent); + atomic_inc(&kmod_concurrent_max); + return ret; } EXPORT_SYMBOL(__request_module); + #endif /* CONFIG_MODULES */ static void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.3-14-g43fede From 3b58a3c72f484393c65995a551902945f5a18c70 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 09:09:38 -0400 Subject: ftrace: Unlock hash mutex on failed allocation in process_mod_list() If the new_hash fails to allocate, then unlock the hash mutex on error. Reported-by: Julia Lawall Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7509ef9810bf..2c79630cd267 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3997,7 +3997,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!new_hash) - return; /* Warn? */ + goto out; /* warn? */ mutex_lock(&ftrace_lock); @@ -4042,6 +4042,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, new_hash, enable); mutex_unlock(&ftrace_lock); + out: mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(new_hash); -- cgit v1.3-14-g43fede From 4ec78467858739c0119569c0610676aa50dfa8fb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Jun 2017 11:57:03 -0400 Subject: ftrace: Decrement count for dyn_ftrace_total_info for init functions Init boot up functions may be traced, but they are also freed when the kernel finishes booting. These are removed from the ftrace tables, and the debug variable for dyn_ftrace_total_info needs to reflect that as well. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c79630cd267..e392f750a1cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5825,6 +5825,7 @@ void __init ftrace_free_init_mem(void) if (!rec) continue; pg->index--; + ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); -- cgit v1.3-14-g43fede From 824ecbe01c5d833b8c8a371c209e3ac3a76cd18a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 25 Jun 2017 00:27:59 -0400 Subject: cgroup: restructure cgroup_procs_write_permission() Restructure cgroup_procs_write_permission() to make extending permission logic easier. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 57 +++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dbfd7028b1c6..d48069ee84c2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2363,27 +2363,12 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) { - int ret = 0; - - if (cgroup_on_dfl(dst_cgrp)) { - struct super_block *sb = of->file->f_path.dentry->d_sb; - struct cgroup *cgrp; - struct inode *inode; - - spin_lock_irq(&css_set_lock); - cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_irq(&css_set_lock); - - while (!cgroup_is_descendant(dst_cgrp, cgrp)) - cgrp = cgroup_parent(cgrp); + struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup *src_cgrp, *com_cgrp; + struct inode *inode; + int ret; - ret = -ENOMEM; - inode = kernfs_get_inode(sb, cgrp->procs_file.kn); - if (inode) { - ret = inode_permission(inode, MAY_WRITE); - iput(inode); - } - } else { + if (!cgroup_on_dfl(dst_cgrp)) { const struct cred *cred = current_cred(); const struct cred *tcred = get_task_cred(task); @@ -2391,14 +2376,38 @@ static int cgroup_procs_write_permission(struct task_struct *task, * even if we're attaching all tasks in the thread group, * we only need to check permissions on one of them. */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) + if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->euid, tcred->suid)) + ret = 0; + else ret = -EACCES; + put_cred(tcred); + return ret; } - return ret; + /* find the source cgroup */ + spin_lock_irq(&css_set_lock); + src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); + spin_unlock_irq(&css_set_lock); + + /* and the common ancestor */ + com_cgrp = src_cgrp; + while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) + com_cgrp = cgroup_parent(com_cgrp); + + /* %current should be authorized to migrate to the common ancestor */ + inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + if (ret) + return ret; + + return 0; } /* -- cgit v1.3-14-g43fede From 5136f6365ce3eace5a926e10f16ed2a233db5ba9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Jun 2017 14:30:28 -0400 Subject: cgroup: implement "nsdelegate" mount option Currently, cgroup only supports delegation to !root users and cgroup namespaces don't get any special treatments. This limits the usefulness of cgroup namespaces as they by themselves can't be safe delegation boundaries. A process inside a cgroup can change the resource control knobs of the parent in the namespace root and may move processes in and out of the namespace if cgroups outside its namespace are visible somehow. This patch adds a new mount option "nsdelegate" which makes cgroup namespaces delegation boundaries. If set, cgroup behaves as if write permission based delegation took place at namespace boundaries - writes to the resource control knobs from the namespace root are denied and migration crossing the namespace boundary aren't allowed from inside the namespace. This allows cgroup namespace to function as a delegation boundary by itself. v2: Silently ignore nsdelegate specified on !init mounts. Signed-off-by: Tejun Heo Cc: Aravind Anbudurai Cc: Serge Hallyn Cc: Eric Biederman --- Documentation/cgroup-v2.txt | 61 +++++++++++++++++++++---------- include/linux/cgroup-defs.h | 9 +++++ kernel/cgroup/cgroup.c | 88 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 135 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 0260ed053efd..558c3a739baf 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing and experimenting easier, the kernel parameter cgroup_no_v1= allows disabling controllers in v1 and make them always available in v2. +cgroup v2 currently supports the following mount options. + + nsdelegate + + Consider cgroup namespaces as delegation boundaries. This + option is system wide and can only be set on mount or modified + through remount from the init namespace. The mount option is + ignored on non-init namespace mounts. Please refer to the + Delegation section for details. + 2-2. Organizing Processes @@ -308,19 +318,27 @@ file. 2-5-1. Model of Delegation -A cgroup can be delegated to a less privileged user by granting write -access of the directory and its "cgroup.procs" and -"cgroup.subtree_control" files to the user. Note that resource -control interface files in a given directory control the distribution -of the parent's resources and thus must not be delegated along with -the directory. - -Once delegated, the user can build sub-hierarchy under the directory, -organize processes as it sees fit and further distribute the resources -it received from the parent. The limits and other settings of all -resource controllers are hierarchical and regardless of what happens -in the delegated sub-hierarchy, nothing can escape the resource -restrictions imposed by the parent. +A cgroup can be delegated in two ways. First, to a less privileged +user by granting write access of the directory and its "cgroup.procs" +and "cgroup.subtree_control" files to the user. Second, if the +"nsdelegate" mount option is set, automatically to a cgroup namespace +on namespace creation. + +Because the resource control interface files in a given directory +control the distribution of the parent's resources, the delegatee +shouldn't be allowed to write to them. For the first method, this is +achieved by not granting access to these files. For the second, the +kernel rejects writes to all files other than "cgroup.procs" and +"cgroup.subtree_control" on a namespace root from inside the +namespace. + +The end results are equivalent for both delegation types. Once +delegated, the user can build sub-hierarchy under the directory, +organize processes inside it as it sees fit and further distribute the +resources it received from the parent. The limits and other settings +of all resource controllers are hierarchical and regardless of what +happens in the delegated sub-hierarchy, nothing can escape the +resource restrictions imposed by the parent. Currently, cgroup doesn't impose any restrictions on the number of cgroups in or nesting depth of a delegated sub-hierarchy; however, @@ -330,10 +348,12 @@ this may be limited explicitly in the future. 2-5-2. Delegation Containment A delegated sub-hierarchy is contained in the sense that processes -can't be moved into or out of the sub-hierarchy by the delegatee. For -a process with a non-root euid to migrate a target process into a -cgroup by writing its PID to the "cgroup.procs" file, the following -conditions must be met. +can't be moved into or out of the sub-hierarchy by the delegatee. + +For delegations to a less privileged user, this is achieved by +requiring the following conditions for a process with a non-root euid +to migrate a target process into a cgroup by writing its PID to the +"cgroup.procs" file. - The writer must have write access to the "cgroup.procs" file. @@ -360,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would not have write access to its "cgroup.procs" files and thus the write will be denied with -EACCES. +For delegations to namespaces, containment is achieved by requiring +that both the source and destination cgroups are reachable from the +namespace of the process which is attempting the migration. If either +is not reachable, the migration is rejected with -ENOENT. + 2-6. Guidelines @@ -1414,7 +1439,7 @@ D. Deprecated v1 Core Features - Multiple hierarchies including named ones are not supported. -- All mount options and remounting are not supported. +- All v1 mount options are not supported. - The "tasks" file is removed and "cgroup.procs" is not sorted. diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 3bc4196bf217..09f4c7df1478 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -67,12 +67,21 @@ enum { enum { CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ + + /* + * Consider namespaces as delegation boundaries. If this flag is + * set, controller specific interface files in a namespace root + * aren't writeable from inside the namespace. + */ + CGRP_ROOT_NS_DELEGATE = (1 << 3), }; /* cftype->flags */ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ + CFTYPE_NS_DELEGATABLE = (1 << 2), /* writeable beyond delegation boundaries */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d48069ee84c2..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1547,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } +static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) +{ + char *token; + + *root_flags = 0; + + if (!data) + return 0; + + while ((token = strsep(&data, ",")) != NULL) { + if (!strcmp(token, "nsdelegate")) { + *root_flags |= CGRP_ROOT_NS_DELEGATE; + continue; + } + + pr_err("cgroup2: unknown option \"%s\"\n", token); + return -EINVAL; + } + + return 0; +} + +static void apply_cgroup_root_flags(unsigned int root_flags) +{ + if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { + if (root_flags & CGRP_ROOT_NS_DELEGATE) + cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + } +} + +static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) +{ + if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) + seq_puts(seq, ",nsdelegate"); + return 0; +} + static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { - pr_err("remount is not allowed\n"); - return -EINVAL; + unsigned int root_flags; + int ret; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) + return ret; + + apply_cgroup_root_flags(root_flags); + return 0; } /* @@ -1790,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; + int ret; get_cgroup_ns(ns); @@ -1807,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { - if (data) { - pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + unsigned int root_flags; + + ret = parse_cgroup_root_flags(data, &root_flags); + if (ret) { put_cgroup_ns(ns); - return ERR_PTR(-EINVAL); + return ERR_PTR(ret); } + cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); + if (!IS_ERR(dentry)) + apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); @@ -2364,6 +2416,8 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct kernfs_open_file *of) { struct super_block *sb = of->file->f_path.dentry->d_sb; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; struct cgroup *src_cgrp, *com_cgrp; struct inode *inode; int ret; @@ -2407,6 +2461,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, if (ret) return ret; + /* + * If namespaces are delegation boundaries, %current must be able + * to see both source and destination cgroups from its namespace. + */ + if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && + (!cgroup_is_descendant(src_cgrp, root_cgrp) || + !cgroup_is_descendant(dst_cgrp, root_cgrp))) + return -ENOENT; + return 0; } @@ -2971,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; + /* + * If namespaces are delegation boundaries, disallow writes to + * files in an non-init namespace root from inside the namespace + * except for the files explicitly marked delegatable - + * cgroup.procs and cgroup.subtree_control. + */ + if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && + !(cft->flags & CFTYPE_NS_DELEGATABLE) && + ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + return -EPERM; + if (cft->write) return cft->write(of, buf, nbytes, off); @@ -3809,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", + .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, @@ -3822,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.subtree_control", + .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, @@ -4410,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, -- cgit v1.3-14-g43fede From 2287d8664fe7345ead891017eccd879fc605305e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 27 Jun 2017 18:15:38 +0200 Subject: timers: Make the cpu base lock raw The timers cpu base lock could not be converted to a raw spinlock becaue the lock held time was non-deterministic due to cascading and long lasting timer wheel traversals. The rework of the timer wheel to the new non-cascading model removed also the wheel traversals and the lock held times are deterministic now. This allows to make the lock raw and thereby unbreaks NOHz* on preempt-RT. Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/20170627161538.30257-1-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 709a404bd133..71ce3f4eead3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -195,7 +195,7 @@ EXPORT_SYMBOL(jiffies_64); #endif struct timer_base { - spinlock_t lock; + raw_spinlock_t lock; struct timer_list *running_timer; unsigned long clk; unsigned long next_expiry; @@ -913,10 +913,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); - spin_lock_irqsave(&base->lock, *flags); + raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; - spin_unlock_irqrestore(&base->lock, *flags); + raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } @@ -986,9 +986,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); base = new_base; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); } @@ -1013,7 +1013,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } out_unlock: - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -1106,16 +1106,16 @@ void add_timer_on(struct timer_list *timer, int cpu) if (base != new_base) { timer->flags |= TIMER_MIGRATING; - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); base = new_base; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } debug_activate(timer, timer->expires); internal_add_timer(base, timer); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1141,7 +1141,7 @@ int del_timer(struct timer_list *timer) if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; @@ -1168,7 +1168,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); - spin_unlock_irqrestore(&base->lock, flags); + raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } @@ -1299,13 +1299,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) data = timer->data; if (timer->flags & TIMER_IRQSAFE) { - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, data); - spin_lock(&base->lock); + raw_spin_lock(&base->lock); } else { - spin_unlock_irq(&base->lock); + raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, data); - spin_lock_irq(&base->lock); + raw_spin_lock_irq(&base->lock); } } } @@ -1474,7 +1474,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (cpu_is_offline(smp_processor_id())) return expires; - spin_lock(&base->lock); + raw_spin_lock(&base->lock); nextevt = __next_timer_interrupt(base); is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; @@ -1502,7 +1502,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if ((expires - basem) > TICK_NSEC) base->is_idle = true; } - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } @@ -1590,7 +1590,7 @@ static inline void __run_timers(struct timer_base *base) if (!time_after_eq(jiffies, base->clk)) return; - spin_lock_irq(&base->lock); + raw_spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk)) { @@ -1601,7 +1601,7 @@ static inline void __run_timers(struct timer_base *base) expire_timers(base, heads + levels); } base->running_timer = NULL; - spin_unlock_irq(&base->lock); + raw_spin_unlock_irq(&base->lock); } /* @@ -1786,16 +1786,16 @@ int timers_dead_cpu(unsigned int cpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock_irq(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_irq(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); - spin_unlock(&old_base->lock); - spin_unlock_irq(&new_base->lock); + raw_spin_unlock(&old_base->lock); + raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; @@ -1811,7 +1811,7 @@ static void __init init_timer_cpu(int cpu) for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; - spin_lock_init(&base->lock); + raw_spin_lock_init(&base->lock); base->clk = jiffies; } } -- cgit v1.3-14-g43fede From ff801b716effd652f420204eddb36f6e4a716819 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 08:25:52 +0200 Subject: sched/numa: Hide numa_wake_affine() from UP build Stephen reported the following build warning in UP: kernel/sched/fair.c:2657:9: warning: 'struct sched_domain' declared inside parameter list ^ /home/sfr/next/next/kernel/sched/fair.c:2657:9: warning: its scope is only this definition or declaration, which is probably not what you want Hide the numa_wake_affine() inline stub on UP builds to get rid of it. Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6f4f155adf5f..008c514dc241 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2652,12 +2652,14 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +#ifdef CONFIG_SMP static inline bool numa_wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { return true; } +#endif /* !SMP */ #endif /* CONFIG_NUMA_BALANCING */ static void -- cgit v1.3-14-g43fede From 96b5b19459b3c2aed2872bac42cbe19edfae710f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 28 Jun 2017 18:32:31 -0700 Subject: module: make the modinfo name const This can be accomplished by making blacklisted() also accept const. Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook [jeyu: fix typo] Signed-off-by: Jessica Yu --- kernel/module.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index afc6ede7bcdf..d07287707557 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -302,7 +302,7 @@ int unregister_module_notifier(struct notifier_block *nb) EXPORT_SYMBOL(unregister_module_notifier); struct load_info { - char *name; + const char *name; Elf_Ehdr *hdr; unsigned long len; Elf_Shdr *sechdrs; @@ -3265,7 +3265,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, /* module_blacklist is a comma-separated list of module names */ static char *module_blacklist; -static bool blacklisted(char *module_name) +static bool blacklisted(const char *module_name) { const char *p; size_t len; -- cgit v1.3-14-g43fede From 14dc6f04f49dc12614d7e90928b495b8d73cd471 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 27 Jun 2017 23:08:34 -0700 Subject: bpf: Add syscall lookup support for fd array and htab This patch allows userspace to do BPF_MAP_LOOKUP_ELEM on BPF_MAP_TYPE_PROG_ARRAY, BPF_MAP_TYPE_ARRAY_OF_MAPS and BPF_MAP_TYPE_HASH_OF_MAPS. The lookup returns a prog-id or map-id to the userspace. The userspace can then use the BPF_PROG_GET_FD_BY_ID or BPF_MAP_GET_FD_BY_ID to get a fd. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 27 +++++++++++++++++++++++++++ kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ kernel/bpf/map_in_map.c | 5 +++++ kernel/bpf/map_in_map.h | 1 + kernel/bpf/syscall.c | 16 +++++++++++++--- 6 files changed, 70 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index deca4e7f2845..5175729270d7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,6 +36,7 @@ struct bpf_map_ops { int fd); void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + u32 (*map_fd_sys_lookup_elem)(void *ptr); }; struct bpf_map { @@ -288,9 +289,11 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); void bpf_fd_array_map_clear(struct bpf_map *map); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ecb43542246e..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -334,6 +334,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* only called from syscall */ +int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **elem, *ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + elem = array_map_lookup_elem(map, key); + if (elem && (ptr = READ_ONCE(*elem))) + *value = map->ops->map_fd_sys_lookup_elem(ptr); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) bpf_prog_put(ptr); } +static u32 prog_fd_array_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_prog *)ptr)->aux->id; +} + /* decrement refcnt of all bpf_progs that are stored in this map */ void bpf_fd_array_map_clear(struct bpf_map *map) { @@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = prog_fd_array_get_ptr, .map_fd_put_ptr = prog_fd_array_put_ptr, + .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, @@ -585,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { .map_delete_elem = fd_array_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 004334ea13ba..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1243,6 +1243,26 @@ static void fd_htab_map_free(struct bpf_map *map) htab_map_free(map); } +/* only called from syscall */ +int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +{ + void **ptr; + int ret = 0; + + if (!map->ops->map_fd_sys_lookup_elem) + return -ENOTSUPP; + + rcu_read_lock(); + ptr = htab_map_lookup_elem(map, key); + if (ptr) + *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + /* only called from syscall */ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags) @@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, + .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 59bcdf821ae4..1da574612bea 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr) */ bpf_map_put(ptr); } + +u32 bpf_map_fd_sys_lookup_elem(void *ptr) +{ + return ((struct bpf_map *)ptr)->id; +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 177fadb689dc..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); void bpf_map_fd_put_ptr(void *ptr); +u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8942c820d620..4409ccca8831 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -24,6 +24,13 @@ #include #include +#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ + (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) +#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -411,6 +418,8 @@ static int map_lookup_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else if (IS_FD_MAP(map)) + value_size = sizeof(u32); else value_size = map->value_size; @@ -426,9 +435,10 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - err = -ENOTSUPP; + } else if (IS_FD_ARRAY(map)) { + err = bpf_fd_array_map_lookup_elem(map, key, value); + } else if (IS_FD_HASH(map)) { + err = bpf_fd_htab_map_lookup_elem(map, key, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); -- cgit v1.3-14-g43fede From 8007e40a24e12d35189203370268c7278f29ab74 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 28 Jun 2017 10:41:24 -0700 Subject: bpf: Fix out-of-bound access on interpreters[] The index is off-by-one when fp->aux->stack_depth has already been rounded up to 32. In particular, if stack_depth is 512, the index will be 16. The fix is to round_up and then takes -1 instead of round_down. [ 22.318680] ================================================================== [ 22.319745] BUG: KASAN: global-out-of-bounds in bpf_prog_select_runtime+0x48a/0x670 [ 22.320737] Read of size 8 at addr ffffffff82aadae0 by task sockex3/1946 [ 22.321646] [ 22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: G W 4.12.0-rc6-01680-g2ee87db3a287 #22 [ 22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014 [ 22.324260] Call Trace: [ 22.324612] dump_stack+0x67/0x99 [ 22.325081] print_address_description+0x1e8/0x290 [ 22.325734] ? bpf_prog_select_runtime+0x48a/0x670 [ 22.326360] kasan_report+0x265/0x350 [ 22.326860] __asan_report_load8_noabort+0x19/0x20 [ 22.327484] bpf_prog_select_runtime+0x48a/0x670 [ 22.328109] bpf_prog_load+0x626/0xd40 [ 22.328637] ? __bpf_prog_charge+0xc0/0xc0 [ 22.329222] ? check_nnp_nosuid.isra.61+0x100/0x100 [ 22.329890] ? __might_fault+0xf6/0x1b0 [ 22.330446] ? lock_acquire+0x360/0x360 [ 22.331013] SyS_bpf+0x67c/0x24d0 [ 22.331491] ? trace_hardirqs_on+0xd/0x10 [ 22.332049] ? __getnstimeofday64+0xaf/0x1c0 [ 22.332635] ? bpf_prog_get+0x20/0x20 [ 22.333135] ? __audit_syscall_entry+0x300/0x600 [ 22.333770] ? syscall_trace_enter+0x540/0xdd0 [ 22.334339] ? exit_to_usermode_loop+0xe0/0xe0 [ 22.334950] ? do_syscall_64+0x48/0x410 [ 22.335446] ? bpf_prog_get+0x20/0x20 [ 22.335954] do_syscall_64+0x181/0x410 [ 22.336454] entry_SYSCALL64_slow_path+0x25/0x25 [ 22.337121] RIP: 0033:0x7f263fe81f19 [ 22.337618] RSP: 002b:00007ffd9a3440c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 22.338619] RAX: ffffffffffffffda RBX: 0000000000aac5fb RCX: 00007f263fe81f19 [ 22.339600] RDX: 0000000000000030 RSI: 00007ffd9a3440d0 RDI: 0000000000000005 [ 22.340470] RBP: 0000000000a9a1e0 R08: 0000000000a9a1e0 R09: 0000009d00000001 [ 22.341430] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000010000 [ 22.342411] R13: 0000000000a9a023 R14: 0000000000000001 R15: 0000000000000003 [ 22.343369] [ 22.343593] The buggy address belongs to the variable: [ 22.344241] interpreters+0x80/0x980 [ 22.344708] [ 22.344908] Memory state around the buggy address: [ 22.345556] ffffffff82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa fa [ 22.346449] ffffffff82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 00 [ 22.347361] >ffffffff82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa [ 22.348301] ^ [ 22.349142] ffffffff82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00 [ 22.350058] ffffffff82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa fa [ 22.350984] ================================================================== Fixes: b870aa901f4b ("bpf: use different interpreter depending on required stack size") Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 774069ca18a7..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { - fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32]; + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.3-14-g43fede From 59494fe2c89e46e85d83de9bc45dd1d528955c49 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 16:58:40 +0530 Subject: PM: hibernate: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 6332 488 308 7128 1bd8 kernel/power/hibernate.o File size After adding 'const': text data bss dec hex filename 6396 424 308 7128 1bd8 kernel/power/hibernate.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a8b978c35a6a..e1914c7b85b1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1108,7 +1108,7 @@ static struct attribute * g[] = { }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.3-14-g43fede From a9bd8dfa539493db265e46a496c1a89279ab31d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:39:01 -0400 Subject: kimage_file_prepare_segments(): don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/kexec_file.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..766e7e4d3ad9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -162,16 +162,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } if (cmdline_len) { - image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); - if (!image->cmdline_buf) { - ret = -ENOMEM; - goto out; - } - - ret = copy_from_user(image->cmdline_buf, cmdline_ptr, - cmdline_len); - if (ret) { - ret = -EFAULT; + image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); + if (IS_ERR(image->cmdline_buf)) { + ret = PTR_ERR(image->cmdline_buf); + image->cmdline_buf = NULL; goto out; } -- cgit v1.3-14-g43fede From e4448ed87ccdbacb74871736f63220642242b32f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 May 2017 18:43:00 -0400 Subject: bpf: don't open-code memdup_user() Signed-off-by: Al Viro --- kernel/bpf/syscall.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..4b8b10bddfde 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,14 +322,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -402,14 +399,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -488,14 +482,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } preempt_disable(); __this_cpu_inc(bpf_prog_active); @@ -507,7 +498,6 @@ static int map_delete_elem(union bpf_attr *attr) if (!err) trace_bpf_map_delete_elem(map, ufd, key); -free_key: kfree(key); err_put: fdput(f); @@ -536,14 +526,11 @@ static int map_get_next_key(union bpf_attr *attr) return PTR_ERR(map); if (ukey) { - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) + key = memdup_user(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); goto err_put; - - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + } } else { key = NULL; } -- cgit v1.3-14-g43fede From 48365b38849fdb1ee6dc65beac044ca59f669683 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 26 Jun 2017 17:07:14 +0200 Subject: sched/debug: Expose the number of RT/DL tasks that can migrate Add the value of the rt_rq.rt_nr_migratory and dl_rq.dl_nr_migratory to the sched_debug output, for instance: rt_rq[0]: .rt_nr_running : 2 .rt_nr_migratory : 1 <--- Like this .rt_throttled : 0 .rt_time : 828.645877 .rt_runtime : 1000.000000 This is useful to debug problems related to the RT/DL schedulers. This also fixes the format of some variables, that were unsigned, rather than signed. Signed-off-by: Daniel Bristot de Oliveira Cc: Clark Williams Cc: Linus Torvalds Cc: Luis Claudio R. Goncalves Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Cc: linux-rt-users Link: http://lkml.kernel.org/r/7896f71cada54ee7dd8507bb666063a2e051c3d4.1498482127.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 38f019324f1a..4fa66de52bd6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x)) #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) - P(rt_nr_running); + PU(rt_nr_running); +#ifdef CONFIG_SMP + PU(rt_nr_migratory); +#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); #undef PN +#undef PU #undef P } @@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) struct dl_bw *dl_bw; SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); - SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); + +#define PU(x) \ + SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) + + PU(dl_nr_running); #ifdef CONFIG_SMP + PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; #endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); + +#undef PU } extern __read_mostly int sched_clock_running; -- cgit v1.3-14-g43fede From 993647a293814dd47ae41d38657fda6e4ab04e33 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 29 Jun 2017 17:40:47 +0530 Subject: cpu/hotplug: Constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const: File size before: text data bss dec hex filename 12582 15361 20 27963 6d3b kernel/cpu.o File size After adding 'const': text data bss dec hex filename 12710 15265 20 27995 6d5b kernel/cpu.o Signed-off-by: Arvind Yadav Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: anna-maria@linutronix.de Cc: bigeasy@linutronix.de Cc: boris.ostrovsky@oracle.com Cc: rcochran@linutronix.de Link: http://lkml.kernel.org/r/f9079e94e12b36d245e7adbf67d312bc5d0250c6.1498737970.git.arvind.yadav.cs@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index d0f5f54aa087..b69c0588f8c9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1629,7 +1629,7 @@ static struct attribute *cpuhp_cpu_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_attr_group = { +static const struct attribute_group cpuhp_cpu_attr_group = { .attrs = cpuhp_cpu_attrs, .name = "hotplug", NULL @@ -1661,7 +1661,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = { NULL }; -static struct attribute_group cpuhp_cpu_root_attr_group = { +static const struct attribute_group cpuhp_cpu_root_attr_group = { .attrs = cpuhp_cpu_root_attrs, .name = "hotplug", NULL -- cgit v1.3-14-g43fede From 72298e5c92c50edd8cb7cfda4519483ce65fa166 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 29 Jun 2017 13:41:28 -0500 Subject: sched/cputime: Refactor the cputime_adjust() code Address a Coverity false positive, which is caused by overly convoluted code: Value assigned to variable 'utime' at line 619:utime = rtime; is overwritten at line 642:utime = rtime - stime; before it can be used. This makes such variable assignment useless. Remove this variable assignment and refactor the code related. Addresses-Coverity-ID: 1371643 Signed-off-by: Gustavo A. R. Silva Cc: Frans Klaver Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Wanpeng Li Link: http://lkml.kernel.org/r/20170629184128.GA5271@embeddedgus Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aea3135c5d90..67c70e287647 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,19 +615,13 @@ static void cputime_adjust(struct task_cputime *curr, * userspace. Once a task gets some ticks, the monotonicy code at * 'update' will ensure things converge to the observed ratio. */ - if (stime == 0) { - utime = rtime; - goto update; + if (stime != 0) { + if (utime == 0) + stime = rtime; + else + stime = scale_stime(stime, rtime, stime + utime); } - if (utime == 0) { - stime = rtime; - goto update; - } - - stime = scale_stime(stime, rtime, stime + utime); - -update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. -- cgit v1.3-14-g43fede From 5c4994102fb508d4a0f7a8afa46560c314c1ebd4 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:05 -0700 Subject: posix-timers: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_gettime, clock_settime, clock_getres and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-stubs.c | 83 +++++++++++++++++++++++++--------------------- kernel/time/posix-timers.c | 63 ++++++++++++++--------------------- 2 files changed, 70 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 65878221cbfb..06f34feb635e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -51,40 +51,52 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct timespec __user *,tp) +int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; + case CLOCK_REALTIME: + ktime_get_real_ts64(tp); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(tp); + break; + case CLOCK_BOOTTIME: + get_monotonic_boottime64(tp); + break; + default: + return -EINVAL; } - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return 0; +} +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + int ret; + struct timespec64 kernel_tp; + + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; + + if (put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -93,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + if (put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: @@ -142,41 +154,35 @@ COMPAT_SYS_NI(setitimer); COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return do_sys_settimeofday64(&new_tp64, NULL); + return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, - struct compat_timespec __user *,tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + int ret; + struct timespec64 kernel_tp; - switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; - case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; - case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; - default: return -EINVAL; - } + ret = do_clock_gettime(which_clock, &kernel_tp); + if (ret) + return ret; - kernel_tp = timespec64_to_timespec(kernel_tp64); - if (compat_put_timespec(&kernel_tp, tp)) + if (compat_put_timespec64(&kernel_tp, tp)) return -EFAULT; return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { - struct timespec rtn_tp = { + struct timespec64 rtn_tp = { .tv_sec = 0, .tv_nsec = hrtimer_resolution, }; @@ -185,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, case CLOCK_REALTIME: case CLOCK_MONOTONIC: case CLOCK_BOOTTIME: - if (compat_put_timespec(&rtn_tp, tp)) + if (compat_put_timespec64(&rtn_tp, tp)) return -EFAULT; return 0; default: return -EINVAL; } } + COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 82d67be7d9d1..39322ae5dd87 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1049,34 +1049,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; - if (copy_from_user(&new_tp, tp, sizeof (*tp))) + if (get_timespec64(&new_tp, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; + struct timespec64 kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + error = kc->clock_get(which_clock, &kernel_tp); - if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; @@ -1109,17 +1105,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; + struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; @@ -1131,38 +1125,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 new_tp64; - struct timespec new_tp; + struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; - if (compat_get_timespec(&new_tp, tp)) + if (compat_get_timespec64(&ts, tp)) return -EFAULT; - new_tp64 = timespec_to_timespec64(new_tp); - - return kc->clock_set(which_clock, &new_tp64); + return kc->clock_set(which_clock, &ts); } COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 kernel_tp64; - struct timespec kernel_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp64); - kernel_tp = timespec64_to_timespec(kernel_tp64); + err = kc->clock_get(which_clock, &ts); - if (!error && compat_put_timespec(&kernel_tp, tp)) - error = -EFAULT; + if (!err && compat_put_timespec64(&ts, tp)) + err = -EFAULT; - return error; + return err; } COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, @@ -1193,21 +1182,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, struct compat_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 rtn_tp64; - struct timespec rtn_tp; - int error; + struct timespec64 ts; + int err; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp64); - rtn_tp = timespec64_to_timespec(rtn_tp64); - - if (!error && tp && compat_put_timespec(&rtn_tp, tp)) - error = -EFAULT; + err = kc->clock_getres(which_clock, &ts); + if (!err && tp && compat_put_timespec64(&ts, tp)) + return -EFAULT; - return error; + return err; } + #endif /* -- cgit v1.3-14-g43fede From c0edd7c9acd0eaee149ab6cb4441cc71a1af87f0 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:06 -0700 Subject: nanosleep: Use get_timespec64() and put_timespec64() Usage of these apis and their compat versions makes the syscalls: clock_nanosleep and nanosleep and their compat implementations simpler. This is a preparatory patch to isolate data conversions to struct timespec64 at userspace boundaries. This helps contain the changes needed to transition to new y2038 safe types. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- include/linux/hrtimer.h | 2 +- kernel/time/alarmtimer.c | 4 ++-- kernel/time/hrtimer.c | 30 +++++++++++++----------------- kernel/time/posix-cpu-timers.c | 8 ++------ kernel/time/posix-timers.c | 20 ++++++++------------ 5 files changed, 26 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 255edd5e7a74..012c37fdb688 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -453,7 +453,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, /* Precise sleep: */ -extern int nanosleep_copyout(struct restart_block *, struct timespec *); +extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c991cf212c6d..0b8ff7d257ea 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarmtimer_freezerset(absexp, type); restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { - struct timespec rmt; + struct timespec64 rmt; ktime_t rem; rem = ktime_sub(absexp, alarm_bases[type].gettime()); if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 81da124f1115..88f75f92ef36 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); -int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) +int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT case TT_COMPAT: - if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) + if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: - if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) + if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: @@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod restart = ¤t->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); - struct timespec rmt; + struct timespec64 rmt; if (rem <= 0) return 0; - rmt = ktime_to_timespec(rem); + rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } @@ -1546,19 +1546,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (copy_from_user(&tu, rqtp, sizeof(tu))) + if (get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #ifdef CONFIG_COMPAT @@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { - struct timespec64 tu64; - struct timespec tu; + struct timespec64 tu; - if (compat_get_timespec(&tu, rqtp)) + if (compat_get_timespec64(&tu, rqtp)) return -EFAULT; - tu64 = timespec_to_timespec64(tu); - if (!timespec64_valid(&tu64)) + if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 9df618ee64cf..7323da5950cc 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1314,12 +1314,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, */ restart = ¤t->restart_block; restart->nanosleep.expires = expires; - if (restart->nanosleep.type != TT_NONE) { - struct timespec ts; - - ts = timespec64_to_timespec(it.it_value); - error = nanosleep_copyout(restart, &ts); - } + if (restart->nanosleep.type != TT_NONE) + error = nanosleep_copyout(restart, &it.it_value); } return error; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 39322ae5dd87..4b0fc3b0a1c4 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1213,26 +1213,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + if (get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT @@ -1241,26 +1239,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, struct compat_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timespec64 t64; - struct timespec t; + struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -ENANOSLEEP_NOTSUP; - if (compat_get_timespec(&t, rqtp)) + if (compat_get_timespec64(&t, rqtp)) return -EFAULT; - t64 = timespec_to_timespec64(t); - if (!timespec64_valid(&t64)) + if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return kc->nsleep(which_clock, flags, &t64); + return kc->nsleep(which_clock, flags, &t); } #endif -- cgit v1.3-14-g43fede From 725816e8aabb1c183baa2bc9572ab9a0d26b9ea1 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 24 Jun 2017 11:45:08 -0700 Subject: posix_clocks: Use get_itimerspec64() and put_itimerspec64() Usage of these apis and their compat versions makes the syscalls: timer_settime and timer_gettime and their compat implementations simpler. This patch also serves as a preparatory patch for changing syscalls to use new time_t data types to support the y2038 effort by isolating the processing of user pointers through these apis. Signed-off-by: Deepa Dinamani Signed-off-by: Al Viro --- kernel/time/posix-timers.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4b0fc3b0a1c4..13d6881f908b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct compat_itimerspec __user *, setting) { - struct itimerspec64 cur_setting64; + struct itimerspec64 cur_setting; - int ret = do_timer_gettime(timer_id, &cur_setting64); + int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { - struct itimerspec cur_setting; - cur_setting = itimerspec64_to_itimerspec(&cur_setting64); - if (put_compat_itimerspec(setting, &cur_setting)) + if (put_compat_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; @@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; int error = 0; if (!new_setting) return -EINVAL; - if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; @@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, struct compat_itimerspec __user *, new, struct compat_itimerspec __user *, old) { - struct itimerspec64 new_spec64, old_spec64; - struct itimerspec64 *rtn = old ? &old_spec64 : NULL; - struct itimerspec new_spec; + struct itimerspec64 new_spec, old_spec; + struct itimerspec64 *rtn = old ? &old_spec : NULL; int error = 0; if (!new) return -EINVAL; - if (get_compat_itimerspec(&new_spec, new)) + if (get_compat_itimerspec64(&new_spec, new)) return -EFAULT; - new_spec64 = itimerspec_to_itimerspec64(&new_spec); - error = do_timer_settime(timer_id, flags, &new_spec64, rtn); + error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { - struct itimerspec old_spec; - old_spec = itimerspec64_to_itimerspec(&old_spec64); - if (put_compat_itimerspec(old, &old_spec)) + if (put_compat_itimerspec64(&old_spec, old)) error = -EFAULT; } return error; -- cgit v1.3-14-g43fede From c207aee48037abca71c669cbec407b9891965c34 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 28 Jun 2017 10:11:06 -0500 Subject: objtool, x86: Add several functions and files to the objtool whitelist In preparation for an objtool rewrite which will have broader checks, whitelist functions and files which cause problems because they do unusual things with the stack. These whitelists serve as a TODO list for which functions and files don't yet have undwarf unwinder coverage. Eventually most of the whitelists can be removed in favor of manual CFI hint annotations or objtool improvements. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Jiri Slaby Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/7f934a5d707a574bda33ea282e9478e627fb1829.1498659915.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/Makefile | 2 ++ arch/x86/crypto/sha1-mb/Makefile | 2 ++ arch/x86/crypto/sha256-mb/Makefile | 2 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/acpi/Makefile | 2 ++ arch/x86/kernel/kprobes/opt.c | 9 ++++++++- arch/x86/kernel/reboot.c | 2 ++ arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/vmx.c | 3 +++ arch/x86/lib/msr-reg.S | 8 ++++---- arch/x86/net/Makefile | 2 ++ arch/x86/platform/efi/Makefile | 1 + arch/x86/power/Makefile | 2 ++ arch/x86/xen/Makefile | 3 +++ kernel/kexec_core.c | 4 +++- 15 files changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 34b3fa2889d1..9e32d40d71bd 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) diff --git a/arch/x86/crypto/sha1-mb/Makefile b/arch/x86/crypto/sha1-mb/Makefile index 2f8756375df5..2e14acc3da25 100644 --- a/arch/x86/crypto/sha1-mb/Makefile +++ b/arch/x86/crypto/sha1-mb/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) ifeq ($(avx2_supported),yes) diff --git a/arch/x86/crypto/sha256-mb/Makefile b/arch/x86/crypto/sha256-mb/Makefile index 41089e7c400c..45b4fca6c4a8 100644 --- a/arch/x86/crypto/sha256-mb/Makefile +++ b/arch/x86/crypto/sha256-mb/Makefile @@ -2,6 +2,8 @@ # Arch-specific CryptoAPI modules. # +OBJECT_FILES_NON_STANDARD := y + avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) ifeq ($(avx2_supported),yes) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4b994232cb57..3c7c419c4e3e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 26b78d86f25a..85a9e17e0dbc 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y + obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 901c640d152f..69ea0bc1cfa3 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) } asm ( + "optprobe_template_func:\n" ".global optprobe_template_entry\n" "optprobe_template_entry:\n" #ifdef CONFIG_X86_64 @@ -131,7 +133,12 @@ asm ( " popf\n" #endif ".global optprobe_template_end\n" - "optprobe_template_end:\n"); + "optprobe_template_end:\n" + ".type optprobe_template_func, @function\n" + ".size optprobe_template_func, .-optprobe_template_func\n"); + +void optprobe_template_func(void); +STACK_FRAME_NON_STANDARD(optprobe_template_func); #define TMPL_MOVE_IDX \ ((long)&optprobe_template_val - (long)&optprobe_template_entry) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 2544700a2a87..67393fc88353 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type) #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif +STACK_FRAME_NON_STANDARD(machine_real_restart); /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ba9891ac5c56..33460fcdeef9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -4906,6 +4907,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } +STACK_FRAME_NON_STANDARD(svm_vcpu_run); static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ca5d2b93385c..1b469b6c762f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -8652,6 +8653,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) ); } } +STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); static bool vmx_has_high_real_mode_segbase(void) { @@ -9028,6 +9030,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } +STACK_FRAME_NON_STANDARD(vmx_vcpu_run); static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index c81556409bbb..10ffa7e8519f 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -13,14 +13,14 @@ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx - pushq %rbp + pushq %r12 movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax movl 4(%rdi), %ecx movl 8(%rdi), %edx movl 12(%rdi), %ebx - movl 20(%rdi), %ebp + movl 20(%rdi), %r12d movl 24(%rdi), %esi movl 28(%rdi), %edi 1: \op @@ -29,10 +29,10 @@ ENTRY(\op\()_safe_regs) movl %ecx, 4(%r10) movl %edx, 8(%r10) movl %ebx, 12(%r10) - movl %ebp, 20(%r10) + movl %r12d, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq %rbp + popq %r12 popq %rbx ret 3: diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index 90568c33ddb0..fefb4b619598 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -1,4 +1,6 @@ # # Arch-specific network modules # +OBJECT_FILES_NON_STANDARD_bpf_jit.o += y + obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index f1d83b34c329..2f56e1ed61c3 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1,4 +1,5 @@ OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index a6a198c33623..05041871ac90 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -1,3 +1,5 @@ +OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y + # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index fffb0a16f9e3..bced7a369a11 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,3 +1,6 @@ +OBJECT_FILES_NON_STANDARD_xen-asm_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_xen-pvh.o := y + ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ae1a3ba24df5..154ffb489b93 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -874,7 +875,7 @@ int kexec_load_disabled; * only when panic_cpu holds the current CPU number; this is the only CPU * which processes crash_kexec routines. */ -void __crash_kexec(struct pt_regs *regs) +void __noclone __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -896,6 +897,7 @@ void __crash_kexec(struct pt_regs *regs) mutex_unlock(&kexec_mutex); } } +STACK_FRAME_NON_STANDARD(__crash_kexec); void crash_kexec(struct pt_regs *regs) { -- cgit v1.3-14-g43fede From 3859a271a003aba01e45b85c9d8b355eb7bf25f9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2016 01:22:25 -0700 Subject: randstruct: Mark various structs for randomization This marks many critical kernel structures for randomization. These are structures that have been targeted in the past in security exploits, or contain functions pointers, pointers to function pointer tables, lists, workqueues, ref-counters, credentials, permissions, or are otherwise sensitive. This initial list was extracted from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Left out of this list is task_struct, which requires special handling and will be covered in a subsequent patch. Signed-off-by: Kees Cook --- arch/x86/include/asm/processor.h | 2 +- fs/mount.h | 4 ++-- fs/namei.c | 2 +- fs/proc/internal.h | 6 +++--- include/linux/binfmts.h | 4 ++-- include/linux/cdev.h | 2 +- include/linux/cred.h | 4 ++-- include/linux/dcache.h | 2 +- include/linux/fs.h | 17 +++++++++-------- include/linux/fs_struct.h | 2 +- include/linux/ipc.h | 2 +- include/linux/ipc_namespace.h | 2 +- include/linux/key-type.h | 4 ++-- include/linux/kmod.h | 2 +- include/linux/kobject.h | 2 +- include/linux/lsm_hooks.h | 4 ++-- include/linux/mm_types.h | 4 ++-- include/linux/module.h | 4 ++-- include/linux/mount.h | 2 +- include/linux/msg.h | 2 +- include/linux/path.h | 2 +- include/linux/pid_namespace.h | 2 +- include/linux/proc_ns.h | 2 +- include/linux/sched.h | 2 +- include/linux/sched/signal.h | 2 +- include/linux/sem.h | 2 +- include/linux/shm.h | 2 +- include/linux/sysctl.h | 2 +- include/linux/tty.h | 2 +- include/linux/tty_driver.h | 4 ++-- include/linux/user_namespace.h | 2 +- include/linux/utsname.h | 2 +- include/net/af_unix.h | 2 +- include/net/neighbour.h | 2 +- include/net/net_namespace.h | 2 +- include/net/sock.h | 2 +- kernel/futex.c | 4 ++-- security/keys/internal.h | 2 +- 38 files changed, 57 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3cada998a402..e2335edb9fc5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,7 +129,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -}; +} __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; diff --git a/fs/mount.h b/fs/mount.h index bf1fda6eed8f..e406b286fba1 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,7 +16,7 @@ struct mnt_namespace { u64 event; unsigned int mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; -}; +} __randomize_layout; struct mnt_pcp { int mnt_count; @@ -68,7 +68,7 @@ struct mount { struct hlist_head mnt_pins; struct fs_pin mnt_umount; struct dentry *mnt_ex_mountpoint; -}; +} __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index 6571a5f5112e..1764620ac383 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -524,7 +524,7 @@ struct nameidata { struct inode *link_inode; unsigned root_seq; int dfd; -}; +} __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c5ae09b6c726..07b16318223f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -51,7 +51,7 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ u8 namelen; char name[]; -}; +} __randomize_layout; union proc_op { int (*proc_get_link)(struct dentry *, struct path *); @@ -70,7 +70,7 @@ struct proc_inode { struct list_head sysctl_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; -}; +} __randomize_layout; /* * General functions @@ -279,7 +279,7 @@ struct proc_maps_private { #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif -}; +} __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 05488da3aee9..3ae9013eeaaa 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -46,7 +46,7 @@ struct linux_binprm { unsigned interp_flags; unsigned interp_data; unsigned long loader, exec; -}; +} __randomize_layout; #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0 #define BINPRM_FLAGS_ENFORCE_NONDUMP (1 << BINPRM_FLAGS_ENFORCE_NONDUMP_BIT) @@ -81,7 +81,7 @@ struct linux_binfmt { int (*load_shlib)(struct file *); int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ -}; +} __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index 408bc09ce497..cb28eb21e3ca 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -17,7 +17,7 @@ struct cdev { struct list_head list; dev_t dev; unsigned int count; -}; +} __randomize_layout; void cdev_init(struct cdev *, const struct file_operations *); diff --git a/include/linux/cred.h b/include/linux/cred.h index b03e7d049a64..82c8a9e1aabb 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -31,7 +31,7 @@ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; -}; +} __randomize_layout; /** * get_group_info - Get a reference to a group info structure @@ -145,7 +145,7 @@ struct cred { struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct rcu_head rcu; /* RCU deletion hook */ -}; +} __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d2e38dc6172c..7eb262e13d3c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -113,7 +113,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -}; +} __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..8f28143486c4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -275,7 +275,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; -}; +} __randomize_layout; static inline bool is_sync_kiocb(struct kiocb *kiocb) { @@ -392,7 +392,7 @@ struct address_space { gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ -} __attribute__((aligned(sizeof(long)))); +} __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit @@ -435,7 +435,7 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; -}; +} __randomize_layout; /* * Radix-tree tags, for tagging dirty and writeback pages within the pagecache @@ -653,7 +653,7 @@ struct inode { #endif void *i_private; /* fs or device private pointer */ -}; +} __randomize_layout; static inline unsigned int i_blocksize(const struct inode *node) { @@ -868,7 +868,8 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ +} __randomize_layout + __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; @@ -1005,7 +1006,7 @@ struct file_lock { int state; /* state of grant or error if -ve */ } afs; } fl_u; -}; +} __randomize_layout; struct file_lock_context { spinlock_t flc_lock; @@ -1404,7 +1405,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ -}; +} __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can @@ -1690,7 +1691,7 @@ struct file_operations { u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); -}; +} __randomize_layout; struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 0efc3e62843a..7a026240cbb1 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -12,7 +12,7 @@ struct fs_struct { int umask; int in_exec; struct path root, pwd; -}; +} __randomize_layout; extern struct kmem_cache *fs_cachep; diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 71fd92d81b26..ea0eb0b5f98c 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -20,6 +20,6 @@ struct kern_ipc_perm { umode_t mode; unsigned long seq; void *security; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned_in_smp __randomize_layout; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 848e5796400e..65327ee0936b 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -61,7 +61,7 @@ struct ipc_namespace { struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 8496cf64575c..9520fc3c3b9a 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -45,7 +45,7 @@ struct key_preparsed_payload { size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time_t expiry; /* Expiry time of key */ -}; +} __randomize_layout; typedef int (*request_key_actor_t)(struct key_construction *key, const char *op, void *aux); @@ -158,7 +158,7 @@ struct key_type { /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ -}; +} __randomize_layout; extern struct key_type key_type_keyring; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4e441e00db5..655082c88fd9 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -64,7 +64,7 @@ struct subprocess_info { int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; -}; +} __randomize_layout; extern int call_usermodehelper(const char *path, char **argv, char **envp, int wait); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index ca85cb80e99a..084513350317 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -172,7 +172,7 @@ struct kset { spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; -}; +} __randomize_layout; extern void kset_init(struct kset *kset); extern int __must_check kset_register(struct kset *kset); diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 080f34e66017..565163fc9ad4 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1876,7 +1876,7 @@ struct security_hook_heads { struct list_head audit_rule_match; struct list_head audit_rule_free; #endif /* CONFIG_AUDIT */ -}; +} __randomize_layout; /* * Security module hook list structure. @@ -1887,7 +1887,7 @@ struct security_hook_list { struct list_head *head; union security_list_options hook; char *lsm; -}; +} __randomize_layout; /* * Initializing a security_hook_list structure takes diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 45cdb27791a3..ff151814a02d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,7 +342,7 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -}; +} __randomize_layout; struct core_thread { struct task_struct *task; @@ -500,7 +500,7 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; -}; +} __randomize_layout; extern struct mm_struct init_mm; diff --git a/include/linux/module.h b/include/linux/module.h index 21f56393602f..d93111d7def6 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -45,7 +45,7 @@ struct module_kobject { struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; -}; +} __randomize_layout; struct module_attribute { struct attribute attr; @@ -475,7 +475,7 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif -} ____cacheline_aligned; +} ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif diff --git a/include/linux/mount.h b/include/linux/mount.h index 8e0352af06b7..1ce85e6fd95f 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,7 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; -}; +} __randomize_layout; struct file; /* forward dec */ struct path; diff --git a/include/linux/msg.h b/include/linux/msg.h index f3f302f9c197..a001305f5a79 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -29,7 +29,7 @@ struct msg_queue { struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; -}; +} __randomize_layout; /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, diff --git a/include/linux/path.h b/include/linux/path.h index d1372186f431..cde895cc4af4 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -7,7 +7,7 @@ struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; -}; +} __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c2a989dee876..b09136f88cf4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -52,7 +52,7 @@ struct pid_namespace { int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -}; +} __randomize_layout; extern struct pid_namespace init_pid_ns; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 58ab28d81fc2..06844b54dfc1 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -21,7 +21,7 @@ struct proc_ns_operations { int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); -}; +} __randomize_layout; extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b69fc650201..f833254fce00 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -408,7 +408,7 @@ struct sched_rt_entity { /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif -}; +} __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index c06d63b3a583..2a0dd40b15db 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -222,7 +222,7 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ -}; +} __randomize_layout; /* * Bits in flags field of signal_struct. diff --git a/include/linux/sem.h b/include/linux/sem.h index 9edec926e9d9..23bcbdfad4a6 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -21,7 +21,7 @@ struct sem_array { int sem_nsems; /* no. of semaphores in array */ int complex_count; /* pending complex operations */ unsigned int use_global_lock;/* >0: global lock required */ -}; +} __randomize_layout; #ifdef CONFIG_SYSVIPC diff --git a/include/linux/shm.h b/include/linux/shm.h index 04e881829625..0fb7061ec54c 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -22,7 +22,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; struct list_head shm_clist; /* list by creator */ -}; +} __randomize_layout; /* shm_mode upper byte flags */ #define SHM_DEST 01000 /* segment will be destroyed on last detach */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..9ddeef2c03e2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -117,7 +117,7 @@ struct ctl_table struct ctl_table_poll *poll; void *extra1; void *extra2; -}; +} __randomize_layout; struct ctl_node { struct rb_node node; diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..73f8d0977bb0 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -333,7 +333,7 @@ struct tty_struct { /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; -}; +} __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index b742b5e47cc2..00b2213f6a35 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -291,7 +291,7 @@ struct tty_operations { void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif const struct file_operations *proc_fops; -}; +} __randomize_layout; struct tty_driver { int magic; /* magic number for this structure */ @@ -325,7 +325,7 @@ struct tty_driver { const struct tty_operations *ops; struct list_head tty_drivers; -}; +} __randomize_layout; extern struct list_head tty_drivers; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 32354b4b4b2b..b3575ce29148 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -66,7 +66,7 @@ struct user_namespace { #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; -}; +} __randomize_layout; struct ucounts { struct hlist_node node; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 60f0bb83b313..da826ed059cf 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -26,7 +26,7 @@ struct uts_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; -}; +} __randomize_layout; extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS diff --git a/include/net/af_unix.h b/include/net/af_unix.h index fd60eccb59a6..64e2a1e24a2c 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_skb_parms { u32 secid; /* Security ID */ #endif u32 consumed; -}; +} __randomize_layout; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/include/net/neighbour.h b/include/net/neighbour.h index e4dd3a214034..a62959d2b3f7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -155,7 +155,7 @@ struct neighbour { struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; -}; +} __randomize_layout; struct neigh_ops { int family; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fe80bb48ab1f..a224196d16ac 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -147,7 +147,7 @@ struct net { #endif struct sock *diag_nlsk; atomic_t fnhe_genid; -}; +} __randomize_layout; #include diff --git a/include/net/sock.h b/include/net/sock.h index f33e3d134e0b..d349297db9e9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1113,7 +1113,7 @@ struct proto { atomic_t socks; #endif int (*diag_destroy)(struct sock *sk, int err); -}; +} __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..5616511abf39 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -212,7 +212,7 @@ struct futex_pi_state { atomic_t refcount; union futex_key key; -}; +} __randomize_layout; /** * struct futex_q - The hashed futex queue entry, one per waiting task @@ -246,7 +246,7 @@ struct futex_q { struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; -}; +} __randomize_layout; static const struct futex_q futex_q_init = { /* list gets initialized in queue_me()*/ diff --git a/security/keys/internal.h b/security/keys/internal.h index c0f8682eba69..6494954e9980 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -197,7 +197,7 @@ struct request_key_auth { void *callout_info; size_t callout_len; pid_t pid; -}; +} __randomize_layout; extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, -- cgit v1.3-14-g43fede From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 30 Jun 2017 20:02:40 -0700 Subject: bpf: BPF support for sock_ops Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding struct that allows BPF programs of this type to access some of the socket's fields (such as IP addresses, ports, etc.). It uses the existing bpf cgroups infrastructure so the programs can be attached per cgroup with full inheritance support. The program will be called at appropriate times to set relevant connections parameters such as buffer sizes, SYN and SYN-ACK RTOs, etc., based on connection information such as IP addresses, port numbers, etc. Alghough there are already 3 mechanisms to set parameters (sysctls, route metrics and setsockopts), this new mechanism provides some distinct advantages. Unlike sysctls, it can set parameters per connection. In contrast to route metrics, it can also use port numbers and information provided by a user level program. In addition, it could set parameters probabilistically for evaluation purposes (i.e. do something different on 10% of the flows and compare results with the other 90% of the flows). Also, in cases where IPv6 addresses contain geographic information, the rules to make changes based on the distance (or RTT) between the hosts are much easier than route metric rules and can be global. Finally, unlike setsockopt, it oes not require application changes and it can be updated easily at any time. Although the bpf cgroup framework already contains a sock related program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type (BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called only once during the connections's lifetime. In contrast, the new program type will be called multiple times from different places in the network stack code. For example, before sending SYN and SYN-ACKs to set an appropriate timeout, when the connection is established to set congestion control, etc. As a result it has "op" field to specify the type of operation requested. The purpose of this new program type is to simplify setting connection parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is easy to use facebook's internal IPv6 addresses to determine if both hosts of a connection are in the same datacenter. Therefore, it is easy to write a BPF program to choose a small SYN RTO value when both hosts are in the same datacenter. This patch only contains the framework to support the new BPF program type, following patches add the functionality to set various connection parameters. This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS and a new bpf syscall command to load a new program of this type: BPF_PROG_LOAD_SOCKET_OPS. Two new corresponding structs (one for the kernel one for the user/BPF program): /* kernel version */ struct bpf_sock_ops_kern { struct sock *sk; __u32 op; union { __u32 reply; __u32 replylong[4]; }; }; /* user version * Some fields are in network byte order reflecting the sock struct * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to * convert them to host byte order. */ struct bpf_sock_ops { __u32 op; union { __u32 reply; __u32 replylong[4]; }; __u32 family; __u32 remote_ip4; /* In network byte order */ __u32 local_ip4; /* In network byte order */ __u32 remote_ip6[4]; /* In network byte order */ __u32 local_ip6[4]; /* In network byte order */ __u32 remote_port; /* In network byte order */ __u32 local_port; /* In host byte horder */ }; Currently there are two types of ops. The first type expects the BPF program to return a value which is then used by the caller (or a negative value to indicate the operation is not supported). The second type expects state changes to be done by the BPF program, for example through a setsockopt BPF helper function, and they ignore the return value. The reply fields of the bpf_sockt_ops struct are there in case a bpf program needs to return a value larger than an integer. Signed-off-by: Lawrence Brakmo Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 18 +++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 9 +++ include/net/tcp.h | 36 ++++++++++ include/uapi/linux/bpf.h | 30 ++++++++ kernel/bpf/cgroup.c | 37 ++++++++++ kernel/bpf/syscall.c | 5 ++ net/core/filter.c | 168 +++++++++++++++++++++++++++++++++++++++++++++ samples/bpf/bpf_load.c | 13 +++- 9 files changed, 314 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c970a25d2a49..360c082e885c 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -7,6 +7,7 @@ struct sock; struct cgroup; struct sk_buff; +struct bpf_sock_ops_kern; #ifdef CONFIG_CGROUP_BPF @@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -75,6 +80,18 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + } \ + __ret; \ +}) #else struct cgroup_bpf {}; @@ -85,6 +102,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 03bf223f18be..3d137c33d664 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -10,6 +10,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) diff --git a/include/linux/filter.h b/include/linux/filter.h index 1fa26dc562ce..738f8b14f025 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -898,4 +898,13 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_ops_kern { + struct sock *sk; + u32 op; + union { + u32 reply; + u32 replylong[4]; + }; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d0751b79d99c..e58500825006 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -46,6 +46,10 @@ #include #include +#include +#include +#include + extern struct inet_hashinfo tcp_hashinfo; extern struct percpu_counter tcp_orphan_count; @@ -2021,4 +2025,36 @@ int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); +/* Call BPF_SOCK_OPS program that returns an int. If the return value + * is < 0, then the BPF op failed (for example if the loaded BPF + * program does not support the chosen operation or there is no BPF + * program loaded). + */ +#ifdef CONFIG_BPF +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + struct bpf_sock_ops_kern sock_ops; + int ret; + + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + + memset(&sock_ops, 0, sizeof(sock_ops)); + sock_ops.sk = sk; + sock_ops.op = op; + + ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); + if (ret == 0) + ret = sock_ops.reply; + else + ret = -1; + return ret; +} +#else +static inline int tcp_call_bpf(struct sock *sk, int op) +{ + return -EPERM; +} +#endif + #endif /* _TCP_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f94b48b168dc..01cd485ccd4f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -120,12 +120,14 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, }; enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, __MAX_BPF_ATTACH_TYPE }; @@ -720,4 +722,32 @@ struct bpf_map_info { __u32 map_flags; } __attribute__((aligned(8))); +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 reply; + __u32 replylong[4]; + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + +/** + * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock + * @sk: socket to get cgroup from + * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains + * sk with connection information (IP addresses, etc.) May not contain + * cgroup info if it is a req sock. + * @type: The type of program to be exectuted + * + * socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock_ops + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, + struct bpf_sock_ops_kern *sock_ops, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4409ccca8831..d4d47de75bba 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_SOCK_OPS: + ptype = BPF_PROG_TYPE_SOCK_OPS; + break; default: return -EINVAL; } @@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); @@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } + #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration diff --git a/net/core/filter.c b/net/core/filter.c index b39c869d22e3..1f6a26c4f8b9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3110,6 +3110,36 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool __is_valid_sock_ops_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + break; + default: + return false; + } + } + + return __is_valid_sock_ops_access(off, size); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3379,6 +3409,138 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, op); + off += offsetof(struct bpf_sock_ops_kern, op); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... + offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + } + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -3428,6 +3590,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = { .convert_ctx_access = sock_filter_convert_ctx_access, }; +const struct bpf_verifier_ops sock_ops_prog_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index a91c57dd8571..a4be7cfa6519 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -64,6 +64,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_perf_event = strncmp(event, "perf_event", 10) == 0; bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0; + bool is_sockops = strncmp(event, "sockops", 7) == 0; size_t insns_cnt = size / sizeof(struct bpf_insn); enum bpf_prog_type prog_type; char buf[256]; @@ -89,6 +90,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_CGROUP_SKB; } else if (is_cgroup_sk) { prog_type = BPF_PROG_TYPE_CGROUP_SOCK; + } else if (is_sockops) { + prog_type = BPF_PROG_TYPE_SOCK_OPS; } else { printf("Unknown event '%s'\n", event); return -1; @@ -106,8 +109,11 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk) return 0; - if (is_socket) { - event += 6; + if (is_socket || is_sockops) { + if (is_socket) + event += 6; + else + event += 7; if (*event != '/') return 0; event++; @@ -560,7 +566,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || - memcmp(shname, "cgroup/", 7) == 0) + memcmp(shname, "cgroup/", 7) == 0 || + memcmp(shname, "sockops", 7) == 0) load_and_attach(shname, data->d_buf, data->d_size); } -- cgit v1.3-14-g43fede From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:27 +0200 Subject: bpf: simplify narrower ctx access This work tries to make the semantics and code around the narrower ctx access a bit easier to follow. Right now everything is done inside the .is_valid_access(). Offset matching is done differently for read/write types, meaning writes don't support narrower access and thus matching only on offsetof(struct foo, bar) is enough whereas for read case that supports narrower access we must check for offsetof(struct foo, bar) + offsetof(struct foo, bar) + sizeof() - 1 for each of the cases. For read cases of individual members that don't support narrower access (like packet pointers or skb->cb[] case which has its own narrow access logic), we check as usual only offsetof(struct foo, bar) like in write case. Then, for the case where narrower access is allowed, we also need to set the aux info for the access. Meaning, ctx_field_size and converted_op_size have to be set. First is the original field size e.g. sizeof() as in above example from the user facing ctx, and latter one is the target size after actual rewrite happened, thus for the kernel facing ctx. Also here we need the range match and we need to keep track changing convert_ctx_access() and converted_op_size from is_valid_access() as both are not at the same location. We can simplify the code a bit: check_ctx_access() becomes simpler in that we only store ctx_field_size as a meta data and later in convert_ctx_accesses() we fetch the target_size right from the location where we do convert. Should the verifier be misconfigured we do reject for BPF_WRITE cases or target_size that are not provided. For the subsystems, we always work on ranges in is_valid_access() and add small helpers for ranges and narrow access, convert_ctx_accesses() sets target_size for the relevant instruction. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Cc: Yonghong Song Signed-off-by: David S. Miller --- include/linux/bpf.h | 9 +- include/linux/filter.h | 47 ++++++++++ kernel/bpf/verifier.c | 78 +++++++--------- kernel/trace/bpf_trace.c | 31 +++--- net/core/filter.c | 239 +++++++++++++++++++++-------------------------- 5 files changed, 209 insertions(+), 195 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5175729270d7..b69e7a5869ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -156,9 +156,14 @@ struct bpf_prog; struct bpf_insn_access_aux { enum bpf_reg_type reg_type; int ctx_field_size; - int converted_op_size; }; +static inline void +bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) +{ + aux->ctx_field_size = size; +} + struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); @@ -173,7 +178,7 @@ struct bpf_verifier_ops { u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, - struct bpf_prog *prog); + struct bpf_prog *prog, u32 *target_size); int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 738f8b14f025..f1fc9baa3509 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -337,6 +337,22 @@ struct bpf_prog_aux; bpf_size; \ }) +#define bpf_size_to_bytes(bpf_size) \ +({ \ + int bytes = -EINVAL; \ + \ + if (bpf_size == BPF_B) \ + bytes = sizeof(u8); \ + else if (bpf_size == BPF_H) \ + bytes = sizeof(u16); \ + else if (bpf_size == BPF_W) \ + bytes = sizeof(u32); \ + else if (bpf_size == BPF_DW) \ + bytes = sizeof(u64); \ + \ + bytes; \ +}) + #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ @@ -351,6 +367,13 @@ struct bpf_prog_aux; __size; \ }) +#define BPF_LDST_BYTES(insn) \ + ({ \ + const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \ + WARN_ON(__size < 0); \ + __size; \ + }) + #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) @@ -401,6 +424,18 @@ struct bpf_prog_aux; #define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +#define bpf_ctx_range(TYPE, MEMBER) \ + offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ + offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 + +#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ + ({ \ + BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \ + *(PTR_SIZE) = (SIZE); \ + offsetof(TYPE, MEMBER); \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { @@ -564,6 +599,18 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +{ + bool off_ok; +#ifdef __LITTLE_ENDIAN + off_ok = (off & (size_default - 1)) == 0; +#else + off_ok = (off & (size_default - 1)) + size == size_default; +#endif + return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) #ifdef CONFIG_ARCH_HAS_SET_MEMORY diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ea2adcb233b..6f820a044079 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, return 0; } -static int bpf_size_to_bytes(int bpf_size) -{ - if (bpf_size == BPF_W) - return 4; - else if (bpf_size == BPF_H) - return 2; - else if (bpf_size == BPF_B) - return 1; - else if (bpf_size == BPF_DW) - return 8; - else - return -EINVAL; -} - static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -761,7 +747,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) { - struct bpf_insn_access_aux info = { .reg_type = *reg_type }; + struct bpf_insn_access_aux info = { + .reg_type = *reg_type, + }; /* for analyzer ctx accesses are already validated and converted */ if (env->analyzer_ops) @@ -769,25 +757,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, if (env->prog->aux->ops->is_valid_access && env->prog->aux->ops->is_valid_access(off, size, t, &info)) { - /* a non zero info.ctx_field_size indicates: - * . For this field, the prog type specific ctx conversion algorithm - * only supports whole field access. - * . This ctx access is a candiate for later verifier transformation - * to load the whole field and then apply a mask to get correct result. - * a non zero info.converted_op_size indicates perceived actual converted - * value width in convert_ctx_access. + /* A non zero info.ctx_field_size indicates that this field is a + * candidate for later verifier transformation to load the whole + * field and then apply a mask when accessed with a narrower + * access than actual ctx access size. A zero info.ctx_field_size + * will only allow for whole field access and rejects any other + * type of narrower access. */ - if ((info.ctx_field_size && !info.converted_op_size) || - (!info.ctx_field_size && info.converted_op_size)) { - verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n", - env->prog->type, off, size); - return -EACCES; - } - - if (info.ctx_field_size) { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size; - } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; /* remember the offset of last byte accessed in ctx */ @@ -3401,11 +3378,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of static int convert_ctx_accesses(struct bpf_verifier_env *env) { const struct bpf_verifier_ops *ops = env->prog->aux->ops; + int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; struct bpf_prog *new_prog; enum bpf_access_type type; - int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0; + bool is_narrower_load; + u32 target_size; if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -3445,39 +3424,50 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - off = insn->off; - size = bpf_size_to_bytes(BPF_SIZE(insn->code)); ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - converted_op_size = env->insn_aux_data[i + delta].converted_op_size; - is_narrower_load = type == BPF_READ && size < ctx_field_size; + size = BPF_LDST_BYTES(insn); /* If the read access is a narrower load of the field, * convert to a 4/8-byte load, to minimum program type specific * convert_ctx_access changes. If conversion is successful, * we will apply proper mask to the result. */ + is_narrower_load = size < ctx_field_size; if (is_narrower_load) { - int size_code = BPF_H; + u32 off = insn->off; + u8 size_code; + + if (type == BPF_WRITE) { + verbose("bpf verifier narrow ctx access misconfigured\n"); + return -EINVAL; + } + size_code = BPF_H; if (ctx_field_size == 4) size_code = BPF_W; else if (ctx_field_size == 8) size_code = BPF_DW; + insn->off = off & ~(ctx_field_size - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } - cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + + target_size = 0; + cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || + (ctx_field_size && !target_size)) { verbose("bpf verifier is misconfigured\n"); return -EINVAL; } - if (is_narrower_load && size < converted_op_size) { + + if (is_narrower_load && size < target_size) { if (ctx_field_size <= 4) insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); else insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1 << size * 8) - 1); } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 97c46b440cd6..5c6d538dbf43 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -583,7 +583,8 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { - int sample_period_off; + const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, + sample_period); if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) return false; @@ -592,43 +593,35 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type if (off % size != 0) return false; - /* permit 1, 2, 4 byte narrower and 8 normal read access to sample_period */ - sample_period_off = offsetof(struct bpf_perf_event_data, sample_period); - if (off >= sample_period_off && off < sample_period_off + sizeof(__u64)) { - int allowed; - -#ifdef __LITTLE_ENDIAN - allowed = (off & 0x7) == 0 && size <= 8 && (size & (size - 1)) == 0; -#else - allowed = ((off & 0x7) + size) == 8 && size <= 8 && (size & (size - 1)) == 0; -#endif - if (!allowed) + switch (off) { + case bpf_ctx_range(struct bpf_perf_event_data, sample_period): + bpf_ctx_record_field_size(info, size_sp); + if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) return false; - info->ctx_field_size = 8; - info->converted_op_size = 8; - } else { + break; + default: if (size != sizeof(long)) return false; } + return true; } static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct bpf_perf_event_data, sample_period): - BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, data), si->dst_reg, si->src_reg, offsetof(struct bpf_perf_event_data_kern, data)); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, - offsetof(struct perf_sample_data, period)); + bpf_target_off(struct perf_sample_data, period, 8, + target_size)); break; default: *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, diff --git a/net/core/filter.c b/net/core/filter.c index 29620df45b7c..94169572d002 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3088,38 +3088,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static void __set_access_aux_info(int off, struct bpf_insn_access_aux *info) +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { - info->ctx_field_size = 4; - switch (off) { - case offsetof(struct __sk_buff, pkt_type) ... - offsetof(struct __sk_buff, pkt_type) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_present) ... - offsetof(struct __sk_buff, vlan_present) + sizeof(__u32) - 1: - info->converted_op_size = 1; - break; - case offsetof(struct __sk_buff, queue_mapping) ... - offsetof(struct __sk_buff, queue_mapping) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, protocol) ... - offsetof(struct __sk_buff, protocol) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_tci) ... - offsetof(struct __sk_buff, vlan_tci) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, vlan_proto) ... - offsetof(struct __sk_buff, vlan_proto) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_index) ... - offsetof(struct __sk_buff, tc_index) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: - info->converted_op_size = 2; - break; - default: - info->converted_op_size = 4; - } -} + const int size_default = sizeof(__u32); -static bool __is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -3128,40 +3101,24 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type, return false; switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - if (off + size > - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; - case offsetof(struct __sk_buff, data) ... - offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: - if (size != sizeof(__u32)) + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) return false; - info->reg_type = PTR_TO_PACKET; - break; - case offsetof(struct __sk_buff, data_end) ... - offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: - if (size != sizeof(__u32)) - return false; - info->reg_type = PTR_TO_PACKET_END; break; default: + /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { - if (size != sizeof(__u32)) + if (size != size_default) return false; } else { - int allowed; - - /* permit narrower load for not cb/data/data_end fields */ -#ifdef __LITTLE_ENDIAN - allowed = (off & 0x3) == 0 && size <= 4 && (size & (size - 1)) == 0; -#else - allowed = (off & 0x3) + size == 4 && size <= 4 && (size & (size - 1)) == 0; -#endif - if (!allowed) + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; - __set_access_aux_info(off, info); } } @@ -3173,26 +3130,22 @@ static bool sk_filter_is_valid_access(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, data) ... - offsetof(struct __sk_buff, data) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, data_end) ... - offsetof(struct __sk_buff, data_end) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, @@ -3200,24 +3153,31 @@ static bool lwt_is_valid_access(int off, int size, struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid) ... - offsetof(struct __sk_buff, tc_classid) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, tc_classid): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, @@ -3289,19 +3249,27 @@ static bool tc_cls_act_is_valid_access(int off, int size, { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, tc_index): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size, type, info); + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3374,98 +3342,108 @@ static bool sock_ops_is_valid_access(int off, int size, static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, len)); + bpf_target_off(struct sk_buff, len, 4, + target_size)); break; case offsetof(struct __sk_buff, protocol): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, protocol)); + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); break; case offsetof(struct __sk_buff, vlan_proto): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, vlan_proto)); + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); break; case offsetof(struct __sk_buff, priority): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, skb_iif)); + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); break; case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; case offsetof(struct __sk_buff, hash): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, hash)); + bpf_target_off(struct sk_buff, hash, 4, + target_size)); break; case offsetof(struct __sk_buff, mark): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + bpf_target_off(struct sk_buff, mark, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + bpf_target_off(struct sk_buff, mark, 4, + target_size)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, - si->src_reg, insn); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, - si->src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, queue_mapping, 2, + target_size)); + break; case offsetof(struct __sk_buff, vlan_present): - return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - si->dst_reg, si->src_reg, insn); - case offsetof(struct __sk_buff, vlan_tci): - return convert_skb_access(SKF_AD_VLAN_TAG, - si->dst_reg, si->src_reg, insn); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + if (si->off == offsetof(struct __sk_buff, vlan_tci)) { + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, + ~VLAN_TAG_PRESENT); + } else { + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + } + break; case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % @@ -3491,6 +3469,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, off); @@ -3516,14 +3495,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); #else if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); @@ -3534,10 +3513,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, napi_id)); + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else @@ -3552,7 +3530,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; @@ -3596,22 +3574,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; default: - return bpf_convert_ctx_access(type, si, insn_buf, prog); + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); } return insn - insn_buf; @@ -3620,7 +3598,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; @@ -3643,7 +3621,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, + u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; -- cgit v1.3-14-g43fede From 9780c0ab1a4e64ef6998c4d83f9df5be806a02dc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 2 Jul 2017 02:13:28 +0200 Subject: bpf: export whether tail call has jited owner We do export through fdinfo already whether a prog is JITed or not, given a program load can fail in case of either prog or tail call map has JITed property, but neither both are JITed or not JITed, we can facilitate error reporting in loaders like iproute2 through exporting owner_jited of tail call map. We already do export owner_prog_type through this facility, so parser can pick up both for comparison. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d4d47de75bba..18980472f5b0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -216,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_map *map = filp->private_data; const struct bpf_array *array; u32 owner_prog_type = 0; + u32 owner_jited = 0; if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); owner_prog_type = array->owner_prog_type; + owner_jited = array->owner_jited; } seq_printf(m, @@ -236,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->map_flags, map->pages * 1ULL << PAGE_SHIFT); - if (owner_prog_type) + if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", owner_prog_type); + seq_printf(m, "owner_jited:\t%u\n", + owner_jited); + } } #endif -- cgit v1.3-14-g43fede From 7bda4b40c5624c3f1c69227f8ebfd46a4b83f2ef Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:29 +0200 Subject: bpf: extend bpf_trace_printk to support %i Currently, bpf_trace_printk does not support common formatting symbol '%i' however vsprintf does and is what eventually gets called by bpf helper. If users are used to '%i' and currently make use of it, then bpf_trace_printk will just return with error without dumping anything to the trace pipe, so just add support for '%i' to the helper. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5c6d538dbf43..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) } /* - * limited trace_printk() - * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed + * Only limited trace_printk() conversion specifiers allowed: + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, i++; } - if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; fmt_cnt++; } -- cgit v1.3-14-g43fede From 43188702b3d98d2792969a3377a30957f05695e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 2 Jul 2017 02:13:30 +0200 Subject: bpf, verifier: add additional patterns to evaluate_reg_imm_alu Currently the verifier does not track imm across alu operations when the source register is of unknown type. This adds additional pattern matching to catch this and track imm. We've seen LLVM generating this pattern while working on cilium. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f820a044079..6a86723c5b64 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1657,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + u8 opcode = BPF_OP(insn->code); + s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); + + /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ + if (src_reg->imm > 0 && dst_reg->imm) { + switch (opcode) { + case BPF_ADD: + /* dreg += sreg + * where both have zero upper bits. Adding them + * can only result making one more bit non-zero + * in the larger value. + * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) + * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + dst_reg->imm--; + break; + case BPF_AND: + /* dreg &= sreg + * AND can not extend zero bits only shrink + * Ex. 0x00..00ffffff + * & 0x0f..ffffffff + * ---------------- + * 0x00..00ffffff + */ + dst_reg->imm = max(src_reg->imm, 63 - imm_log2); + break; + case BPF_OR: + /* dreg |= sreg + * OR can only extend zero bits + * Ex. 0x00..00ffffff + * | 0x0f..ffffffff + * ---------------- + * 0x0f..00ffffff + */ + dst_reg->imm = min(src_reg->imm, 63 - imm_log2); + break; + case BPF_SUB: + case BPF_MUL: + case BPF_RSH: + case BPF_LSH: + /* These may be flushed out later */ + default: + mark_reg_unknown_value(regs, insn->dst_reg); + } + } else { + mark_reg_unknown_value(regs, insn->dst_reg); + } + + dst_reg->type = UNKNOWN_VALUE; + return 0; +} + static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1666,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); u64 dst_imm = dst_reg->imm; + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) + return evaluate_reg_imm_alu_unknown(env, insn); + /* dst_reg->type == CONST_IMM here. Simulate execution of insns * containing ALU ops. Don't care about overflow or negative * values, just add/sub/... them; registers are in u64. -- cgit v1.3-14-g43fede From 3b9c08ae3dd44201b3a188aef34d6ddf73434015 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 4 Jul 2017 11:53:40 +0200 Subject: Revert "sched/cputime: Refactor the cputime_adjust() code" This reverts commit 72298e5c92c50edd8cb7cfda4519483ce65fa166. As Peter explains: > Argh, no... That code was perfectly fine. The new code otoh is > convoluted. > > The old code had the following form: > > if (exception1) > deal with exception1 > > if (execption2) > deal with exception2 > > do normal stuff > > Which is as simple and straight forward as it gets. > > The new code otoh reads like: > > if (!exception1) { > if (exception2) > deal with exception 2 > else > do normal stuff > } So restore the old form. Also fix the comment describing the logic, as it was confusing. Requested-by: Peter Zijlstra Cc: Gustavo A. R. Silva Cc: Frans Klaver Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Rik van Riel Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Wanpeng Li Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 67c70e287647..84a419bdf5aa 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; /* - * If either stime or both stime and utime are 0, assume all runtime is - * userspace. Once a task gets some ticks, the monotonicy code at - * 'update' will ensure things converge to the observed ratio. + * If either stime or utime are 0, assume all runtime is userspace. + * Once a task gets some ticks, the monotonicy code at 'update:' + * will ensure things converge to the observed ratio. */ - if (stime != 0) { - if (utime == 0) - stime = rtime; - else - stime = scale_stime(stime, rtime, stime + utime); + if (stime == 0) { + utime = rtime; + goto update; } + if (utime == 0) { + stime = rtime; + goto update; + } + + stime = scale_stime(stime, rtime, stime + utime); + +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. -- cgit v1.3-14-g43fede From e5682b4eecb2b73282853d0ef314d3164b986997 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 4 Jul 2017 11:25:15 +0200 Subject: genirq/debugfs: Fix build for !CONFIG_IRQ_DOMAIN Fix this build error: kernel/irq/internals.h:440:20: error: inlining failed in call to always_inline 'irq_domain_debugfs_init': function body not available kernel/irq/debugfs.c:202:2: note: called from here irq_domain_debugfs_init(root_dir); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Sebastian Ott Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1707041124000.1712@schleppi --- kernel/irq/internals.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9da14d125df4..dbfba9933ed2 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -437,7 +437,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) # ifdef CONFIG_IRQ_DOMAIN void irq_domain_debugfs_init(struct dentry *root); # else -static inline void irq_domain_debugfs_init(struct dentry *root); +static inline void irq_domain_debugfs_init(struct dentry *root) +{ +} # endif #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) -- cgit v1.3-14-g43fede From 2372a519f63829b8effcdde5f4564a7e036294f0 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Jul 2017 12:06:01 +0200 Subject: genirq: Force inlining of __irq_startup_managed to prevent build failure If CONFIG_SMP=n, and gcc (e.g. 4.1.2) decides not to inline __irq_startup_managed(), the build fails with: kernel/built-in.o: In function `irq_startup': (.text+0x38ed8): undefined reference to `irq_set_affinity_locked' Fix this by forcing inlining of __irq_startup_managed(). Fixes: 761ea388e8c4e3ac ("genirq: Handle managed irqs gracefully in irq_startup()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/1499162761-12398-1-git-send-email-geert@linux-m68k.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2e30d925a40d..aa5497dfb29e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -234,7 +234,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) return IRQ_STARTUP_MANAGED; } #else -static int +static __always_inline int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; -- cgit v1.3-14-g43fede From 3a90795e1e885167209056a1a90be965add30e25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:36 +0200 Subject: genirq: Move bus locking into __setup_irq() There is no point in having the irq_bus_lock() protection around all callers to __setup_irq(). Move it into __setup_irq(). This is also a preparatory patch for addressing the issues with the irq resource callbacks. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214343.960949031@linutronix.de --- kernel/irq/manage.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5c11c1730ba5..0934e02fa04e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + chip_bus_lock(desc); + /* * The following block of code has to be executed atomically */ @@ -1347,6 +1349,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); irq_setup_timings(desc, new); @@ -1378,6 +1381,8 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1417,9 +1422,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1674,9 +1677,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); @@ -1924,9 +1925,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) if (retval < 0) return retval; - chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); - chip_bus_sync_unlock(desc); if (retval) irq_chip_pm_put(&desc->irq_data); @@ -1980,9 +1979,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } - chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(desc); if (retval) { irq_chip_pm_put(&desc->irq_data); -- cgit v1.3-14-g43fede From 9114014cf4e6df0b22d764380ae1fc54f1a7a8b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:37 +0200 Subject: genirq: Add mutex to irq desc to serialize request/free_irq() The irq_request/release_resources() callbacks ar currently invoked under desc->lock with interrupts disabled. This is a source of problems on RT and conceptually not required. Add a seperate mutex to struct irq_desc which allows to serialize request/free_irq(), which can be used to move the resource functions out of the desc->lock held region. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.039220922@linutronix.de --- include/linux/irqdesc.h | 3 +++ kernel/irq/irqdesc.c | 1 + kernel/irq/manage.c | 8 ++++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index d425a3a09722..3e90a094798d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -3,6 +3,7 @@ #include #include +#include /* * Core internal functions to deal with irq descriptors @@ -45,6 +46,7 @@ struct pt_regs; * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs + * @request_mutex: mutex to protect request/free before locking desc->lock * @dir: /proc/irq/ procfs entry * @debugfs_file: dentry for the debugfs file * @name: flow handler name for /proc/interrupts output @@ -96,6 +98,7 @@ struct irq_desc { struct rcu_head rcu; struct kobject kobj; #endif + struct mutex request_mutex; int parent_irq; struct module *owner; const char *name; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 948b50e78549..906a67e58391 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + mutex_init(&desc->request_mutex); init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, affinity, owner); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0934e02fa04e..0139908b8d53 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1167,6 +1167,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + mutex_lock(&desc->request_mutex); + chip_bus_lock(desc); /* @@ -1350,6 +1352,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); irq_setup_timings(desc, new); @@ -1383,6 +1386,8 @@ out_unlock: chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); + out_thread: if (new->thread) { struct task_struct *t = new->thread; @@ -1446,6 +1451,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + mutex_lock(&desc->request_mutex); chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); @@ -1521,6 +1527,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + mutex_unlock(&desc->request_mutex); + irq_chip_pm_put(&desc->irq_data); module_put(desc->owner); kfree(action->secondary); -- cgit v1.3-14-g43fede From 46e48e257360f0845fe17089713cbad4db611e70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:38 +0200 Subject: genirq: Move irq resource handling out of spinlocked region Aside of being conceptually wrong, there is also an actual (hard to trigger and mostly theoretical) problem. CPU0 CPU1 free_irq(X) interrupt X spin_lock(desc->lock) wake irq thread() spin_unlock(desc->lock) spin_lock(desc->lock) remove action() shutdown_irq() release_resources() thread_handler() spin_unlock(desc->lock) access released resources. synchronize_irq() Move the release resources invocation after synchronize_irq() so it's guaranteed that the threaded handler has finished. Move the resource request call out of the desc->lock held region as well, so the invocation context is the same for both request and release. This solves the problems with those functions on RT as well. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.117028181@linutronix.de --- kernel/irq/manage.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0139908b8d53..3e693430bfe1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1168,6 +1168,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags &= ~IRQF_ONESHOT; mutex_lock(&desc->request_mutex); + if (!desc->action) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mutex; + } + } chip_bus_lock(desc); @@ -1271,13 +1279,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - ret = irq_request_resources(desc); - if (ret) { - pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", - new->name, irq, desc->irq_data.chip->name); - goto out_unlock; - } - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1386,6 +1387,10 @@ out_unlock: chip_bus_sync_unlock(desc); + if (!desc->action) + irq_release_resources(desc); + +out_mutex: mutex_unlock(&desc->request_mutex); out_thread: @@ -1484,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_release_resources(desc); irq_remove_timings(desc); } @@ -1527,6 +1531,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + if (!desc->action) + irq_release_resources(desc); + mutex_unlock(&desc->request_mutex); irq_chip_pm_put(&desc->irq_data); -- cgit v1.3-14-g43fede From 2343877fbda701599653e63f8dcc318aa1bf15ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jun 2017 23:33:39 +0200 Subject: genirq/timings: Move free timings out of spinlocked region No point to do memory management from a interrupt disabled spin locked region. Signed-off-by: Thomas Gleixner Reviewed-by: Marc Zyngier Cc: Daniel Lezcano Cc: Heiko Stuebner Cc: Julia Cartwright Cc: Linus Walleij Cc: Brian Norris Cc: Doug Anderson Cc: linux-rockchip@lists.infradead.org Cc: John Keeping Cc: linux-gpio@vger.kernel.org Link: http://lkml.kernel.org/r/20170629214344.196130646@linutronix.de --- kernel/irq/manage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3e693430bfe1..91e1f2390752 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1489,7 +1489,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc->action) { irq_settings_clr_disable_unlazy(desc); irq_shutdown(desc); - irq_remove_timings(desc); } #ifdef CONFIG_SMP @@ -1531,8 +1530,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } - if (!desc->action) + if (!desc->action) { irq_release_resources(desc); + irq_remove_timings(desc); + } mutex_unlock(&desc->request_mutex); -- cgit v1.3-14-g43fede From 1d0c6e593023ac5dafc2ea2b3f23d96f1c1f2fa2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Fri, 30 Jun 2017 10:22:14 +0530 Subject: PM / sleep: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 3802 624 32 4458 116a kernel/power/main.o File size After adding 'const': text data bss dec hex filename 3866 560 32 4458 116a kernel/power/main.o Signed-off-by: Arvind Yadav Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -705,7 +705,7 @@ static struct attribute * g[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = g, }; -- cgit v1.3-14-g43fede From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:07 +0200 Subject: vtime, sched/cputime: Remove vtime_account_user() It's an unnecessary function between vtime_user_exit() and account_user_time(). Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 9 +-------- kernel/sched/cputime.c | 12 ++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 0681fe25abeb..18b405e3cd93 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -67,19 +67,12 @@ static inline void vtime_account_system(struct task_struct *tsk) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_account_user(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk); - -static inline void vtime_user_exit(struct task_struct *tsk) -{ - vtime_account_user(tsk); -} - +extern void vtime_user_exit(struct task_struct *tsk); extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 84a419bdf5aa..5adc896d0f64 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -724,21 +724,21 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_account_user(struct task_struct *tsk) +void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); + __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_user_enter(struct task_struct *tsk) +void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); + tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) - __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; + account_user_time(tsk, get_vtime_delta(tsk)); write_seqcount_end(&tsk->vtime_seqcount); } -- cgit v1.3-14-g43fede From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:08 +0200 Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime Even though it doesn't have functional consequences, setting the task's new context state after we actually accounted the pending vtime from the old context state makes more sense from a review perspective. vtime_user_exit() is the only function that doesn't follow that rule and that can bug the reviewer for a little while until he realizes there is no reason for this special case. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5adc896d0f64..ab68927e8e94 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct *tsk) void vtime_user_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) account_user_time(tsk, get_vtime_delta(tsk)); + tsk->vtime_snap_whence = VTIME_SYS; write_seqcount_end(&tsk->vtime_seqcount); } -- cgit v1.3-14-g43fede From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:09 +0200 Subject: sched/cputime: Rename vtime fields The current "snapshot" based naming on vtime fields suggests we record some past event but that's a low level picture of their actual purpose which comes out blurry. The real point of these fields is to run a basic state machine that tracks down cputime entry while switching between contexts. So lets reflect that with more meaningful names. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 ++-- include/linux/sched.h | 4 ++-- kernel/fork.c | 4 ++-- kernel/sched/cputime.c | 30 +++++++++++++++--------------- 4 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e049526bc188..3d537331cd4e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -171,8 +171,8 @@ extern struct cred init_cred; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ - .vtime_snap = 0, \ - .vtime_snap_whence = VTIME_SYS, + .vtime_starttime = 0, \ + .vtime_state = VTIME_SYS, #else # define INIT_VTIME(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c4ca7433d9d..ff001646549e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -689,7 +689,7 @@ struct task_struct { struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_t vtime_seqcount; - unsigned long long vtime_snap; + unsigned long long vtime_starttime; enum { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, @@ -697,7 +697,7 @@ struct task_struct { VTIME_USER, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, - } vtime_snap_whence; + } vtime_state; #endif #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..83c4f9bf3e14 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1638,8 +1638,8 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime_seqcount); - p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_INACTIVE; + p->vtime_starttime = 0; + p->vtime_state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ab68927e8e94..8c64753067c5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_snap)) + if (time_before(now, (unsigned long)tsk->vtime_starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_snap); + return jiffies_to_nsecs(now - tsk->vtime_starttime); } static u64 get_vtime_delta(struct task_struct *tsk) @@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_snap); + delta = jiffies_to_nsecs(now - tsk->vtime_starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap = now; + WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); + tsk->vtime_starttime = now; return delta - other; } @@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struct *tsk) { /* * The flags must be updated under the lock with - * the vtime_snap flush and update. + * the vtime_starttime flush and update. * That enforces a right ordering and update sequence * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. @@ -776,12 +776,12 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_snap_whence = VTIME_INACTIVE; + prev->vtime_state = VTIME_INACTIVE; write_seqcount_end(&prev->vtime_seqcount); write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = jiffies; + current->vtime_state = VTIME_SYS; + current->vtime_starttime = jiffies; write_seqcount_end(¤t->vtime_seqcount); } @@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); - t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = jiffies; + t->vtime_state = VTIME_SYS; + t->vtime_starttime = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } @@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) + if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); } while (read_seqcount_retry(&t->vtime_seqcount, seq)); @@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) + if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_snap_whence == VTIME_SYS) + else if (t->vtime_state == VTIME_SYS) *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } -- cgit v1.3-14-g43fede From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 29 Jun 2017 19:15:10 +0200 Subject: sched/cputime: Move the vtime task fields to their own struct We are about to add vtime accumulation fields to the task struct. Let's avoid more bloatification and gather vtime information to their own struct. Tested-by: Luiz Capitulino Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 +-- include/linux/sched.h | 26 ++++++----- kernel/fork.c | 6 +-- kernel/sched/cputime.c | 112 ++++++++++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3d537331cd4e..a2f6707e9fc0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -170,9 +170,9 @@ extern struct cred init_cred; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN # define INIT_VTIME(tsk) \ - .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ - .vtime_starttime = 0, \ - .vtime_state = VTIME_SYS, + .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \ + .vtime.starttime = 0, \ + .vtime.state = VTIME_SYS, #else # define INIT_VTIME(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ff001646549e..eeff8a024f0c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -223,6 +223,21 @@ struct task_cputime { #define prof_exp stime #define sched_exp sum_exec_runtime +enum vtime_state { + /* Task is sleeping or running in a CPU with VTIME inactive: */ + VTIME_INACTIVE = 0, + /* Task runs in userspace in a CPU with VTIME active: */ + VTIME_USER, + /* Task runs in kernelspace in a CPU with VTIME active: */ + VTIME_SYS, +}; + +struct vtime { + seqcount_t seqcount; + unsigned long long starttime; + enum vtime_state state; +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ @@ -688,16 +703,7 @@ struct task_struct { u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_t vtime_seqcount; - unsigned long long vtime_starttime; - enum { - /* Task is sleeping or running in a CPU with VTIME inactive: */ - VTIME_INACTIVE = 0, - /* Task runs in userspace in a CPU with VTIME active: */ - VTIME_USER, - /* Task runs in kernelspace in a CPU with VTIME active: */ - VTIME_SYS, - } vtime_state; + struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/fork.c b/kernel/fork.c index 83c4f9bf3e14..d927ec11aa7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1637,9 +1637,9 @@ static __latent_entropy struct task_struct *copy_process( prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqcount_init(&p->vtime_seqcount); - p->vtime_starttime = 0; - p->vtime_state = VTIME_INACTIVE; + seqcount_init(&p->vtime.seqcount); + p->vtime.starttime = 0; + p->vtime.state = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8c64753067c5..9ee725edcbe0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static u64 vtime_delta(struct task_struct *tsk) +static u64 vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); - if (time_before(now, (unsigned long)tsk->vtime_starttime)) + if (time_before(now, (unsigned long)vtime->starttime)) return 0; - return jiffies_to_nsecs(now - tsk->vtime_starttime); + return jiffies_to_nsecs(now - vtime->starttime); } -static u64 get_vtime_delta(struct task_struct *tsk) +static u64 get_vtime_delta(struct vtime *vtime) { unsigned long now = READ_ONCE(jiffies); u64 delta, other; @@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_struct *tsk) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - tsk->vtime_starttime); + delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); - WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); - tsk->vtime_starttime = now; + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); + vtime->starttime = now; return delta - other; } static void __vtime_account_system(struct task_struct *tsk) { - account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); + account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); } void vtime_account_system(struct task_struct *tsk) { - if (!vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + if (!vtime_delta(vtime)) return; - write_seqcount_begin(&tsk->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); + vtime->state = VTIME_USER; + write_seqcount_end(&vtime->seqcount); } void vtime_user_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - account_user_time(tsk, get_vtime_delta(tsk)); - tsk->vtime_snap_whence = VTIME_SYS; - write_seqcount_end(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) + account_user_time(tsk, get_vtime_delta(vtime)); + vtime->state = VTIME_SYS; + write_seqcount_end(&vtime->seqcount); } void vtime_guest_enter(struct task_struct *tsk) { + struct vtime *vtime = &tsk->vtime; /* * The flags must be updated under the lock with * the vtime_starttime flush and update. @@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) + write_seqcount_begin(&vtime->seqcount); + if (vtime_delta(vtime)) __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqcount_begin(&tsk->vtime_seqcount); + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_seqcount_end(&tsk->vtime_seqcount); + write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(get_vtime_delta(tsk)); + account_idle_time(get_vtime_delta(&tsk->vtime)); } void arch_vtime_task_switch(struct task_struct *prev) { - write_seqcount_begin(&prev->vtime_seqcount); - prev->vtime_state = VTIME_INACTIVE; - write_seqcount_end(&prev->vtime_seqcount); + struct vtime *vtime = &prev->vtime; - write_seqcount_begin(¤t->vtime_seqcount); - current->vtime_state = VTIME_SYS; - current->vtime_starttime = jiffies; - write_seqcount_end(¤t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_INACTIVE; + write_seqcount_end(&vtime->seqcount); + + vtime = ¤t->vtime; + + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { + struct vtime *vtime = &t->vtime; unsigned long flags; local_irq_save(flags); - write_seqcount_begin(&t->vtime_seqcount); - t->vtime_state = VTIME_SYS; - t->vtime_starttime = jiffies; - write_seqcount_end(&t->vtime_seqcount); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; + vtime->starttime = jiffies; + write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } u64 task_gtime(struct task_struct *t) { + struct vtime *vtime = &t->vtime; unsigned int seq; u64 gtime; @@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t) return t->gtime; do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(t); + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + gtime += vtime_delta(vtime); - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); return gtime; } @@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t) */ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { - u64 delta; + struct vtime *vtime = &t->vtime; unsigned int seq; + u64 delta; if (!vtime_accounting_enabled()) { *utime = t->utime; @@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } do { - seq = read_seqcount_begin(&t->vtime_seqcount); + seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) + if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) continue; - delta = vtime_delta(t); + delta = vtime_delta(vtime); /* * Task runs either in user or kernel space, add pending nohz time to * the right place. */ - if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) *utime += delta; - else if (t->vtime_state == VTIME_SYS) + else if (vtime->state == VTIME_SYS) *stime += delta; - } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.3-14-g43fede From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 29 Jun 2017 19:15:11 +0200 Subject: sched/cputime: Accumulate vtime on top of nsec clocksource Currently the cputime source used by vtime is jiffies. When we cross a context boundary and jiffies have changed since the last snapshot, the pending cputime is accounted to the switching out context. This system works ok if the ticks are not aligned across CPUs. If they instead are aligned (ie: all fire at the same time) and the CPUs run in userspace, the jiffies change is only observed on tick exit and therefore the user cputime is accounted as system cputime. This is because the CPU that maintains timekeeping fires its tick at the same time as the others. It updates jiffies in the middle of the tick and the other CPUs see that update on IRQ exit: CPU 0 (timekeeper) CPU 1 ------------------- ------------- jiffies = N ... run in userspace for a jiffy tick entry tick entry (sees jiffies = N) set jiffies = N + 1 tick exit tick exit (sees jiffies = N + 1) account 1 jiffy as stime Fix this with using a nanosec clock source instead of jiffies. The cputime is then accumulated and flushed everytime the pending delta reaches a jiffy in order to mitigate the accounting overhead. [ fweisbec: changelog, rebase on struct vtime, field renames, add delta on cputime readers, keep idle vtime as-is (low overhead accounting), harmonize clock sources. ] Suggested-by: Thomas Gleixner Reported-by: Luiz Capitulino Tested-by: Luiz Capitulino Signed-off-by: Wanpeng Li Signed-off-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Wanpeng Li Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/cputime.c | 64 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index eeff8a024f0c..4818126c5153 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -236,6 +236,9 @@ struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; + u64 utime; + u64 stime; + u64 gtime; }; struct sched_info { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 9ee725edcbe0..6e3ea4ac1bda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static u64 vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); + unsigned long long clock; - if (time_before(now, (unsigned long)vtime->starttime)) + clock = sched_clock_cpu(smp_processor_id()); + if (clock < vtime->starttime) return 0; - return jiffies_to_nsecs(now - vtime->starttime); + return clock - vtime->starttime; } static u64 get_vtime_delta(struct vtime *vtime) { - unsigned long now = READ_ONCE(jiffies); - u64 delta, other; + u64 delta = vtime_delta(vtime); + u64 other; /* * Unlike tick based timing, vtime based timing never has lost @@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime *vtime) * elapsed time. Limit account_other_time to prevent rounding * errors from causing elapsed vtime to go negative. */ - delta = jiffies_to_nsecs(now - vtime->starttime); other = account_other_time(delta); WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); - vtime->starttime = now; + vtime->starttime += delta; return delta - other; } -static void __vtime_account_system(struct task_struct *tsk) +static void __vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { - account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); + vtime->stime += get_vtime_delta(vtime); + if (vtime->stime >= TICK_NSEC) { + account_system_time(tsk, irq_count(), vtime->stime); + vtime->stime = 0; + } +} + +static void vtime_account_guest(struct task_struct *tsk, + struct vtime *vtime) +{ + vtime->gtime += get_vtime_delta(vtime); + if (vtime->gtime >= TICK_NSEC) { + account_guest_time(tsk, vtime->gtime); + vtime->gtime = 0; + } } void vtime_account_system(struct task_struct *tsk) @@ -722,7 +737,11 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + /* We might have scheduled out from guest path */ + if (current->flags & PF_VCPU) + vtime_account_guest(tsk, vtime); + else + __vtime_account_system(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - account_user_time(tsk, get_vtime_delta(vtime)); + vtime->utime += get_vtime_delta(vtime); + if (vtime->utime >= TICK_NSEC) { + account_user_time(tsk, vtime->utime); + vtime->utime = 0; + } vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } @@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - if (vtime_delta(vtime)) - __vtime_account_system(tsk); + __vtime_account_system(tsk, vtime); current->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk); + vtime_account_guest(tsk, vtime); current->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } @@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(smp_processor_id()); write_seqcount_end(&vtime->seqcount); } @@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = jiffies; + vtime->starttime = sched_clock_cpu(cpu); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } @@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t) gtime = t->gtime; if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) - gtime += vtime_delta(vtime); + gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) * the right place. */ if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += delta; + *utime += vtime->utime + delta; else if (vtime->state == VTIME_SYS) - *stime += delta; + *stime += vtime->stime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.3-14-g43fede From a0c4acd2c220376b4e9690e75782d0c0afdaab9f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 16 Jun 2017 16:44:34 +0300 Subject: locking/rwsem-spinlock: Fix EINTR branch in __down_write_common() If a writer could been woken up, the above branch if (sem->count == 0) break; would have moved us to taking the sem. So, it's not the time to wake a writer now, and only readers are allowed now. Thus, 0 must be passed to __rwsem_do_wake(). Next, __rwsem_do_wake() wakes readers unconditionally. But we mustn't do that if the sem is owned by writer in the moment. Otherwise, writer and reader own the sem the same time, which leads to memory corruption in callers. rwsem-xadd.c does not need that, as: 1) the similar check is made lockless there, 2) in __rwsem_mark_wake::try_reader_grant we test, that sem is not owned by writer. Signed-off-by: Kirill Tkhai Acked-by: Peter Zijlstra Cc: Cc: Linus Torvalds Cc: Niklas Cassel Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 17fcbd590d0c "locking/rwsem: Fix down_write_killable() for CONFIG_RWSEM_GENERIC_SPINLOCK=y" Link: http://lkml.kernel.org/r/149762063282.19811.9129615532201147826.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) out_nolock: list_del(&waiter.list); - if (!list_empty(&sem->wait_list)) - __rwsem_do_wake(sem, 1); + if (!list_empty(&sem->wait_list) && sem->count >= 0) + __rwsem_do_wake(sem, 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); return -EINTR; -- cgit v1.3-14-g43fede From 69d71879d2cf67a381055f698a1d7def00dc4ed7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 Jul 2017 09:45:43 -0400 Subject: ftrace: Test for NULL iter->tr in regex for stack_trace_filter changes As writing into stack_trace_filter, the iter-tr is not set and is NULL. Check if it is NULL before dereferencing it in ftrace_regex_release(). Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f8c18f15b190..2953d558bbee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5043,7 +5043,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (!list_empty(&iter->tr->mod_trace)) + if (iter->tr && !list_empty(&iter->tr->mod_trace)) iter->hash->flags |= FTRACE_HASH_FL_MOD; } else orig_hash = &iter->ops->func_hash->notrace_hash; -- cgit v1.3-14-g43fede From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 7 Jun 2017 13:18:57 -0600 Subject: sched/fair: Fix load_balance() affinity redo path If load_balance() fails to migrate any tasks because all tasks were affined, load_balance() removes the source CPU from consideration and attempts to redo and balance among the new subset of CPUs. There is a bug in this code path where the algorithm considers all active CPUs in the system (minus the source that was just masked out). This is not valid for two reasons: some active CPUs may not be in the current scheduling domain and one of the active CPUs is dst_cpu. These CPUs should not be considered, as we cannot pull load from them. Instead of failing out of load_balance(), we may end up redoing the search with no valid CPUs and incorrectly concluding the domain is balanced. Additionally, if the group_imbalance flag was just set, it may also be incorrectly unset, thus the flag will not be seen by other CPUs in future load_balance() runs as that algorithm intends. Fix the check by removing CPUs not in the current domain and the dst_cpu from considertation, thus limiting the evaluation to valid remaining CPUs from which load might be migrated. Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Tested-by: Tyler Baicar Signed-off-by: Jeffrey Hugo Acked-by: Peter Zijlstra Cc: Austin Christ Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Timur Tabi Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 008c514dc241..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * our sched_group. We may want to revisit it if we couldn't * meet load balance goals by pulling other tasks on src_cpu. * - * Also avoid computing new_dst_cpu if we have already computed - * one in current iteration. + * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have + * already computed one in current iteration. */ - if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) + if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) return 0; /* Prevent to re-select dst_cpu via env's cpus */ @@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .tasks = LIST_HEAD_INIT(env.tasks), }; - /* - * For NEWLY_IDLE load_balancing, we don't need to consider - * other cpus in our group - */ - if (idle == CPU_NEWLY_IDLE) - env.dst_grpmask = NULL; - - cpumask_copy(cpus, cpu_active_mask); + cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); @@ -8151,7 +8144,15 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * Attempting to continue load balancing at the current + * sched_domain level only makes sense if there are + * active CPUs remaining as possible busiest CPUs to + * pull load from which are not contained within the + * destination group that is receiving any migrated + * load. + */ + if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) .src_cpu = busiest_rq->cpu, .src_rq = busiest_rq, .idle = CPU_IDLE, + /* + * can_migrate_task() doesn't need to compute new_dst_cpu + * for active balancing. Since we have CPU_IDLE, but no + * @dst_grpmask we need to make that test go away with lying + * about DST_PINNED. + */ + .flags = LBF_DST_PINNED, }; schedstat_inc(sd->alb_count); -- cgit v1.3-14-g43fede From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jul 2017 16:24:49 +0100 Subject: bpf: Implement show_options Implement the show_options superblock op for bpf as part of a bid to get rid of s_options and generic_show_options() to make it easier to implement a context-based mount where the mount options can be passed individually over a file descriptor. Signed-off-by: David Howells cc: Alexei Starovoitov cc: Daniel Borkmann cc: netdev@vger.kernel.org Signed-off-by: Al Viro --- kernel/bpf/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) bpf_any_put(inode->i_private, type); } +/* + * Display the mount options in /proc/mounts. + */ +static int bpf_show_options(struct seq_file *m, struct dentry *root) +{ + umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; + + if (mode != S_IRWXUGO) + seq_printf(m, ",mode=%o", mode); + return 0; +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .show_options = generic_show_options, + .show_options = bpf_show_options, .evict_inode = bpf_evict_inode, }; @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; int ret; - save_mount_options(sb, data); - ret = bpf_parse_options(data, &opts); if (ret) return ret; -- cgit v1.3-14-g43fede From 9cd4f1a4e7a858849e889a081a99adff83e08e4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Jul 2017 22:20:23 +0200 Subject: smp/hotplug: Move unparking of percpu threads to the control CPU Vikram reported the following backtrace: BUG: scheduling while atomic: swapper/7/0/0x00000002 CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.9.32-perf+ #680 schedule schedule_hrtimeout_range_clock schedule_hrtimeout wait_task_inactive __kthread_bind_mask __kthread_bind __kthread_unpark kthread_unpark cpuhp_online_idle cpu_startup_entry secondary_start_kernel He analyzed correctly that a parked cpu hotplug thread of an offlined CPU was still on the runqueue when the CPU came back online and tried to unpark it. This causes the thread which invoked kthread_unpark() to call wait_task_inactive() and subsequently schedule() with preemption disabled. His proposed workaround was to "make sure" that a parked thread has scheduled out when the CPU goes offline, so the situation cannot happen. But that's still wrong because the root cause is not the fact that the percpu thread is still on the runqueue and neither that preemption is disabled, which could be simply solved by enabling preemption before calling kthread_unpark(). The real issue is that the calling thread is the idle task of the upcoming CPU, which is not supposed to call anything which might sleep. The moron, who wrote that code, missed completely that kthread_unpark() might end up in schedule(). The solution is simpler than expected. The thread which controls the hotplug operation is waiting for the CPU to call complete() on the hotplug state completion. So the idle task of the upcoming CPU can set its state to CPUHP_AP_ONLINE_IDLE and invoke complete(). This in turn wakes the control task on a different CPU, which then can safely do the unpark and kick the now unparked hotplug thread of the upcoming CPU to complete the bringup to the final target state. Control CPU AP bringup_cpu(); __cpu_up() ------------> bringup_ap(); bringup_wait_for_ap() wait_for_completion(); cpuhp_online_idle(); <------------ complete(); unpark(AP->stopper); unpark(AP->hotplugthread); while(1) do_idle(); kick(AP->hotplugthread); wait_for_completion(); hotplug_thread() run_online_callbacks(); complete(); Fixes: 8df3e07e7f21 ("cpu/hotplug: Let upcoming cpu bring itself fully up") Reported-by: Vikram Mulukutla Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Sebastian Sewior Cc: Rusty Russell Cc: Tejun Heo Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707042218020.2131@nanos Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index b03a32595cfe..ab860453841d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -271,11 +271,25 @@ void cpu_hotplug_enable(void) EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); + static int bringup_wait_for_ap(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); + BUG_ON(!cpu_online(cpu)); + + /* Unpark the stopper thread and the hotplug thread of the target cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) { + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + } return st->result; } @@ -296,9 +310,7 @@ static int bringup_cpu(unsigned int cpu) irq_unlock_sparse(); if (ret) return ret; - ret = bringup_wait_for_ap(cpu); - BUG_ON(!cpu_online(cpu)); - return ret; + return bringup_wait_for_ap(cpu); } /* @@ -767,31 +779,20 @@ void notify_cpu_starting(unsigned int cpu) } /* - * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread and unpark the smpboot threads. If the target state is - * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the - * cpu further. + * Called from the idle task. Wake up the controlling task which brings the + * stopper and the hotplug thread of the upcoming CPU up and then delegates + * the rest of the online bringup to the hotplug thread. */ void cpuhp_online_idle(enum cpuhp_state state) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); - unsigned int cpu = smp_processor_id(); /* Happens for the boot cpu */ if (state != CPUHP_AP_ONLINE_IDLE) return; st->state = CPUHP_AP_ONLINE_IDLE; - - /* Unpark the stopper thread and the hotplug thread of this cpu */ - stop_machine_unpark(cpu); - kthread_unpark(st->thread); - - /* Should we go further up ? */ - if (st->target > CPUHP_AP_ONLINE_IDLE) - __cpuhp_kick_ap_work(st); - else - complete(&st->done); + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -- cgit v1.3-14-g43fede From 99c621d704cf1c4eb74c3c42e674edf3df64f92d Mon Sep 17 00:00:00 2001 From: Michael Sartain Date: Wed, 5 Jul 2017 22:07:15 -0600 Subject: tracing: Add saved_tgids file to show cached pid to tgid mappings Export the cached pid / tgid mappings in debugfs tracing saved_tgids file. This allows user apps to translate the pids from a trace to their respective thread group. Example saved_tgids file with pid / tgid values separated by ' ': # cat saved_tgids 1048 1048 1047 1047 7 7 1049 1047 1054 1047 1053 1047 Link: http://lkml.kernel.org/r/20170630004023.064965233@goodmis.org Link: http://lkml.kernel.org/r/20170706040713.unwkumbta5menygi@mikesart-cos Reviewed-by: Joel Fernandes Signed-off-by: Michael Sartain Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00e2e4169b1e..f079a8ca1117 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4688,6 +4688,76 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { + if (trace_find_tgid(*ptr)) + return ptr; + } + + return NULL; +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + if (!tgid_map) + return NULL; + + v = &tgid_map[0]; + while (l <= *pos) { + v = saved_tgids_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int pid = (int *)v - tgid_map; + + seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) { unsigned int *ptr = v; @@ -7920,6 +7990,9 @@ static __init int tracer_init_tracefs(void) trace_create_file("saved_cmdlines_size", 0644, d_tracer, NULL, &tracing_saved_cmdlines_size_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + NULL, &tracing_saved_tgids_fops); + trace_eval_init(); trace_create_eval_file(d_tracer); -- cgit v1.3-14-g43fede From c80081b9209713e0fe86d3def395a9fc66503c58 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 6 Jul 2017 14:29:04 +0200 Subject: genirq: Allow to pass the IRQF_TIMER flag with percpu irq request The irq timings infrastructure tracks when interrupts occur in order to statistically predict te next interrupt event. There is no point to track timer interrupts and try to predict them because the next expiration time is already known. This can be avoided via the IRQF_TIMER flag which is passed by timer drivers in request_irq(). It marks the interrupt as timer based which alloes to ignore these interrupts in the timings code. Per CPU interrupts which are requested via request_percpu_+irq() have no flag argument, so marking per cpu timer interrupts is not possible and they get tracked pointlessly. Add __request_percpu_irq() as a variant of request_percpu_irq() with a flags argument and make request_percpu_irq() an inline wrapper passing flags = 0. The flag parameter is restricted to IRQF_TIMER as all other IRQF_ flags make no sense for per cpu interrupts. The next step is to convert all existing users of request_percpu_irq() and then remove the wrapper and the underscores. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: nicolas.pitre@linaro.org Cc: vincent.guittot@linaro.org Cc: rafael@kernel.org Link: http://lkml.kernel.org/r/1499344144-3964-1-git-send-email-daniel.lezcano@linaro.org --- include/linux/interrupt.h | 11 ++++++++++- kernel/irq/manage.c | 15 ++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 37f8e354f564..5ac6e238555e 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -152,8 +152,17 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id); extern int __must_check +__request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *percpu_dev_id); + +static inline int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *percpu_dev_id); + const char *devname, void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, percpu_dev_id); +} extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91e1f2390752..5624b2dd6b58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1950,9 +1950,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) } /** - * request_percpu_irq - allocate a percpu interrupt line + * __request_percpu_irq - allocate a percpu interrupt line * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. + * @flags: Interrupt type flags (IRQF_TIMER only) * @devname: An ascii name for the claiming device * @dev_id: A percpu cookie passed back to the handler function * @@ -1965,8 +1966,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) * the handler gets called with the interrupted CPU's instance of * that variable. */ -int request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev_id) +int __request_percpu_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *devname, + void __percpu *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -1980,12 +1982,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, !irq_settings_is_per_cpu_devid(desc)) return -EINVAL; + if (flags && flags != IRQF_TIMER) + return -EINVAL; + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); if (!action) return -ENOMEM; action->handler = handler; - action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; + action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; action->name = devname; action->percpu_dev_id = dev_id; @@ -2004,7 +2009,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, return retval; } -EXPORT_SYMBOL_GPL(request_percpu_irq); +EXPORT_SYMBOL_GPL(__request_percpu_irq); /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. -- cgit v1.3-14-g43fede From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 6 Jul 2017 15:35:31 -0700 Subject: kernel/extable.c: mark core_kernel_text notrace core_kernel_text is used by MIPS in its function graph trace processing, so having this method traced leads to an infinite set of recursive calls such as: Call Trace: ftrace_return_to_handler+0x50/0x128 core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 return_to_handler+0x10/0x30 return_to_handler+0x0/0x30 return_to_handler+0x0/0x30 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 (...) Mark the function notrace to avoid it being traced. Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Meyer Cc: Ingo Molnar Cc: Steven Rostedt Cc: Daniel Borkmann Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..223df4a328a4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) -- cgit v1.3-14-g43fede From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:55 -0700 Subject: kernel/power/snapshot.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7708e319941..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,15 +30,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif #include "power.h" -- cgit v1.3-14-g43fede From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:58 -0700 Subject: kernel/module.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. The usages of set_memory_xx() are still guarded by CONFIG_STRICT_MODULE_RWX. Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d7eb41d772c4..8f883d86cedc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,9 +49,7 @@ #include #include #include -#ifdef CONFIG_STRICT_MODULE_RWX -#include -#endif +#include #include #include #include -- cgit v1.3-14-g43fede From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. [richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +- arch/powerpc/mm/mem.c | 10 +- arch/s390/mm/init.c | 30 +----- arch/sh/mm/init.c | 8 +- arch/x86/mm/init_32.c | 5 +- arch/x86/mm/init_64.c | 9 +- drivers/base/memory.c | 52 ++++++----- include/linux/memory_hotplug.h | 13 +-- include/linux/mmzone.h | 16 ++++ kernel/memremap.c | 4 + mm/memory_hotplug.c | 201 +++++++++++++++++++++++++---------------- mm/sparse.c | 3 +- 12 files changed, 185 insertions(+), 175 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. - */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1e884d82af6f..b86fda30ce62 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; - - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf7e08c5a8..abc1641011f2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +/* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone) || + start_pfn + nr_pages <= zone->zone_start_pfn) + return false; + + return true; +} + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2ebe9ad7f6c..9438ffe24cb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -433,25 +433,6 @@ out_fail: return -1; } -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn, bool want_memblock) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; + int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; + if (!pfn_valid(pfn)) + continue; - if (ret < 0) - return ret; + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); + } if (!want_memblock) return 0; @@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, bool want_memblock) { unsigned long i; @@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(normal_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; - } +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP inherits the current zone which is + * ZONE_NORMAL by default but we might be within ZONE_MOVABLE + * already. + */ + if (zone_intersects(movable_zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(pfn_to_nid(pfn))) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) + if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) diff --git a/mm/sparse.c b/mm/sparse.c index 9d7fd666015e..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; -- cgit v1.3-14-g43fede From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- kernel/memremap.c | 2 +- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4263fa6f2ab4..9b04cf5ea813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; -- cgit v1.3-14-g43fede From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:32 -0700 Subject: kernel/exit.c: don't include unused userfaultfd_k.h Commit dd0db88d8094 ("userfaultfd: non-cooperative: rollback userfaultfd_exit") removed userfaultfd callback from exit() which makes the include of unnecessary. Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b0cc86a2d00b..2bbc23273e2f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit v1.3-14-g43fede From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:11 -0700 Subject: mm: update callers to use HASH_ZERO flag Update dcache, inode, pid, mountpoint, and mount hash tables to use HASH_ZERO, and remove initialization after allocations. In case of places where HASH_EARLY was used such as in __pv_init_lock_hash the zeroed hash table was already assumed, because memblock zeroes the memory. CPU: SPARC M6, Memory: 7T Before fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 11.798s After fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 3.198s CPU: Intel Xeon E5-2630, Memory: 2.2T: Before fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.245s After fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.244s Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 18 ++++-------------- fs/inode.c | 14 ++------------ fs/namespace.c | 10 ++-------- kernel/locking/qspinlock_paravirt.h | 3 ++- kernel/pid.c | 7 ++----- 5 files changed, 12 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index a9f995f6859e..a140fe1dbb1a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3546,8 +3546,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3559,24 +3557,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3590,14 +3583,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/inode.c b/fs/inode.c index ab3b9a795c0b..5cbc8e6e9390 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1915,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -1928,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1959,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/namespace.c b/fs/namespace.c index f70914a859a4..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3248,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) -- cgit v1.3-14-g43fede From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:06 -0700 Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +-- include/uapi/linux/mempolicy.h | 8 ---- kernel/cgroup/cpuset.c | 4 +- mm/mempolicy.c | 102 ++++++++--------------------------------- 4 files changed, 21 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ecb6cbeede5a..3a58b4be1b0c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, @@ -260,8 +259,7 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new, - enum mpol_rebind_step step) + const nodemask_t *new) { } diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 9cd8b21dddbe..2a4d89508fec 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,13 +24,6 @@ enum { MPOL_MAX, /* always last member of enum */ }; -enum mpol_rebind_step { - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ - MPOL_REBIND_NSTEP, -}; - /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -65,7 +58,6 @@ enum mpol_rebind_step { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..5fd1bdbaa381 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; if (need_loop) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c60807625fd5..047181452040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. */ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; -- cgit v1.3-14-g43fede From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:09 -0700 Subject: mm, cpuset: always use seqlock when changing task's nodemask When updating task's mems_allowed and rebinding its mempolicy due to cpuset's mems being changed, we currently only take the seqlock for writing when either the task has a mempolicy, or the new mems has no intersection with the old mems. This should be enough to prevent a parallel allocation seeing no available nodes, but the optimization is IMHO unnecessary (cpuset updates should not be frequent), and we still potentially risk issues if the intersection of new and old nodes has limited amount of free/reclaimable memory. Let's just use the seqlock for all tasks. Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5fd1bdbaa381..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } -- cgit v1.3-14-g43fede From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:49 -0700 Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages The kmem-specific functions do the same thing. Switch and drop. Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 17 ----------------- kernel/fork.c | 8 ++++---- mm/slab.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa506ae61d66..5a72d8377942 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -929,19 +929,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/** - * memcg_kmem_update_page_stat - update kmem page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - */ -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ - if (memcg_kmem_enabled() && page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); -} - #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -964,10 +951,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ -} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..aa01b810c0bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } diff --git a/mm/slab.h b/mm/slab.h index 69f0579cb5aa..7b84e3839dfe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -285,10 +285,10 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) return ret; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << order); return 0; } @@ -298,10 +298,10 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, if (!memcg_kmem_enabled()) return; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit v1.3-14-g43fede From f610c9d68b1a47f539b7764f4b5ce07d32fb9ae1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Jul 2017 08:57:57 +0200 Subject: genirq/debugfs: Remove redundant NULL pointer check debugfs_remove() can be called with a NULL pointer. Fixes: 087cdfb662ae5 ("genirq/debugfs: Add proper debugfs interface") Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 14fe862aa2e3..ed47688b8e79 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1667,8 +1667,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) static void debugfs_remove_domain_dir(struct irq_domain *d) { - if (d->debugfs_file) - debugfs_remove(d->debugfs_file); + debugfs_remove(d->debugfs_file); } void __init irq_domain_debugfs_init(struct dentry *root) -- cgit v1.3-14-g43fede From c5c601c4295f89368f4a304cb3ae4aebdf80db22 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 7 Jul 2017 09:39:59 +0100 Subject: irqdomain: Allow ACPI device nodes to be used as irqdomain identifiers A number of irqchip implementations are (ab)using the irqdomain allocator by passing a fwnode that is neither a FWNODE_OF or a FWNODE_IRQCHIP. This is pretty bad, but it also feels pretty crap to force these drivers to allocate their own irqchip_fwid when they already have a proper fwnode. Instead, let's teach the irqdomain allocator about ACPI device nodes, and add some lovely name generation code... Tested on an arm64 D05 system. Reported-and-tested-by: John Garry Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Agustin Vega-Frias Cc: Ma Jun Cc: Hanjun Guo Link: http://lkml.kernel.org/r/20170707083959.10349-1-marc.zyngier@arm.com --- kernel/irq/irqdomain.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ed47688b8e79..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "irq: " fmt +#include #include #include #include @@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain->name = fwid->name; break; } +#ifdef CONFIG_ACPI + } else if (is_acpi_device_node(fwnode)) { + struct acpi_buffer buf = { + .length = ACPI_ALLOCATE_BUFFER, + }; + acpi_handle handle; + + handle = acpi_device_handle(to_acpi_device_node(fwnode)); + if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { + domain->name = buf.pointer; + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } + + domain->fwnode = fwnode; +#endif } else if (of_node) { char *name; -- cgit v1.3-14-g43fede From eaf260ac04d9b4cf9f458d5c97555bfff2da526e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:21 -0700 Subject: tracing: Treat recording comm for idle task as a success Currently we stop recording comm for non-idle tasks when switching from/to idle task since we treat that as a record failure. Fix that by treat recording of comm for idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-1-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f079a8ca1117..6722d86f2af5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1916,7 +1916,11 @@ static int trace_save_cmdline(struct task_struct *tsk) { unsigned pid, idx; - if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; /* -- cgit v1.3-14-g43fede From bd45d34d25720a820021c8ea45de5cd607eace64 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:22 -0700 Subject: tracing: Treat recording tgid for idle task as a success Currently we stop recording tgid for non-idle tasks when switching from/to idle task since we treat that as a record failure. Fix that by treat recording of tgid for idle task as a success. Link: http://lkml.kernel.org/r/20170706230023.17942-2-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Reported-by: Michael Sartain Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6722d86f2af5..aee11e3a394f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2006,7 +2006,11 @@ int trace_find_tgid(int pid) static int trace_save_tgid(struct task_struct *tsk) { - if (unlikely(!tgid_map || !tsk->pid || tsk->pid > PID_MAX_DEFAULT)) + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) return 0; tgid_map[tsk->pid] = tsk->tgid; -- cgit v1.3-14-g43fede From 29b1a8ad7df4528b862a79e3d5fb0936f4d199c7 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 6 Jul 2017 16:00:23 -0700 Subject: tracing: Attempt to record other information even if some fail In recent patches where we record comm and tgid at the same time, we skip continuing to record if any fail. Fix that by trying to record as many things as we can even if some couldn't be recorded. If any information isn't recorded, then we don't set trace_taskinfo_save as before. Link: http://lkml.kernel.org/r/20170706230023.17942-3-joelaf@google.com Cc: kernel-team@android.com Cc: Ingo Molnar Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aee11e3a394f..92af8fd1429b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,11 +2037,20 @@ static bool tracing_record_taskinfo_skip(int flags) */ void tracing_record_taskinfo(struct task_struct *task, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && !trace_save_cmdline(task)) - return; - if ((flags & TRACE_RECORD_TGID) && !trace_save_tgid(task)) + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); @@ -2058,15 +2067,22 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) void tracing_record_taskinfo_sched_switch(struct task_struct *prev, struct task_struct *next, int flags) { + bool done; + if (tracing_record_taskinfo_skip(flags)) return; - if ((flags & TRACE_RECORD_CMDLINE) && - (!trace_save_cmdline(prev) || !trace_save_cmdline(next))) - return; + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - if ((flags & TRACE_RECORD_TGID) && - (!trace_save_tgid(prev) || !trace_save_tgid(next))) + /* If recording any information failed, retry again soon. */ + if (!done) return; __this_cpu_write(trace_taskinfo_save, false); -- cgit v1.3-14-g43fede From 5671360f29c68d9079914438f6a0109ef62f82a8 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Sat, 8 Jul 2017 04:56:58 +0900 Subject: locking/qspinlock: Explicitly include asm/prefetch.h In architectures that use qspinlock, like x86, prefetch is loaded indirectly via the asm/qspinlock.h include. On other architectures, like OpenRISC, which may want to use asm-generic/qspinlock.h the built will fail without the asm/prefetch.h include. Fix this by including directly. Signed-off-by: Stafford Horne Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170707195658.23840-1-shorne@gmail.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include -- cgit v1.3-14-g43fede From 659b957f20c78fd470083c80af5e79eedfb39e5b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:24 +0530 Subject: kprobes: Rename [arch_]function_offset_within_entry() to [arch_]kprobe_on_func_entry() Rename function_offset_within_entry() to scope it to kprobe namespace by using kprobe_ prefix, and to also simplify it. Suggested-by: Ingo Molnar Suggested-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3aa6c7e2e4fb6e00f3c24fa306496a66edb558ea.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/kprobes.c | 2 +- include/linux/kprobes.h | 4 ++-- kernel/kprobes.c | 8 ++++---- kernel/trace/trace_kprobe.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb0ed0a..586508e949f0 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -221,7 +221,7 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -bool arch_function_offset_within_entry(unsigned long offset) +bool arch_kprobe_on_func_entry(unsigned long offset) { #ifdef PPC64_ELF_ABI_v2 #ifdef CONFIG_KPROBES_ON_FTRACE diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 541df0b5b815..bd2684700b74 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,8 +267,8 @@ extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); -extern bool arch_function_offset_within_entry(unsigned long offset); -extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); +extern bool arch_kprobe_on_func_entry(unsigned long offset); +extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6756d750b31b..a519219169fd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1888,12 +1888,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); -bool __weak arch_function_offset_within_entry(unsigned long offset) +bool __weak arch_kprobe_on_func_entry(unsigned long offset) { return !offset; } -bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); @@ -1901,7 +1901,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign return false; if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || - !arch_function_offset_within_entry(offset)) + !arch_kprobe_on_func_entry(offset)) return false; return true; @@ -1914,7 +1914,7 @@ int register_kretprobe(struct kretprobe *rp) int i; void *addr; - if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) return -EINVAL; if (kretprobe_blacklist_size) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b53c8d369163..2c5221819be5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -720,7 +720,7 @@ static int create_trace_kprobe(int argc, char **argv) return ret; } if (offset && is_return && - !function_offset_within_entry(NULL, symbol, offset)) { + !kprobe_on_func_entry(NULL, symbol, offset)) { pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } -- cgit v1.3-14-g43fede From 0f73ff80b751b39ff539a550e65c5bd131ff0316 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:25 +0530 Subject: kprobes: Simplify register_jprobes() Re-factor jprobe registration functions as the current version is getting too unwieldy. Move the actual jprobe registration to register_jprobe() and re-organize code accordingly. Suggested-by: Ingo Molnar Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/089cae4bfe73767f765291ee0e6fb0c3d240e5f1.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a519219169fd..db3cd3e60bdd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) int register_jprobes(struct jprobe **jps, int num) { - struct jprobe *jp; int ret = 0, i; if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { - unsigned long addr, offset; - jp = jps[i]; - addr = arch_deref_entry_point(jp->entry); - - /* Verify probepoint is a function entry point */ - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && - offset == 0) { - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); - } else - ret = -EINVAL; + ret = register_jprobe(jps[i]); if (ret < 0) { if (i > 0) @@ -1796,13 +1785,26 @@ int register_jprobes(struct jprobe **jps, int num) break; } } + return ret; } EXPORT_SYMBOL_GPL(register_jprobes); int register_jprobe(struct jprobe *jp) { - return register_jprobes(&jp, 1); + unsigned long addr, offset; + struct kprobe *kp = &jp->kp; + + /* Verify probepoint is a function entry point */ + addr = arch_deref_entry_point(jp->entry); + + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) { + kp->pre_handler = setjmp_pre_handler; + kp->break_handler = longjmp_break_handler; + return register_kprobe(kp); + } + + return -EINVAL; } EXPORT_SYMBOL_GPL(register_jprobe); -- cgit v1.3-14-g43fede From dbf580623d5fee785218d1a47a2bcdf36d85c0e9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 7 Jul 2017 22:37:26 +0530 Subject: kprobes: Ensure that jprobe probepoints are at function entry Similar to commit 90ec5e89e393c ("kretprobes: Ensure probe location is at function entry"), ensure that the jprobe probepoint is at function entry. Signed-off-by: Naveen N. Rao Cc: Ananth N Mavinakayanahalli Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a4525af6c5a42df385efa31251246cf7cca73598.1499443367.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index db3cd3e60bdd..a1606a4224e1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1795,10 +1795,14 @@ int register_jprobe(struct jprobe *jp) unsigned long addr, offset; struct kprobe *kp = &jp->kp; - /* Verify probepoint is a function entry point */ + /* + * Verify probepoint as well as the jprobe handler are + * valid function entry points. + */ addr = arch_deref_entry_point(jp->entry); - if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0) { + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && + kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { kp->pre_handler = setjmp_pre_handler; kp->break_handler = longjmp_break_handler; return register_kprobe(kp); -- cgit v1.3-14-g43fede From 634a81609561f05266e1f625b6f2567c2e0b0419 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Jul 2017 11:26:39 -0400 Subject: fix waitid(2) breakage We lose the distinction between "found a PID" and "nothing, but that's not an error" a bit too early in waitid(). Easily fixed, fortunately... Reported-by: Markus Trippelsdorf Fixes: 67d7ddded322 ("waitid(2): leave copyout of siginfo to syscall itself") Tested-by: Markus Trippelsdorf Signed-off-by: Al Viro --- kernel/exit.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 2bbc23273e2f..608c9775a37b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1590,9 +1590,6 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, wo.wo_rusage = ru; ret = do_wait(&wo); - if (ret > 0) - ret = 0; - put_pid(pid); return ret; } @@ -1603,6 +1600,11 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) @@ -1612,7 +1614,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); @@ -1714,6 +1716,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); + int signo = 0; + if (err > 0) { + signo = SIGCHLD; + err = 0; + } if (!err && uru) { /* kernel_waitid() overwrites everything in ru */ @@ -1729,7 +1736,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, return err; user_access_begin(); - unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault); + unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user((short)info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); -- cgit v1.3-14-g43fede From fca18a47cf3eb8425ec19c2dfc374f3d04f5219f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Sat, 8 Jul 2017 00:27:30 +0530 Subject: trace/kprobes: Sanitize derived event names When we derive event names, convert some expected symbols (such as ':' used to specify module:name and '.' present in some symbols) into underscores so that the event name is not rejected. Before this patch: # echo 'p kobject_example:foo_store' > kprobe_events trace_kprobe: Failed to allocate trace_probe.(-22) -sh: write error: Invalid argument After this patch: # echo 'p kobject_example:foo_store' > kprobe_events # cat kprobe_events p:kprobes/p_kobject_example_foo_store_0 kobject_example:foo_store Link: http://lkml.kernel.org/r/66c189e09e71361aba91dd4a5bd146a1b62a7a51.1499453040.git.naveen.n.rao@linux.vnet.ibm.com Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..44fd819aa33d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + static int create_trace_kprobe(int argc, char **argv) { /* @@ -740,6 +748,7 @@ static int create_trace_kprobe(int argc, char **argv) else snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); event = buf; } tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, -- cgit v1.3-14-g43fede From 1860033237d4be09c5d7382585f0c7229367a534 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 10 Jul 2017 15:48:02 -0700 Subject: mm: make PR_SET_THP_DISABLE immediately active PR_SET_THP_DISABLE has a rather subtle semantic. It doesn't affect any existing mapping because it only updated mm->def_flags which is a template for new mappings. The mappings created after prctl(PR_SET_THP_DISABLE) have VM_NOHUGEPAGE flag set. This can be quite surprising for all those applications which do not do prctl(); fork() & exec() and want to control their own THP behavior. Another usecase when the immediate semantic of the prctl might be useful is a combination of pre- and post-copy migration of containers with CRIU. In this case CRIU populates a part of a memory region with data that was saved during the pre-copy stage. Afterwards, the region is registered with userfaultfd and CRIU expects to get page faults for the parts of the region that were not yet populated. However, khugepaged collapses the pages and the expected page faults do not occur. In more general case, the prctl(PR_SET_THP_DISABLE) could be used as a temporary mechanism for enabling/disabling THP process wide. Implementation wise, a new MMF_DISABLE_THP flag is added. This flag is tested when decision whether to use huge pages is taken either during page fault of at the time of THP collapse. It should be noted, that the new implementation makes PR_SET_THP_DISABLE master override to any per-VMA setting, which was not the case previously. Fixes: a0715cc22601 ("mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE") Link: http://lkml.kernel.org/r/1496415802-30944-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: "Kirill A. Shutemov" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 3 ++- include/linux/sched/coredump.h | 5 ++++- kernel/sys.c | 6 +++--- mm/khugepaged.c | 3 ++- mm/shmem.c | 8 +++++--- 6 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d3b3e8fcc717..40d7b7dd2653 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,6 +92,7 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); (1<vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !test_bit(MMF_DISABLE_THP, &(__vma)->vm_mm->flags) && \ !is_vma_temporary_stack(__vma)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 5d9a400af509..f0d7335336cd 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma, if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && - !(vm_flags & VM_NOHUGEPAGE)) + !(vm_flags & VM_NOHUGEPAGE) && + !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 69eedcef8f03..98ae0d05aa32 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 47d901586b4e..73fc0af147d0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (down_write_killable(&me->mm->mmap_sem)) return -EINTR; if (arg2) - me->mm->def_flags |= VM_NOHUGEPAGE; + set_bit(MMF_DISABLE_THP, &me->mm->flags); else - me->mm->def_flags &= ~VM_NOHUGEPAGE; + clear_bit(MMF_DISABLE_THP, &me->mm->flags); up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/shmem.c b/mm/shmem.c index 9418f5a9bc46..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); -- cgit v1.3-14-g43fede From 9dcdcea11491f6eee65bd1b352293ca01e4b7997 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Mon, 10 Jul 2017 15:51:14 -0700 Subject: kernel/ksysfs.c: constify attribute_group structures. attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 1120 544 16 1680 690 kernel/ksysfs.o File size After adding 'const': text data bss dec hex filename 1160 480 16 1656 678 kernel/ksysfs.o Link: http://lkml.kernel.org/r/aa224b3cc923fdbb3edd0c41b2c639c85408c9e8.1498737347.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Kees Cook Cc: Russell King Cc: Dave Young Cc: Hari Bathini Cc: Petr Tesarik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..df1a9aa602a0 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { NULL }; -static struct attribute_group kernel_attr_group = { +static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; -- cgit v1.3-14-g43fede From b7b2562f7252878e18de60c24f320052076f9de8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 10 Jul 2017 15:51:17 -0700 Subject: kernel/groups.c: use sort library function setgroups is not exactly a hot path, so we might as well use the library function instead of open-coding the sorting. Saves ~150 bytes. Link: http://lkml.kernel.org/r/1497301378-22739-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, return 0; } -/* a simple Shell sort */ +static int gid_cmp(const void *_a, const void *_b) +{ + kgid_t a = *(kgid_t *)_a; + kgid_t b = *(kgid_t *)_b; + + return gid_gt(a, b) - gid_lt(a, b); +} + static void groups_sort(struct group_info *group_info) { - int base, max, stride; - int gidsetsize = group_info->ngroups; - - for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) - ; /* nothing */ - stride /= 3; - - while (stride) { - max = gidsetsize - stride; - for (base = 0; base < max; base++) { - int left = base; - int right = left + stride; - kgid_t tmp = group_info->gid[right]; - - while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { - group_info->gid[right] = group_info->gid[left]; - right = left; - left -= stride; - } - group_info->gid[right] = tmp; - } - stride /= 3; - } + sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), + gid_cmp, NULL); } /* a simple bsearch */ -- cgit v1.3-14-g43fede From 63b23e2cbc8e80de3e40184ecb2c3bfb705776fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jul 2017 15:51:20 -0700 Subject: kernel/kallsyms.c: replace all_var with IS_ENABLED(CONFIG_KALLSYMS_ALL) 'all_var' looks like a variable, but is actually a macro. Use IS_ENABLED(CONFIG_KALLSYMS_ALL) for clarification. Link: http://lkml.kernel.org/r/1497577591-3434-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -28,12 +28,6 @@ #include -#ifdef CONFIG_KALLSYMS_ALL -#define all_var 1 -#else -#define all_var 0 -#endif - /* * These will be re-linked against their real values * during the second link stage. @@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) static int is_ksym_addr(unsigned long addr) { - if (all_var) + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; - else if (all_var) + else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; -- cgit v1.3-14-g43fede From a94c33dd1f677d16c4f1a162b4b3e9eba1b07c24 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 10 Jul 2017 15:51:58 -0700 Subject: lib/extable.c: use bsearch() library function in search_extable() [thomas@m3y3r.de: v3: fix arch specific implementations] Link: http://lkml.kernel.org/r/1497890858.12931.7.camel@m3y3r.de Signed-off-by: Thomas Meyer Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/module.c | 3 ++- arch/mips/kernel/traps.c | 3 ++- arch/sh/mm/extable_64.c | 34 ++++++++++++++++++---------------- arch/sparc/mm/extable.c | 28 ++++++++++++++-------------- include/linux/extable.h | 5 +++-- kernel/extable.c | 3 ++- kernel/module.c | 2 +- lib/extable.c | 41 +++++++++++++++++++++-------------------- 8 files changed, 63 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 94627a3a6a0d..50c020c47e54 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr) spin_lock_irqsave(&dbe_lock, flags); list_for_each_entry(dbe, &dbe_list, dbe_list) { - e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr); + e = search_extable(dbe->dbe_start, + dbe->dbe_end - dbe->dbe_start, addr); if (e) break; } diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 38dfa27730ff..b68b4d0726d3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr); + e = search_extable(__start___dbe_table, + __stop___dbe_table - __start___dbe_table, addr); if (!e) e = search_module_dbetables(addr); return e; diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index b90cdfad2c78..7a3b4d33d2e7 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -10,6 +10,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#include #include #include #include @@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long return NULL; } +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > _elt->insn) + return 1; + if (_key < _elt->insn) + return -1; + return 0; +} + /* Simple binary search */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { const struct exception_table_entry *mid; @@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first, if (mid) return mid; - while (first <= last) { - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } int fixup_exception(struct pt_regs *regs) diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index db214e9931d9..2422511dc8c5 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start, /* Caller knows they are in a range if ret->fixup == 0 */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *start, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - const struct exception_table_entry *walk; + int i; /* Single insn entries are encoded as: * word 1: insn address @@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start, */ /* 1. Try to find an exact match. */ - for (walk = start; walk <= last; walk++) { - if (walk->fixup == 0) { + for (i = 0; i < num; i++) { + if (base[i].fixup == 0) { /* A range entry, skip both parts. */ - walk++; + i++; continue; } /* A deleted entry; see trim_init_extable */ - if (walk->fixup == -1) + if (base[i].fixup == -1) continue; - if (walk->insn == value) - return walk; + if (base[i].insn == value) + return &base[i]; } /* 2. Try to find a range match. */ - for (walk = start; walk <= (last - 1); walk++) { - if (walk->fixup) + for (i = 0; i < (num - 1); i++) { + if (base[i].fixup) continue; - if (walk[0].insn <= value && walk[1].insn > value) - return walk; + if (base[i].insn <= value && base[i + 1].insn > value) + return &base[i]; - walk++; + i++; } return NULL; diff --git a/include/linux/extable.h b/include/linux/extable.h index 7effea4b257d..28addad0dda7 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -2,13 +2,14 @@ #define _LINUX_EXTABLE_H #include /* for NULL */ +#include struct module; struct exception_table_entry; const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value); void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish); diff --git a/kernel/extable.c b/kernel/extable.c index 223df4a328a4..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, __stop___ex_table-1, addr); + e = search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); if (!e) e = search_module_extables(addr); return e; diff --git a/kernel/module.c b/kernel/module.c index b3dbdde82e80..b0f92a365140 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) goto out; e = search_extable(mod->extable, - mod->extable + mod->num_exentries - 1, + mod->num_exentries, addr); out: preempt_enable(); diff --git a/lib/extable.c b/lib/extable.c index 62968daa66a9..f54996fdd0b8 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size) * This is used both for the kernel exception table and for * the exception tables of modules that get loaded. */ -static int cmp_ex(const void *a, const void *b) +static int cmp_ex_sort(const void *a, const void *b) { const struct exception_table_entry *x = a, *y = b; @@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) { sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); + cmp_ex_sort, swap_ex); } #ifdef CONFIG_MODULES @@ -93,6 +94,20 @@ void trim_init_extable(struct module *m) #endif /* !ARCH_HAS_SORT_EXTABLE */ #ifndef ARCH_HAS_SEARCH_EXTABLE + +static int cmp_ex_search(const void *key, const void *elt) +{ + const struct exception_table_entry *_elt = elt; + unsigned long _key = *(unsigned long *)key; + + /* avoid overflow */ + if (_key > ex_to_insn(_elt)) + return 1; + if (_key < ex_to_insn(_elt)) + return -1; + return 0; +} + /* * Search one exception table for an entry corresponding to the * given instruction address, and return the address of the entry, @@ -101,25 +116,11 @@ void trim_init_extable(struct module *m) * already sorted. */ const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, +search_extable(const struct exception_table_entry *base, + const size_t num, unsigned long value) { - while (first <= last) { - const struct exception_table_entry *mid; - - mid = ((last - first) >> 1) + first; - /* - * careful, the distance between value and insn - * can be larger than MAX_LONG: - */ - if (ex_to_insn(mid) < value) - first = mid + 1; - else if (ex_to_insn(mid) > value) - last = mid - 1; - else - return mid; - } - return NULL; + return bsearch(&value, base, num, + sizeof(struct exception_table_entry), cmp_ex_search); } #endif -- cgit v1.3-14-g43fede From 4ea77014af0d6205b05503d1c7aac6eace11d473 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:52:57 -0700 Subject: kernel/signal.c: avoid undefined behaviour in kill_something_info When running kill(72057458746458112, 0) in userspace I hit the following issue. UBSAN: Undefined behaviour in kernel/signal.c:1462:11 negation of -2147483648 cannot be represented in type 'int': CPU: 226 PID: 9849 Comm: test Tainted: G B ---- ------- 3.10.0-327.53.58.70.x86_64_ubsan+ #116 Hardware name: Huawei Technologies Co., Ltd. RH8100 V3/BC61PBIA, BIOS BLHSV028 11/11/2014 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SYSC_kill+0x43e/0x4d0 SyS_kill+0xe/0x10 system_call_fastpath+0x16/0x1b Add code to avoid the UBSAN detection. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/1496670008-59084-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 48a59eefd8ad..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) return ret; } + /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ + if (pid == INT_MIN) + return -ESRCH; + read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, -- cgit v1.3-14-g43fede From dd83c161fbcc5d8be637ab159c0de015cbff5ba4 Mon Sep 17 00:00:00 2001 From: zhongjiang Date: Mon, 10 Jul 2017 15:53:01 -0700 Subject: kernel/exit.c: avoid undefined behaviour when calling wait4() wait4(-2147483648, 0x20, 0, 0xdd0000) triggers: UBSAN: Undefined behaviour in kernel/exit.c:1651:9 The related calltrace is as follows: negation of -2147483648 cannot be represented in type 'int': CPU: 9 PID: 16482 Comm: zj Tainted: G B ---- ------- 3.10.0-327.53.58.71.x86_64+ #66 Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 /BC11BTSA , BIOS CTSAV036 04/27/2011 Call Trace: dump_stack+0x19/0x1b ubsan_epilogue+0xd/0x50 __ubsan_handle_negate_overflow+0x109/0x14e SyS_wait4+0x1cb/0x1e0 system_call_fastpath+0x16/0x1b Exclude the overflow to avoid the UBSAN warning. Link: http://lkml.kernel.org/r/1497264618-20212-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhongjiang Cc: Oleg Nesterov Cc: David Rientjes Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 608c9775a37b..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options, __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; + /* -INT_MIN is not defined */ + if (upid == INT_MIN) + return -ESRCH; + if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { -- cgit v1.3-14-g43fede From 6a8a75f3235724c5941a33e287b2f98966ad14c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Jul 2017 10:56:54 +0200 Subject: Revert "perf/core: Drop kernel samples even though :u is specified" This reverts commit cc1582c231ea041fbc68861dfaf957eaf902b829. This commit introduced a regression that broke rr-project, which uses sampling events to receive a signal on overflow (but does not care about the contents of the sample). These signals are critical to the correct operation of rr. There's been some back and forth about how to fix it - but to not keep applications in limbo queue up a revert. Reported-by: Kyle Huey Acked-by: Kyle Huey Acked-by: Peter Zijlstra Cc: Jin Yao Cc: Vince Weaver Cc: Linus Torvalds Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Stephane Eranian Cc: Namhyung Kim Cc: Jiri Olsa Cc: Link: http://lkml.kernel.org/r/20170628105600.GC5981@leverpostej Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d2c32f98482..9747e422ab20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7308,21 +7308,6 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } -static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) -{ - /* - * Due to interrupt latency (AKA "skid"), we may enter the - * kernel before taking an overflow, even if the PMU is only - * counting user events. - * To avoid leaking information to userspace, we must always - * reject kernel samples when exclude_kernel is set. - */ - if (event->attr.exclude_kernel && !user_mode(regs)) - return false; - - return true; -} - /* * Generic event overflow handling, sampling. */ @@ -7343,12 +7328,6 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - /* - * For security, drop the skid kernel samples if necessary. - */ - if (!sample_is_allowed(event, regs)) - return ret; - /* * XXX event_limit might not quite work as expected on inherited * events -- cgit v1.3-14-g43fede From dea1d0f5f1284e3defee4b8484d9fc230686cd42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 22:06:24 +0200 Subject: smp/hotplug: Replace BUG_ON and react useful The move of the unpark functions to the control thread moved the BUG_ON() there as well. While it made some sense in the idle thread of the upcoming CPU, it's bogus to crash the control thread on the already online CPU, especially as the function has a return value and the callsite is prepared to handle an error return. Replace it with a WARN_ON_ONCE() and return a proper error code. Fixes: 9cd4f1a4e7a8 ("smp/hotplug: Move unparking of percpu threads to the control CPU") Rightfully-ranted-at-by: Linux Torvalds Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ab860453841d..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -279,7 +279,8 @@ static int bringup_wait_for_ap(unsigned int cpu) /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ wait_for_completion(&st->done); - BUG_ON(!cpu_online(cpu)); + if (WARN_ON_ONCE((!cpu_online(cpu)))) + return -ECANCELED; /* Unpark the stopper thread and the hotplug thread of the target cpu */ stop_machine_unpark(cpu); -- cgit v1.3-14-g43fede From b11fb73743fc406204e0749ead18560aeda8b136 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 15:43:24 -0400 Subject: tracing: Fixup trace file header alignment The addition of TGID to the tracing header added a check to see if TGID shoudl be displayed or not, and updated the header accordingly. Unfortunately, it broke the default header. Also add constant strings to use for spacing. This does remove the visibility of the header a bit, but cuts it down from the extended lines much greater than 80 characters. Before this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU#|||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.277830: migration_init <-do_one_initcall swapper/0-1 [002] d... 13.861967: Unknown type 1201 swapper/0-1 [002] d..1 13.861970: Unknown type 1202 After this change: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | swapper/0-1 [000] .... 0.278245: migration_init <-do_one_initcall swapper/0-1 [003] d... 13.861189: Unknown type 1201 swapper/0-1 [003] d..1 13.861192: Unknown type 1202 Cc: Joel Fernandes Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92af8fd1429b..dabd810a10cd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3358,14 +3358,23 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - - seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? " " : ""); - seq_printf(m, "# %s / _----=> need-resched\n", tgid ? " " : ""); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", tgid ? " " : ""); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", tgid ? " " : ""); - seq_printf(m, "# %s||| / delay\n", tgid ? " " : ""); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); - seq_printf(m, "# | | | %s|||| | |\n", tgid ? " | " : ""); + const char tgid_space[] = " "; + const char space[] = " "; + + seq_printf(m, "# %s _-----=> irqs-off\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s / _----=> need-resched\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s| / _---=> hardirq/softirq\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s|| / _--=> preempt-depth\n", + tgid ? tgid_space : space); + seq_printf(m, "# %s||| / delay\n", + tgid ? tgid_space : space); + seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + tgid ? " TGID " : space); + seq_printf(m, "# | | | %s|||| | |\n", + tgid ? " | " : space); } void -- cgit v1.3-14-g43fede From bbd1d27d863d5c0acee65ecd0c2e34035e1df5ea Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 11 Jul 2017 19:21:04 -0400 Subject: tracing: Do note expose stack_trace_filter without DYNAMIC_FTRACE The "stack_trace_filter" file only makes sense if DYNAMIC_FTRACE is configured in. If it is not, then the user can not filter any functions. Not only that, the open function causes warnings when DYNAMIC_FTRACE is not set. Link: http://lkml.kernel.org/r/20170710110521.600806-1-arnd@arndb.de Reported-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_stack.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { .release = seq_release, }; +#ifdef CONFIG_DYNAMIC_FTRACE + static int stack_trace_filter_open(struct inode *inode, struct file *file) { @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { .release = ftrace_regex_release, }; +#endif /* CONFIG_DYNAMIC_FTRACE */ + int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) trace_create_file("stack_trace", 0444, d_tracer, NULL, &stack_trace_fops); +#ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("stack_trace_filter", 0444, d_tracer, &trace_ops, &stack_trace_filter_fops); +#endif if (stack_trace_filter_buf[0]) ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); -- cgit v1.3-14-g43fede From 69449bbd65687e8e5fb968a5a0c46089f6af6001 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Jul 2017 10:44:03 +0200 Subject: ftrace: Hide cached module code for !CONFIG_MODULES When modules are disabled, we get a harmless build warning: kernel/trace/ftrace.c:4051:13: error: 'process_cached_mods' defined but not used [-Werror=unused-function] This adds the same #ifdef around the new code that exists around its caller. Link: http://lkml.kernel.org/r/20170710084413.1820568-1-arnd@arndb.de Fixes: d7fbf8df7ca0 ("ftrace: Implement cached modules tracing on module load") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2953d558bbee..4706f0ed193e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3978,6 +3978,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable); +#ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) { @@ -4068,6 +4069,7 @@ static void process_cached_mods(const char *mod_name) kfree(mod); } +#endif /* * We register the module command as a template to show others how -- cgit v1.3-14-g43fede From 19d39a3810e7032f311ef83effdac40339b9d022 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Jul 2017 23:41:52 +0200 Subject: genirq: Keep chip buslock across irq_request/release_resources() Moving the irq_request/release_resources() callbacks out of the spinlocked, irq disabled and bus locked region, unearthed an interesting abuse of the irq_bus_lock/irq_bus_sync_unlock() callbacks. The OMAP GPIO driver does merily power management inside of them. The irq_request_resources() callback of this GPIO irqchip calls a function which reads a GPIO register. That read aborts now because the clock of the GPIO block is not magically enabled via the irq_bus_lock() callback. Move the callbacks under the bus lock again to prevent this. In the free_irq() path this requires to drop the bus_lock before calling synchronize_irq() and reaquiring it before calling the irq_release_resources() callback. The bus lock can't be held because: 1) The data which has been changed between bus_lock/un_lock is cached in the irq chip driver private data and needs to go out to the irq chip via the slow bus (usually SPI or I2C) before calling synchronize_irq(). That's the reason why this bus_lock/unlock magic exists in the first place, as you cannot do SPI/I2C transactions while holding desc->lock with interrupts disabled. 2) synchronize_irq() will actually deadlock, if there is a handler on flight. These chips use threaded handlers for obvious reasons, as they allow to do SPI/I2C communication. When the threaded handler returns then bus_lock needs to be taken in irq_finalize_oneshot() as we need to talk to the actual irq chip once more. After that the threaded handler is marked done, which makes synchronize_irq() return. So if we hold bus_lock accross the synchronize_irq() call, the handler cannot mark itself done because it blocks on the bus lock. That in turn makes synchronize_irq() wait forever on the threaded handler to complete.... Add the missing unlock of desc->request_mutex in the error path of __free_irq() and add a bunch of comments to explain the locking and protection rules. Fixes: 46e48e257360 ("genirq: Move irq resource handling out of spinlocked region") Reported-and-tested-by: Sebastian Reichel Reported-and-tested-by: Tony Lindgren Reported-by: Pavel Machek Signed-off-by: Thomas Gleixner Not-longer-ranted-at-by: Linus Torvalds Cc: Linus Walleij Cc: Grygorii Strashko Cc: Marc Zyngier --- kernel/irq/manage.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5624b2dd6b58..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. + * + * Locking rules: + * + * desc->request_mutex Provides serialization against a concurrent free_irq() + * chip_bus_lock Provides serialization for slow bus operations + * desc->lock Provides serialization against hard interrupts + * + * chip_bus_lock and desc->lock are sufficient for all other management and + * interrupt related functions. desc->request_mutex solely serializes + * request/free_irq(). */ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) @@ -1167,20 +1177,35 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) new->flags &= ~IRQF_ONESHOT; + /* + * Protects against a concurrent __free_irq() call which might wait + * for synchronize_irq() to complete without holding the optional + * chip bus lock and desc->lock. + */ mutex_lock(&desc->request_mutex); + + /* + * Acquire bus lock as the irq_request_resources() callback below + * might rely on the serialization or the magic power management + * functions which are abusing the irq_bus_lock() callback, + */ + chip_bus_lock(desc); + + /* First installed action requests resources. */ if (!desc->action) { ret = irq_request_resources(desc); if (ret) { pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", new->name, irq, desc->irq_data.chip->name); - goto out_mutex; + goto out_bus_unlock; } } - chip_bus_lock(desc); - /* * The following block of code has to be executed atomically + * protected against a concurrent interrupt and any of the other + * management calls which are not serialized via + * desc->request_mutex or the optional bus lock. */ raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; @@ -1286,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - irq_release_resources(desc); + if (ret) goto out_unlock; - } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ @@ -1385,12 +1408,10 @@ mismatch: out_unlock: raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); - if (!desc->action) irq_release_resources(desc); - -out_mutex: +out_bus_unlock: + chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); out_thread: @@ -1472,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); + mutex_unlock(&desc->request_mutex); return NULL; } @@ -1498,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + /* + * Drop bus_lock here so the changes which were done in the chip + * callbacks above are synced out to the irq chips which hang + * behind a slow bus (I2C, SPI) before calling synchronize_irq(). + * + * Aside of that the bus_lock can also be taken from the threaded + * handler in irq_finalize_oneshot() which results in a deadlock + * because synchronize_irq() would wait forever for the thread to + * complete, which is blocked on the bus lock. + * + * The still held desc->request_mutex() protects against a + * concurrent request_irq() of this irq so the release of resources + * and timing data is properly serialized. + */ chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1530,8 +1566,15 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) } } + /* Last action releases resources */ if (!desc->action) { + /* + * Reaquire bus lock as irq_release_resources() might + * require it to deallocate resources over the slow bus. + */ + chip_bus_lock(desc); irq_release_resources(desc); + chip_bus_sync_unlock(desc); irq_remove_timings(desc); } -- cgit v1.3-14-g43fede From ab2f7cf141aa6734c4ca7525132d8cc236efee77 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 6 Jul 2017 10:53:20 -0700 Subject: cpufreq: schedutil: Fix sugov_start() versus sugov_update_shared() race With a shared policy in place, when one of the CPUs in the policy is hotplugged out and then brought back online, sugov_stop() and sugov_start() are called in order. sugov_stop() removes utilization hooks for each CPU in the policy and does nothing else in the for_each_cpu() loop. sugov_start() on the other hand iterates through the CPUs in the policy and re-initializes the per-cpu structure _and_ adds the utilization hook. This implies that the scheduler is allowed to invoke a CPU's utilization update hook when the rest of the per-cpu structures have yet to be re-inited. Apart from some strange values in tracepoints this doesn't cause a problem, but if we do end up accessing a pointer from the per-cpu sugov_cpu structure somewhere in the sugov_update_shared() path, we will likely see crashes since the memset for another CPU in the policy is free to race with sugov_update_shared from the CPU that is ready to go. So let's fix this now to first init all per-cpu structures, and then add the per-cpu utilization update hooks all at once. Signed-off-by: Vikram Mulukutla Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? sugov_update_shared : -- cgit v1.3-14-g43fede From 44925dfff05fd1a897992d278b15a6b6b55e79a7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:33:40 +0300 Subject: ftrace: Remove an unneeded NULL check "func" can't be NULL and it doesn't make sense to check because we've already derefenced it. Link: http://lkml.kernel.org/r/20170712073340.4enzeojeoupuds5a@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4706f0ed193e..5fb5b40b3ae8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3950,7 +3950,7 @@ static int cache_mod(struct trace_array *tr, continue; /* no func matches all */ - if (!func || strcmp(func, "*") == 0 || + if (strcmp(func, "*") == 0 || (ftrace_mod->func && strcmp(ftrace_mod->func, func) == 0)) { ret = 0; -- cgit v1.3-14-g43fede From 2e028c4fe12907f226b8221815f16c2486ad3aa7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jul 2017 10:35:57 +0300 Subject: ftrace: Fix uninitialized variable in match_records() My static checker complains that if "func" is NULL then "clear_filter" is uninitialized. This seems like it could be true, although it's possible something subtle is happening that I haven't seen. kernel/trace/ftrace.c:3844 match_records() error: uninitialized symbol 'clear_filter'. Link: http://lkml.kernel.org/r/20170712073556.h6tkpjcdzjaozozs@mwanda Cc: stable@vger.kernel.org Fixes: f0a3b154bd7 ("ftrace: Clarify code for mod command") Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5fb5b40b3ae8..53f6b6401cf0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3816,7 +3816,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, -- cgit v1.3-14-g43fede From 58c7ffc0747a3a9145629d4966291f0586703767 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Jul 2017 04:59:45 +0100 Subject: fix a braino in compat_sys_getrlimit() Reported-and-tested-by: Meelis Roos Fixes: commit d9e968cb9f84 "getrlimit()/setrlimit(): move compat to native" Signed-off-by: Al Viro Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 73fc0af147d0..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1362,7 +1362,7 @@ COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, ret = do_prlimit(current, resource, NULL, &r); if (!ret) { - struct rlimit r32; + struct compat_rlimit r32; if (r.rlim_cur > COMPAT_RLIM_INFINITY) r32.rlim_cur = COMPAT_RLIM_INFINITY; else -- cgit v1.3-14-g43fede From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 12 Jul 2017 14:33:11 -0700 Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts The reason to disable interrupts seems to be to avoid switching to a different processor while handling per cpu data using individual loads and stores. If we use per cpu RMV primitives we will not have to disable interrupts. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org Signed-off-by: Christoph Lameter Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 0f69a3e5281e..d2b9d7c31eaf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) void *stack; int i; - local_irq_disable(); for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s = this_cpu_read(cached_stacks[i]); + struct vm_struct *s; + + s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; - this_cpu_write(cached_stacks[i], NULL); tsk->stack_vm_area = s; - local_irq_enable(); return s->addr; } - local_irq_enable(); stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) { #ifdef CONFIG_VMAP_STACK if (task_stack_vm_area(tsk)) { - unsigned long flags; int i; - local_irq_save(flags); for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_read(cached_stacks[i])) + if (this_cpu_cmpxchg(cached_stacks[i], + NULL, tsk->stack_vm_area) != NULL) continue; - this_cpu_write(cached_stacks[i], tsk->stack_vm_area); - local_irq_restore(flags); return; } - local_irq_restore(flags); vfree_atomic(tsk->stack); return; -- cgit v1.3-14-g43fede From 203e9e41219b4e7357104e525e91ac609fba2c6c Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:14 -0700 Subject: kexec: move vmcoreinfo out of the kernel's .bss section As Eric said, "what we need to do is move the variable vmcoreinfo_note out of the kernel's .bss section. And modify the code to regenerate and keep this information in something like the control page. Definitely something like this needs a page all to itself, and ideally far away from any other kernel data structures. I clearly was not watching closely the data someone decided to keep this silly thing in the kernel's .bss section." This patch allocates extra pages for these vmcoreinfo_XXX variables, one advantage is that it enhances some safety of vmcoreinfo, because vmcoreinfo now is kept far away from other kernel data structures. Link: http://lkml.kernel.org/r/1493281021-20737-1-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Reviewed-by: Juergen Gross Suggested-by: Eric Biederman Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Hari Bathini Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/machine_kexec.c | 5 ----- arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/setup.c | 6 ------ arch/x86/kernel/crash.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- include/linux/crash_core.h | 4 ++-- kernel/crash_core.c | 26 ++++++++++++++++++++++---- kernel/ksysfs.c | 2 +- 8 files changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c index 599507bcec91..c14815dca747 100644 --- a/arch/ia64/kernel/machine_kexec.c +++ b/arch/ia64/kernel/machine_kexec.c @@ -163,8 +163,3 @@ void arch_crash_save_vmcoreinfo(void) #endif } -phys_addr_t paddr_vmcoreinfo_note(void) -{ - return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); -} - diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 49a6bd45957b..3d0b14afa232 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -246,6 +246,7 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(lowcore_ptr); VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); + mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3ae756c0db3d..3d1d808ea8a9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -496,11 +496,6 @@ static void __init setup_memory_end(void) pr_notice("The maximum memory size is %luMB\n", memory_end >> 20); } -static void __init setup_vmcoreinfo(void) -{ - mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); -} - #ifdef CONFIG_CRASH_DUMP /* @@ -939,7 +934,6 @@ void __init setup_arch(char **cmdline_p) #endif setup_resources(); - setup_vmcoreinfo(); setup_lowcore(); smp_fill_possible_mask(); cpu_detect_mhz_feature(); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 22217ece26c8..44404e2307bb 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -457,7 +457,7 @@ static int prepare_elf64_headers(struct crash_elf_data *ced, bufp += sizeof(Elf64_Phdr); phdr->p_type = PT_NOTE; phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); - phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + phdr->p_filesz = phdr->p_memsz = VMCOREINFO_NOTE_SIZE; (ehdr->e_phnum)++; #ifdef CONFIG_X86_64 diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 1d7a7213a310..cab28cf2cffb 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2693,8 +2693,8 @@ EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) - return virt_to_machine(&vmcoreinfo_note).maddr; + return virt_to_machine(vmcoreinfo_note).maddr; else - return __pa_symbol(&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } #endif /* CONFIG_KEXEC_CORE */ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 4090a42578a8..87506a02e914 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -19,7 +19,7 @@ CRASH_CORE_NOTE_NAME_BYTES + \ CRASH_CORE_NOTE_DESC_BYTES) -#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_BYTES PAGE_SIZE #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) #define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ @@ -56,7 +56,7 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..2837d6164db8 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,10 +14,10 @@ #include /* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); +size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; +u32 *vmcoreinfo_note; /* * parsing the "crashkernel" commandline @@ -326,6 +326,9 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { + if (!vmcoreinfo_note) + return; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } @@ -356,11 +359,26 @@ void __weak arch_crash_save_vmcoreinfo(void) phys_addr_t __weak paddr_vmcoreinfo_note(void) { - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); + return __pa(vmcoreinfo_note); } static int __init crash_save_vmcoreinfo_init(void) { + vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + if (!vmcoreinfo_data) { + pr_warn("Memory allocation for vmcoreinfo_data failed\n"); + return -ENOMEM; + } + + vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, + GFP_KERNEL | __GFP_ZERO); + if (!vmcoreinfo_note) { + free_page((unsigned long)vmcoreinfo_data); + vmcoreinfo_data = NULL; + pr_warn("Memory allocation for vmcoreinfo_note failed\n"); + return -ENOMEM; + } + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_PAGESIZE(PAGE_SIZE); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index df1a9aa602a0..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); return sprintf(buf, "%pa %x\n", &vmcore_base, - (unsigned int)sizeof(vmcoreinfo_note)); + (unsigned int)VMCOREINFO_NOTE_SIZE); } KERNEL_ATTR_RO(vmcoreinfo); -- cgit v1.3-14-g43fede From 5203f4995d9a87952a83c2ce7866adbbe8f97bb5 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:17 -0700 Subject: powerpc/fadump: use the correct VMCOREINFO_NOTE_SIZE for phdr vmcoreinfo_max_size stands for the vmcoreinfo_data, the correct one we should use is vmcoreinfo_note whose total size is VMCOREINFO_NOTE_SIZE. Like explained in commit 77019967f06b ("kdump: fix exported size of vmcoreinfo note"), it should not affect the actual function, but we better fix it, also this change should be safe and backward compatible. After this, we can get rid of variable vmcoreinfo_max_size, let's use the corresponding macros directly, fewer variables means more safety for vmcoreinfo operation. [xlpang@redhat.com: fix build warning] Link: http://lkml.kernel.org/r/1494830606-27736-1-git-send-email-xlpang@redhat.com Link: http://lkml.kernel.org/r/1493281021-20737-2-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Reviewed-by: Mahesh Salgaonkar Reviewed-by: Dave Young Cc: Hari Bathini Cc: Benjamin Herrenschmidt Cc: Eric Biederman Cc: Juergen Gross Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/fadump.c | 3 +-- include/linux/crash_core.h | 1 - kernel/crash_core.c | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3079518f2245..dc0c49cfd90a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -999,8 +999,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 87506a02e914..e5df1b3cf072 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -58,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); extern u32 *vmcoreinfo_note; extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2837d6164db8..315adbf9cb68 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -16,7 +16,6 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = VMCOREINFO_BYTES; u32 *vmcoreinfo_note; /* @@ -343,7 +342,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); -- cgit v1.3-14-g43fede From 1229384f5b856d83698c38f9dedfd836e26711cb Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 12 Jul 2017 14:33:21 -0700 Subject: kdump: protect vmcoreinfo data under the crash memory Currently vmcoreinfo data is updated at boot time subsys_initcall(), it has the risk of being modified by some wrong code during system is running. As a result, vmcore dumped may contain the wrong vmcoreinfo. Later on, when using "crash", "makedumpfile", etc utility to parse this vmcore, we probably will get "Segmentation fault" or other unexpected errors. E.g. 1) wrong code overwrites vmcoreinfo_data; 2) further crashes the system; 3) trigger kdump, then we obviously will fail to recognize the crash context correctly due to the corrupted vmcoreinfo. Now except for vmcoreinfo, all the crash data is well protected(including the cpu note which is fully updated in the crash path, thus its correctness is guaranteed). Given that vmcoreinfo data is a large chunk prepared for kdump, we better protect it as well. To solve this, we relocate and copy vmcoreinfo_data to the crash memory when kdump is loading via kexec syscalls. Because the whole crash memory will be protected by existing arch_kexec_protect_crashkres() mechanism, we naturally protect vmcoreinfo_data from write(even read) access under kernel direct mapping after kdump is loaded. Since kdump is usually loaded at the very early stage after boot, we can trust the correctness of the vmcoreinfo data copied. On the other hand, we still need to operate the vmcoreinfo safe copy when crash happens to generate vmcoreinfo_note again, we rely on vmap() to map out a new kernel virtual address and update to use this new one instead in the following crash_save_vmcoreinfo(). BTW, we do not touch vmcoreinfo_note, because it will be fully updated using the protected vmcoreinfo_data after crash which is surely correct just like the cpu crash note. Link: http://lkml.kernel.org/r/1493281021-20737-3-git-send-email-xlpang@redhat.com Signed-off-by: Xunlei Pang Tested-by: Michael Holzheu Cc: Benjamin Herrenschmidt Cc: Dave Young Cc: Eric Biederman Cc: Hari Bathini Cc: Juergen Gross Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crash_core.h | 2 +- include/linux/kexec.h | 2 ++ kernel/crash_core.c | 17 ++++++++++++++++- kernel/kexec.c | 8 ++++++++ kernel/kexec_core.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/kexec_file.c | 8 ++++++++ 6 files changed, 74 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index e5df1b3cf072..2df2118fbe13 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -28,6 +28,7 @@ typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; +void crash_update_vmcoreinfo_safecopy(void *ptr); void crash_save_vmcoreinfo(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) @@ -57,7 +58,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern u32 *vmcoreinfo_note; -extern size_t vmcoreinfo_size; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 65888418fb69..dd056fab9e35 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -172,6 +172,7 @@ struct kimage { unsigned long start; struct page *control_code_page; struct page *swap_page; + void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; @@ -241,6 +242,7 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); +extern int kimage_crash_copy_vmcoreinfo(struct kimage *image); extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 315adbf9cb68..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -15,9 +15,12 @@ /* vmcoreinfo stuff */ static unsigned char *vmcoreinfo_data; -size_t vmcoreinfo_size; +static size_t vmcoreinfo_size; u32 *vmcoreinfo_note; +/* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ +static unsigned char *vmcoreinfo_data_safecopy; + /* * parsing the "crashkernel" commandline * @@ -323,11 +326,23 @@ static void update_vmcoreinfo_note(void) final_note(buf); } +void crash_update_vmcoreinfo_safecopy(void *ptr) +{ + if (ptr) + memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); + + vmcoreinfo_data_safecopy = ptr; +} + void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; + /* Use the safe copy to generate vmcoreinfo note if have */ + if (vmcoreinfo_data_safecopy) + vmcoreinfo_data = vmcoreinfo_data_safecopy; + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + for (i = 0; i < nr_segments; i++) { ret = kimage_load_segment(image, &image->segment[i]); if (ret) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, return pages; } +int kimage_crash_copy_vmcoreinfo(struct kimage *image) +{ + struct page *vmcoreinfo_page; + void *safecopy; + + if (image->type != KEXEC_TYPE_CRASH) + return 0; + + /* + * For kdump, allocate one vmcoreinfo safe copy from the + * crash memory. as we have arch_kexec_protect_crashkres() + * after kexec syscall, we naturally protect it from write + * (even read) access under kernel direct mapping. But on + * the other hand, we still need to operate it when crash + * happens to generate vmcoreinfo note, hereby we rely on + * vmap for this purpose. + */ + vmcoreinfo_page = kimage_alloc_control_pages(image, 0); + if (!vmcoreinfo_page) { + pr_warn("Could not allocate vmcoreinfo buffer\n"); + return -ENOMEM; + } + safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + if (!safecopy) { + pr_warn("Could not vmap vmcoreinfo buffer\n"); + return -ENOMEM; + } + + image->vmcoreinfo_data_copy = safecopy; + crash_update_vmcoreinfo_safecopy(safecopy); + + return 0; +} + static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) if (!image) return; + if (image->vmcoreinfo_data_copy) { + crash_update_vmcoreinfo_safecopy(NULL); + vunmap(image->vmcoreinfo_data_copy); + } + kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 766e7e4d3ad9..c8f7f77e9fa9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -298,6 +298,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + /* + * Some architecture(like S390) may touch the crash memory before + * machine_kexec_prepare(), we must copy vmcoreinfo data after it. + */ + ret = kimage_crash_copy_vmcoreinfo(image); + if (ret) + goto out; + ret = kexec_calculate_store_digests(image); if (ret) goto out; -- cgit v1.3-14-g43fede From a19ac3374995382a994653ff372b98ea7cbad548 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:30 -0700 Subject: sysctl: kdoc'ify sysctl_writes_strict Document the different sysctl_writes_strict modes in code. Link: http://lkml.kernel.org/r/20170519033554.18592-3-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..02725178694a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,11 +174,32 @@ extern int no_unaligned_warning; #ifdef CONFIG_PROC_SYSCTL -#define SYSCTL_WRITES_LEGACY -1 -#define SYSCTL_WRITES_WARN 0 -#define SYSCTL_WRITES_STRICT 1 +/** + * enum sysctl_writes_mode - supported sysctl write modes + * + * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. + * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is + * not 0. + * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max lenght will be ignored. Multiple writes will append + * to the buffer. + * + * These write modes control how current file position affects the behavior of + * updating sysctl values through the proc interface on each write. + */ +enum sysctl_writes_mode { + SYSCTL_WRITES_LEGACY = -1, + SYSCTL_WRITES_WARN = 0, + SYSCTL_WRITES_STRICT = 1, +}; -static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; +static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.3-14-g43fede From d383d48470819e86fe30eb72f0e9494e1ee0e2af Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:33 -0700 Subject: sysctl: fold sysctl_writes_strict checks into helper The mode sysctl_writes_strict positional checks keep being copy and pasted as we add new proc handlers. Just add a helper to avoid code duplication. Link: http://lkml.kernel.org/r/20170519033554.18592-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Kees Cook Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 02725178694a..6f3bb1f099fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1970,6 +1970,32 @@ static void warn_sysctl_write(struct ctl_table *table) current->comm, table->procname); } +/** + * proc_first_pos_non_zero_ignore - check if firs position is allowed + * @ppos: file position + * @table: the sysctl table + * + * Returns true if the first position is non-zero and the sysctl_writes_strict + * mode indicates this is not allowed for numeric input types. String proc + * hadlers can ignore the return value. + */ +static bool proc_first_pos_non_zero_ignore(loff_t *ppos, + struct ctl_table *table) +{ + if (!*ppos) + return false; + + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + return true; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + return false; + default: + return false; + } +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1990,8 +2016,8 @@ static void warn_sysctl_write(struct ctl_table *table) int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) - warn_sysctl_write(table); + if (write) + proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); @@ -2193,17 +2219,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; @@ -2468,17 +2485,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { - if (*ppos) { - switch (sysctl_writes_strict) { - case SYSCTL_WRITES_STRICT: - goto out; - case SYSCTL_WRITES_WARN: - warn_sysctl_write(table); - break; - default: - break; - } - } + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; -- cgit v1.3-14-g43fede From 4f2fec00afa60aa8e5d1b7f2a8e0526900f55623 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:36 -0700 Subject: sysctl: simplify unsigned int support Commit e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") added proc_douintvec() to start help adding support for unsigned int, this however was only half the work needed. Two fixes have come in since then for the following issues: o Printing the values shows a negative value, this happens since do_proc_dointvec() and this uses proc_put_long() This was fixed by commit 5380e5644afbba9 ("sysctl: don't print negative flag for proc_douintvec"). o We can easily wrap around the int values: UINT_MAX is 4294967295, if we echo in 4294967295 + 1 we end up with 0, using 4294967295 + 2 we end up with 1. o We echo negative values in and they are accepted This was fixed by commit 425fffd886ba ("sysctl: report EINVAL if value is larger than UINT_MAX for proc_douintvec"). It still also failed to be added to sysctl_check_table()... instead of adding it with the current implementation just provide a proper and simplified unsigned int support without any array unsigned int support with no negative support at all. Historically sysctl proc helpers have supported arrays, due to the complexity this adds though we've taken a step back to evaluate array users to determine if its worth upkeeping for unsigned int. An evaluation using Coccinelle has been done to perform a grammatical search to ask ourselves: o How many sysctl proc_dointvec() (int) users exist which likely should be moved over to proc_douintvec() (unsigned int) ? Answer: about 8 - Of these how many are array users ? Answer: Probably only 1 o How many sysctl array users exist ? Answer: about 12 This last question gives us an idea just how popular arrays: they are not. Array support should probably just be kept for strings. The identified uint ports are: drivers/infiniband/core/ucma.c - max_backlog drivers/infiniband/core/iwcm.c - default_backlog net/core/sysctl_net_core.c - rps_sock_flow_sysctl() net/netfilter/nf_conntrack_timestamp.c - nf_conntrack_timestamp -- bool net/netfilter/nf_conntrack_acct.c nf_conntrack_acct -- bool net/netfilter/nf_conntrack_ecache.c - nf_conntrack_events -- bool net/netfilter/nf_conntrack_helper.c - nf_conntrack_helper -- bool net/phonet/sysctl.c proc_local_port_range() The only possible array users is proc_local_port_range() but it does not seem worth it to add array support just for this given the range support works just as well. Unsigned int support should be desirable more for when you *need* more than INT_MAX or using int min/max support then does not suffice for your ranges. If you forget and by mistake happen to register an unsigned int proc entry with an array, the driver will fail and you will get something as follows: sysctl table check failed: debug/test_sysctl//uint_0002 array now allowed CPU: 2 PID: 1342 Comm: modprobe Tainted: G W E Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Call Trace: dump_stack+0x63/0x81 __register_sysctl_table+0x350/0x650 ? kmem_cache_alloc_trace+0x107/0x240 __register_sysctl_paths+0x1b3/0x1e0 ? 0xffffffffc005f000 register_sysctl_table+0x1f/0x30 test_sysctl_init+0x10/0x1000 [test_sysctl] do_one_initcall+0x52/0x1a0 ? kmem_cache_alloc_trace+0x107/0x240 do_init_module+0x5f/0x200 load_module+0x1867/0x1bd0 ? __symbol_put+0x60/0x60 SYSC_finit_module+0xdf/0x110 SyS_finit_module+0xe/0x10 entry_SYSCALL_64_fastpath+0x1e/0xad RIP: 0033:0x7f042b22d119 Fixes: e7d316a02f68 ("sysctl: handle error writing UINT_MAX to u32 fields") Link: http://lkml.kernel.org/r/20170519033554.18592-5-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Suggested-by: Alexey Dobriyan Cc: Subash Abhinov Kasiviswanathan Cc: Liping Zhang Cc: Alexey Dobriyan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 14 +++++ kernel/sysctl.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 32c9c5630507..ee6feba8b6c0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1061,6 +1061,18 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) return -EINVAL; } +static int sysctl_check_table_array(const char *path, struct ctl_table *table) +{ + int err = 0; + + if (table->proc_handler == proc_douintvec) { + if (table->maxlen != sizeof(unsigned int)) + err |= sysctl_err(path, table, "array now allowed"); + } + + return err; +} + static int sysctl_check_table(const char *path, struct ctl_table *table) { int err = 0; @@ -1081,6 +1093,8 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "No data"); if (!table->maxlen) err |= sysctl_err(path, table, "No maxlen"); + else + err |= sysctl_check_table_array(path, table); } if (!table->proc_handler) err |= sysctl_err(path, table, "No proc_handler"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f3bb1f099fa..d12078fc215f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2175,19 +2175,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } -static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, - int *valp, - int write, void *data) +static int do_proc_douintvec_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) { if (write) { - if (*negp) + if (*lvalp > UINT_MAX) return -EINVAL; if (*lvalp > UINT_MAX) return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; - *negp = false; *lvalp = (unsigned long)val; } return 0; @@ -2287,6 +2286,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } +static int do_proc_douintvec_w(unsigned int *tbl_data, + struct ctl_table *table, + void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + bool neg; + char *kbuf = NULL, *p; + + left = *lenp; + + if (proc_first_pos_non_zero_ignore(ppos, table)) + goto bail_early; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return -EINVAL; + + left -= proc_skip_spaces(&p); + if (!left) { + err = -EINVAL; + goto out_free; + } + + err = proc_get_long(&p, &left, &lval, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err || neg) { + err = -EINVAL; + goto out_free; + } + + if (conv(&lval, tbl_data, 1, data)) { + err = -EINVAL; + goto out_free; + } + + if (!err && left) + left -= proc_skip_spaces(&p); + +out_free: + kfree(kbuf); + if (err) + return -EINVAL; + + return 0; + + /* This is in keeping with old __do_proc_dointvec() */ +bail_early: + *ppos += *lenp; + return err; +} + +static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned long lval; + int err = 0; + size_t left; + + left = *lenp; + + if (conv(&lval, tbl_data, 0, data)) { + err = -EINVAL; + goto out; + } + + err = proc_put_long(&buffer, &left, lval, false); + if (err || !left) + goto out; + + err = proc_put_char(&buffer, &left, '\n'); + +out: + *lenp -= left; + *ppos += *lenp; + + return err; +} + +static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + unsigned int *i, vleft; + + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned int *) tbl_data; + vleft = table->maxlen / sizeof(*i); + + /* + * Arrays are not supported, keep this simple. *Do not* add + * support for them. + */ + if (vleft != 1) { + *lenp = 0; + return -EINVAL; + } + + if (!conv) + conv = do_proc_douintvec_conv; + + if (write) + return do_proc_douintvec_w(i, table, buffer, lenp, ppos, + conv, data); + return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); +} + +static int do_proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) +{ + return __do_proc_douintvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -2322,8 +2461,8 @@ int proc_dointvec(struct ctl_table *table, int write, int proc_douintvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, buffer, lenp, ppos, - do_proc_douintvec_conv, NULL); + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* -- cgit v1.3-14-g43fede From 61d9b56a89208d8cccd0b4cfec7e6959717e16e3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 12 Jul 2017 14:33:40 -0700 Subject: sysctl: add unsigned int range support To keep parity with regular int interfaces provide the an unsigned int proc_douintvec_minmax() which allows you to specify a range of allowed valid numbers. Adding proc_douintvec_minmax_sysadmin() is easy but we can wait for an actual user for that. Link: http://lkml.kernel.org/r/20170519033554.18592-6-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Acked-by: Kees Cook Cc: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. Miller" Cc: Ingo Molnar Cc: Al Viro Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 4 ++- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ee6feba8b6c0..8f9d564d0969 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1065,7 +1065,8 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) { int err = 0; - if (table->proc_handler == proc_douintvec) { + if ((table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax)) { if (table->maxlen != sizeof(unsigned int)) err |= sysctl_err(path, table, "array now allowed"); } @@ -1083,6 +1084,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_douintvec) || + (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80d07816def0..225001d437ae 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -47,6 +47,9 @@ extern int proc_douintvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d12078fc215f..df9f2a367882 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2567,6 +2567,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +struct do_proc_douintvec_minmax_conv_param { + unsigned int *min; + unsigned int *max; +}; + +static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_douintvec_minmax_conv_param *param = data; + + if (write) { + unsigned int val = *lvalp; + + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +/** + * proc_douintvec_minmax - read a vector of unsigned ints with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). There is a final sanity + * check for UINT_MAX to avoid having to support wrap around uses from + * userspace. + * + * Returns 0 on success. + */ +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_douintvec_minmax_conv_param param = { + .min = (unsigned int *) table->extra1, + .max = (unsigned int *) table->extra2, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3066,6 +3125,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3108,6 +3173,7 @@ EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); +EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.3-14-g43fede From 9380fa60b10ebd6ee7c3fcdb2cf162f4d7cf9fc5 Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 12 Jul 2017 14:34:01 -0700 Subject: kernel/sysctl_binary.c: check name array length in deprecated_sysctl_warning() Prevent use of uninitialized memory (originating from the stack frame of do_sysctl()) by verifying that the name array is filled with sufficient input data before comparing its specific entries with integer constants. Through timing measurement or analyzing the kernel debug logs, a user-mode program could potentially infer the results of comparisons against the uninitialized memory, and acquire some (very limited) information about the state of the kernel stack. The change also eliminates possible future warnings by tools such as KMSAN and other code checkers / instrumentations. Link: http://lkml.kernel.org/r/20170524122139.21333-1-mjurczyk@google.com Signed-off-by: Mateusz Jurczyk Acked-by: Kees Cook Cc: "David S. Miller" Cc: Matthew Whitehead Cc: "Eric W. Biederman" Cc: Tetsuo Handa Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) * CTL_KERN/KERN_VERSION is used by older glibc and cannot * ever go away. */ - if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) return; if (printk_ratelimit()) { -- cgit v1.3-14-g43fede From 0791e3644e5ef21646fe565b9061788d05ec71d4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 12 Jul 2017 14:34:28 -0700 Subject: kcmp: add KCMP_EPOLL_TFD mode to compare epoll target files With current epoll architecture target files are addressed with file_struct and file descriptor number, where the last is not unique. Moreover files can be transferred from another process via unix socket, added into queue and closed then so we won't find this descriptor in the task fdinfo list. Thus to checkpoint and restore such processes CRIU needs to find out where exactly the target file is present to add it into epoll queue. For this sake one can use kcmp call where some particular target file from the queue is compared with arbitrary file passed as an argument. Because epoll target files can have same file descriptor number but different file_struct a caller should explicitly specify the offset within. To test if some particular file is matching entry inside epoll one have to - fill kcmp_epoll_slot structure with epoll file descriptor, target file number and target file offset (in case if only one target is present then it should be 0) - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot) - the kernel fetch file pointer matching file descriptor @fd of pid1 - lookups for file struct in epoll queue of pid2 and returns traditional 0,1,2 result for sorting purpose Link: http://lkml.kernel.org/r/20170424154423.511592110@gmail.com Signed-off-by: Cyrill Gorcunov Acked-by: Andrey Vagin Cc: Al Viro Cc: Pavel Emelyanov Cc: Michael Kerrisk Cc: Jason Baron Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 42 ++++++++++++++++++++++++++++++++++ include/linux/eventpoll.h | 3 +++ include/uapi/linux/kcmp.h | 10 +++++++++ kernel/kcmp.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 112 insertions(+) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 322904c3ebdf..e7e9901c3790 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1077,6 +1077,48 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) +{ + struct rb_node *rbp; + struct epitem *epi; + + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (epi->ffd.fd == tfd) { + if (toff == 0) + return epi; + else + toff--; + } + cond_resched(); + } + + return NULL; +} + +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, + unsigned long toff) +{ + struct file *file_raw; + struct eventpoll *ep; + struct epitem *epi; + + if (!is_file_epoll(file)) + return ERR_PTR(-EINVAL); + + ep = file->private_data; + + mutex_lock(&ep->mtx); + epi = ep_find_tfd(ep, tfd, toff); + if (epi) + file_raw = epi->ffd.file; + else + file_raw = ERR_PTR(-ENOENT); + mutex_unlock(&ep->mtx); + + return file_raw; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 6daf6d4971f6..d8625d214ea7 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,7 @@ #define _LINUX_EVENTPOLL_H #include +#include /* Forward declarations to avoid compiler errors */ @@ -22,6 +23,8 @@ struct file; #ifdef CONFIG_EPOLL +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); + /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h index 84df14b37360..481e103da78e 100644 --- a/include/uapi/linux/kcmp.h +++ b/include/uapi/linux/kcmp.h @@ -1,6 +1,8 @@ #ifndef _UAPI_LINUX_KCMP_H #define _UAPI_LINUX_KCMP_H +#include + /* Comparison type */ enum kcmp_type { KCMP_FILE, @@ -10,8 +12,16 @@ enum kcmp_type { KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, + KCMP_EPOLL_TFD, KCMP_TYPES, }; +/* Slot for KCMP_EPOLL_TFD */ +struct kcmp_epoll_slot { + __u32 efd; /* epoll file descriptor */ + __u32 tfd; /* target file number */ + __u32 toff; /* target offset within same numbered sequence */ +}; + #endif /* _UAPI_LINUX_KCMP_H */ diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -11,6 +11,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) return err; } +#ifdef CONFIG_EPOLL +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + struct file *filp, *filp_epoll, *filp_tgt; + struct kcmp_epoll_slot slot; + struct files_struct *files; + + if (copy_from_user(&slot, uslot, sizeof(slot))) + return -EFAULT; + + filp = get_file_raw_ptr(task1, idx1); + if (!filp) + return -EBADF; + + files = get_files_struct(task2); + if (!files) + return -EBADF; + + spin_lock(&files->file_lock); + filp_epoll = fcheck_files(files, slot.efd); + if (filp_epoll) + get_file(filp_epoll); + else + filp_tgt = ERR_PTR(-EBADF); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (filp_epoll) { + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); + } else + + if (IS_ERR(filp_tgt)) + return PTR_ERR(filp_tgt); + + return kcmp_ptr(filp, filp_tgt, KCMP_FILE); +} +#else +static int kcmp_epoll_target(struct task_struct *task1, + struct task_struct *task2, + unsigned long idx1, + struct kcmp_epoll_slot __user *uslot) +{ + return -EOPNOTSUPP; +} +#endif + SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, unsigned long, idx1, unsigned long, idx2) { @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, ret = -EOPNOTSUPP; #endif break; + case KCMP_EPOLL_TFD: + ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); + break; default: ret = -EINVAL; break; -- cgit v1.3-14-g43fede From e41d58185f1444368873d4d7422f7664a68be61d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 12 Jul 2017 14:34:35 -0700 Subject: fault-inject: support systematic fault injection Add /proc/self/task//fail-nth file that allows failing 0-th, 1-st, 2-nd and so on calls systematically. Excerpt from the added documentation: "Write to this file of integer N makes N-th call in the current task fail (N is 0-based). Read from this file returns a single char 'Y' or 'N' that says if the fault setup with a previous write to this file was injected or not, and disables the fault if it wasn't yet injected. Note that this file enables all types of faults (slab, futex, etc). This setting takes precedence over all other generic settings like probability, interval, times, etc. But per-capability settings (e.g. fail_futex/ignore-private) take precedence over it. This feature is intended for systematic testing of faults in a single system call. See an example below" Why add a new setting: 1. Existing settings are global rather than per-task. So parallel testing is not possible. 2. attr->interval is close but it depends on attr->count which is non reset to 0, so interval does not work as expected. 3. Trying to model this with existing settings requires manipulations of all of probability, interval, times, space, task-filter and unexposed count and per-task make-it-fail files. 4. Existing settings are per-failure-type, and the set of failure types is potentially expanding. 5. make-it-fail can't be changed by unprivileged user and aggressive stress testing better be done from an unprivileged user. Similarly, this would require opening the debugfs files to the unprivileged user, as he would need to reopen at least times file (not possible to pre-open before dropping privs). The proposed interface solves all of the above (see the example). We want to integrate this into syzkaller fuzzer. A prototype has found 10 bugs in kernel in first day of usage: https://groups.google.com/forum/#!searchin/syzkaller/%22FAULT_INJECTION%22%7Csort:relevance I've made the current interface work with all types of our sandboxes. For setuid the secret sauce was prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) to make /proc entries non-root owned. So I am fine with the current version of the code. [akpm@linux-foundation.org: fix build] Link: http://lkml.kernel.org/r/20170328130128.101773-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Akinobu Mita Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/fault-injection/fault-injection.txt | 78 +++++++++++++++++++++++ fs/proc/base.c | 52 +++++++++++++++ include/linux/sched.h | 1 + kernel/fork.c | 4 ++ lib/fault-inject.c | 7 ++ 5 files changed, 142 insertions(+) (limited to 'kernel') diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 415484f3d59a..192d8cbcc5f9 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -134,6 +134,22 @@ use the boot option: fail_futex= mmc_core.fail_request=,,, +o proc entries + +- /proc/self/task//fail-nth: + + Write to this file of integer N makes N-th call in the current task fail + (N is 0-based). Read from this file returns a single char 'Y' or 'N' + that says if the fault setup with a previous write to this file was + injected or not, and disables the fault if it wasn't yet injected. + Note that this file enables all types of faults (slab, futex, etc). + This setting takes precedence over all other generic debugfs settings + like probability, interval, times, etc. But per-capability settings + (e.g. fail_futex/ignore-private) take precedence over it. + + This feature is intended for systematic testing of faults in a single + system call. See an example below. + How to add new fault injection capability ----------------------------------------- @@ -278,3 +294,65 @@ allocation failure. # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ -- make -C tools/testing/selftests/ run_tests + +Systematic faults using fail-nth +--------------------------------- + +The following code systematically faults 0-th, 1-st, 2-nd and so on +capabilities in the socketpair() system call. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main() +{ + int i, err, res, fail_nth, fds[2]; + char buf[128]; + + system("echo N > /sys/kernel/debug/failslab/ignore-gfp-wait"); + sprintf(buf, "/proc/self/task/%ld/fail-nth", syscall(SYS_gettid)); + fail_nth = open(buf, O_RDWR); + for (i = 0;; i++) { + sprintf(buf, "%d", i); + write(fail_nth, buf, strlen(buf)); + res = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); + err = errno; + read(fail_nth, buf, 1); + if (res == 0) { + close(fds[0]); + close(fds[1]); + } + printf("%d-th fault %c: res=%d/%d\n", i, buf[0], res, err); + if (buf[0] != 'Y') + break; + } + return 0; +} + +An example output: + +0-th fault Y: res=-1/23 +1-th fault Y: res=-1/23 +2-th fault Y: res=-1/23 +3-th fault Y: res=-1/12 +4-th fault Y: res=-1/12 +5-th fault Y: res=-1/23 +6-th fault Y: res=-1/23 +7-th fault Y: res=-1/23 +8-th fault Y: res=-1/12 +9-th fault Y: res=-1/12 +10-th fault Y: res=-1/12 +11-th fault Y: res=-1/12 +12-th fault Y: res=-1/12 +13-th fault Y: res=-1/12 +14-th fault Y: res=-1/12 +15-th fault Y: res=-1/12 +16-th fault N: res=0/12 diff --git a/fs/proc/base.c b/fs/proc/base.c index f1e1927ccd48..88b773f318cd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1355,6 +1355,53 @@ static const struct file_operations proc_fault_inject_operations = { .write = proc_fault_inject_write, .llseek = generic_file_llseek, }; + +static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err, n; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + err = kstrtoint_from_user(buf, count, 10, &n); + if (err) + return err; + if (n < 0 || n == INT_MAX) + return -EINVAL; + current->fail_nth = n + 1; + return count; +} + +static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + int err; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + put_task_struct(task); + if (task != current) + return -EPERM; + if (count < 1) + return -EINVAL; + err = put_user((char)(current->fail_nth ? 'N' : 'Y'), buf); + if (err) + return err; + current->fail_nth = 0; + return 1; +} + +static const struct file_operations proc_fail_nth_operations = { + .read = proc_fail_nth_read, + .write = proc_fail_nth_write, +}; #endif @@ -3311,6 +3358,11 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), + /* + * Operations on the file check that the task is current, + * so we create it with 0666 to support testing under unprivileged user. + */ + REG("fail-nth", 0666, proc_fail_nth_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), diff --git a/include/linux/sched.h b/include/linux/sched.h index 20814b7d7d70..3822d749fc9e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -974,6 +974,7 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; + int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call diff --git a/kernel/fork.c b/kernel/fork.c index d2b9d7c31eaf..ade237a96308 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -573,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) kcov_task_init(tsk); +#ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +#endif + return tsk; free_stack: diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 4ff157159a0d..09ac73c177fd 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -107,6 +107,12 @@ static inline bool fail_stacktrace(struct fault_attr *attr) bool should_fail(struct fault_attr *attr, ssize_t size) { + if (in_task() && current->fail_nth) { + if (--current->fail_nth == 0) + goto fail; + return false; + } + /* No need to check any other properties if the probability is 0 */ if (attr->probability == 0) return false; @@ -134,6 +140,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) if (!fail_stacktrace(attr)) return false; +fail: fail_dump(attr); if (atomic_read(&attr->times) != -1) -- cgit v1.3-14-g43fede From f2e0cff85ed111a3cf24d894c3fa11697dfae628 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:43 -0700 Subject: kernel/watchdog: introduce arch_touch_nmi_watchdog() For architectures that define HAVE_NMI_WATCHDOG, instead of having them provide the complete touch_nmi_watchdog() function, just have them provide arch_touch_nmi_watchdog(). This gives the generic code more flexibility in implementing this function, and arch implementations don't miss out on touching the softlockup watchdog or other generic details. Link: http://lkml.kernel.org/r/20170616065715.18390-3-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/blackfin/include/asm/nmi.h | 2 ++ arch/blackfin/kernel/nmi.c | 2 +- arch/mn10300/include/asm/nmi.h | 2 ++ arch/mn10300/kernel/mn10300-watchdog-low.S | 8 ++++---- arch/mn10300/kernel/mn10300-watchdog.c | 2 +- arch/sparc/include/asm/nmi.h | 1 + arch/sparc/kernel/nmi.c | 6 ++---- include/linux/nmi.h | 27 ++++++++++++++++----------- kernel/watchdog_hld.c | 5 ++--- 9 files changed, 31 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/asm/nmi.h b/arch/blackfin/include/asm/nmi.h index b9caac4fcfd8..107d23705f46 100644 --- a/arch/blackfin/include/asm/nmi.h +++ b/arch/blackfin/include/asm/nmi.h @@ -9,4 +9,6 @@ #include +extern void arch_touch_nmi_watchdog(void); + #endif diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c index 633c37083e87..1e714329fe8a 100644 --- a/arch/blackfin/kernel/nmi.c +++ b/arch/blackfin/kernel/nmi.c @@ -190,7 +190,7 @@ static int __init init_nmi_wdt(void) } device_initcall(init_nmi_wdt); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { atomic_set(&nmi_touched[smp_processor_id()], 1); } diff --git a/arch/mn10300/include/asm/nmi.h b/arch/mn10300/include/asm/nmi.h index f3671cbbc117..b05627597b1b 100644 --- a/arch/mn10300/include/asm/nmi.h +++ b/arch/mn10300/include/asm/nmi.h @@ -11,4 +11,6 @@ #ifndef _ASM_NMI_H #define _ASM_NMI_H +extern void arch_touch_nmi_watchdog(void); + #endif /* _ASM_NMI_H */ diff --git a/arch/mn10300/kernel/mn10300-watchdog-low.S b/arch/mn10300/kernel/mn10300-watchdog-low.S index f2f5c9cfaabd..34f8773de7d0 100644 --- a/arch/mn10300/kernel/mn10300-watchdog-low.S +++ b/arch/mn10300/kernel/mn10300-watchdog-low.S @@ -50,9 +50,9 @@ watchdog_handler: # we can't inline it) # ############################################################################### - .globl touch_nmi_watchdog - .type touch_nmi_watchdog,@function -touch_nmi_watchdog: + .globl arch_touch_nmi_watchdog + .type arch_touch_nmi_watchdog,@function +arch_touch_nmi_watchdog: clr d0 clr d1 mov watchdog_alert_counter, a0 @@ -63,4 +63,4 @@ touch_nmi_watchdog: lne ret [],0 - .size touch_nmi_watchdog,.-touch_nmi_watchdog + .size arch_touch_nmi_watchdog,.-arch_touch_nmi_watchdog diff --git a/arch/mn10300/kernel/mn10300-watchdog.c b/arch/mn10300/kernel/mn10300-watchdog.c index a2d8e6938d67..0d5641beadf5 100644 --- a/arch/mn10300/kernel/mn10300-watchdog.c +++ b/arch/mn10300/kernel/mn10300-watchdog.c @@ -31,7 +31,7 @@ static unsigned int watchdog; static unsigned int watchdog_hz = 1; unsigned int watchdog_alert_counter[NR_CPUS]; -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); /* * the best way to detect whether a CPU has a 'hard lockup' problem diff --git a/arch/sparc/include/asm/nmi.h b/arch/sparc/include/asm/nmi.h index 26ad2b2607c6..284eac3ffaf2 100644 --- a/arch/sparc/include/asm/nmi.h +++ b/arch/sparc/include/asm/nmi.h @@ -7,6 +7,7 @@ void nmi_adjust_hz(unsigned int new_hz); extern atomic_t nmi_active; +void arch_touch_nmi_watchdog(void); void start_nmi_watchdog(void *unused); void stop_nmi_watchdog(void *unused); diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 95e73c63c99d..048ad783ea3f 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -51,7 +51,7 @@ static DEFINE_PER_CPU(unsigned int, last_irq_sum); static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { if (atomic_read(&nmi_active)) { int cpu; @@ -61,10 +61,8 @@ void touch_nmi_watchdog(void) per_cpu(nmi_touch, cpu) = 1; } } - - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static void die_nmi(const char *str, struct pt_regs *regs, int do_panic) { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5e2e57536d98..bd387ef8bccd 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -6,6 +6,9 @@ #include #include +#if defined(CONFIG_HAVE_NMI_WATCHDOG) +#include +#endif #ifdef CONFIG_LOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -58,6 +61,18 @@ static inline void reset_hung_task_detector(void) #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) +#if defined(CONFIG_HARDLOCKUP_DETECTOR) +extern void hardlockup_detector_disable(void); +#else +static inline void hardlockup_detector_disable(void) {} +#endif + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +extern void arch_touch_nmi_watchdog(void); +#else +static inline void arch_touch_nmi_watchdog(void) {} +#endif + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -65,21 +80,11 @@ static inline void reset_hung_task_detector(void) * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. */ -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -#include -extern void touch_nmi_watchdog(void); -#else static inline void touch_nmi_watchdog(void) { + arch_touch_nmi_watchdog(); touch_softlockup_watchdog(); } -#endif - -#if defined(CONFIG_HARDLOCKUP_DETECTOR) -extern void hardlockup_detector_disable(void); -#else -static inline void hardlockup_detector_disable(void) {} -#endif /* * Create trigger_all_cpu_backtrace() out of the arch-provided diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..90d688df6ce1 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -56,7 +56,7 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -void touch_nmi_watchdog(void) +void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have @@ -66,9 +66,8 @@ void touch_nmi_watchdog(void) * going off. */ raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); } -EXPORT_SYMBOL(touch_nmi_watchdog); +EXPORT_SYMBOL(arch_touch_nmi_watchdog); static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, -- cgit v1.3-14-g43fede From 05a4a95279311c3a4633b4277a5d21cfd616c6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:46 -0700 Subject: kernel/watchdog: split up config options Split SOFTLOCKUP_DETECTOR from LOCKUP_DETECTOR, and split HARDLOCKUP_DETECTOR_PERF from HARDLOCKUP_DETECTOR. LOCKUP_DETECTOR implies the general boot, sysctl, and programming interfaces for the lockup detectors. An architecture that wants to use a hard lockup detector must define HAVE_HARDLOCKUP_DETECTOR_PERF or HAVE_HARDLOCKUP_DETECTOR_ARCH. Alternatively an arch can define HAVE_NMI_WATCHDOG, which provides the minimum arch_touch_nmi_watchdog, and it otherwise does its own thing and does not implement the LOCKUP_DETECTOR interfaces. sparc is unusual in that it has started to implement some of the interfaces, but not fully yet. It should probably be converted to a full HAVE_HARDLOCKUP_DETECTOR_ARCH. [npiggin@gmail.com: fix] Link: http://lkml.kernel.org/r/20170617223522.66c0ad88@roar.ozlabs.ibm.com Link: http://lkml.kernel.org/r/20170616065715.18390-4-npiggin@gmail.com Signed-off-by: Nicholas Piggin Reviewed-by: Don Zickus Reviewed-by: Babu Moger Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 25 ++++- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 2 +- arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 2 +- include/linux/nmi.h | 29 +++-- kernel/Makefile | 2 +- kernel/sysctl.c | 31 +++--- kernel/watchdog.c | 243 +++++++++++++++++++++++++++-------------- kernel/watchdog_hld.c | 32 ------ lib/Kconfig.debug | 45 +++++--- 11 files changed, 251 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index cae0958a2298..fb9bd7d36b05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -198,9 +198,6 @@ config HAVE_KPROBES_ON_FTRACE config HAVE_NMI bool -config HAVE_NMI_WATCHDOG - depends on HAVE_NMI - bool # # An arch should select this if it provides all these things: # @@ -288,6 +285,28 @@ config HAVE_PERF_EVENTS_NMI subsystem. Also has support for calculating CPU cycle events to determine how many clock cycles in a given period. +config HAVE_HARDLOCKUP_DETECTOR_PERF + bool + depends on HAVE_PERF_EVENTS_NMI + help + The arch chooses to use the generic perf-NMI-based hardlockup + detector. Must define HAVE_PERF_EVENTS_NMI. + +config HAVE_NMI_WATCHDOG + depends on HAVE_NMI + bool + help + The arch provides a low level NMI watchdog. It provides + asm/nmi.h, and defines its own arch_touch_nmi_watchdog(). + +config HAVE_HARDLOCKUP_DETECTOR_ARCH + bool + select HAVE_NMI_WATCHDOG + help + The arch chooses to provide its own hardlockup detector, which is + a superset of the HAVE_NMI_WATCHDOG. It also conforms to config + interfaces and parameters provided by hardlockup detector subsystem. + config HAVE_PERF_REGS bool help diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7177a3f4f418..63ed758e1d20 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -197,6 +197,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..074a075a9cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -752,7 +752,7 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a18681353d..3d2b8ce54e00 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI + select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c73c9fb281e1..d6f387780849 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,7 +19,7 @@ #include #include -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return (u64)(cpu_khz) * 1000 * watchdog_thresh; diff --git a/include/linux/nmi.h b/include/linux/nmi.h index bd387ef8bccd..8aa01fd859fb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -11,13 +11,21 @@ #endif #ifdef CONFIG_LOCKUP_DETECTOR +void lockup_detector_init(void); +#else +static inline void lockup_detector_init(void) +{ +} +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern unsigned int hardlockup_panic; -void lockup_detector_init(void); +extern int soft_watchdog_enabled; +extern atomic_t watchdog_park_in_progress; #else static inline void touch_softlockup_watchdog_sched(void) { @@ -31,9 +39,6 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } -static inline void lockup_detector_init(void) -{ -} #endif #ifdef CONFIG_DETECT_HUNG_TASK @@ -63,15 +68,18 @@ static inline void reset_hung_task_detector(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +extern unsigned int hardlockup_panic; #else static inline void hardlockup_detector_disable(void) {} #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); #else +#if !defined(CONFIG_HAVE_NMI_WATCHDOG) static inline void arch_touch_nmi_watchdog(void) {} #endif +#endif /** * touch_nmi_watchdog - restart NMI watchdog timeout. @@ -141,15 +149,18 @@ static inline bool trigger_single_cpu_backtrace(int cpu) } #endif -#ifdef CONFIG_LOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh); +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR extern int nmi_watchdog_enabled; -extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; +extern struct cpumask watchdog_cpumask; extern unsigned long *watchdog_cpumask_bits; -extern atomic_t watchdog_park_in_progress; +extern int __read_mostly watchdog_suspended; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index df9f2a367882..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -900,6 +900,14 @@ static struct ctl_table kern_table[] = { .extra2 = &zero, #endif }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &soft_watchdog_enabled, @@ -909,13 +917,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, { .procname = "softlockup_panic", .data = &softlockup_panic, @@ -925,27 +926,29 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_SMP { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#endif /* CONFIG_SMP */ #endif -#ifdef CONFIG_SMP +#ifdef CONFIG_HARDLOCKUP_DETECTOR { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, + .procname = "hardlockup_panic", + .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, @@ -957,6 +960,8 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #endif +#endif + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..1fba9c3d66dc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,15 +29,58 @@ #include #include +/* Watchdog configuration */ static DEFINE_MUTEX(watchdog_proc_mutex); -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +int __read_mostly nmi_watchdog_enabled; + +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | + NMI_WATCHDOG_ENABLED; #else unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif -int __read_mostly nmi_watchdog_enabled; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +#endif + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR int __read_mostly soft_watchdog_enabled; +#endif + int __read_mostly watchdog_user_enabled; int __read_mostly watchdog_thresh = 10; @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; #endif -static struct cpumask watchdog_cpumask __read_mostly; +struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); - /* * The 'watchdog_running' variable is set to 1 when the watchdog threads * are registered/started and is set to 0 when the watchdog threads are @@ -72,7 +109,27 @@ static int __read_mostly watchdog_running; * of 'watchdog_running' cannot change while the watchdog is deactivated * temporarily (see related code in 'proc' handlers). */ -static int __read_mostly watchdog_suspended; +int __read_mostly watchdog_suspended; + +/* + * These functions can be overridden if an architecture implements its + * own hardlockup detector. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} + +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + +/* Helper for online, unparked cpus. */ +#define for_each_watchdog_cpu(cpu) \ + for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) + +atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); static u64 __read_mostly sample_period; @@ -120,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#ifdef CONFIG_HARDLOCKUP_DETECTOR static int __init hardlockup_all_cpu_backtrace_setup(char *str) { sysctl_hardlockup_all_cpu_backtrace = @@ -128,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) } __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); #endif +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -213,18 +272,6 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -/* watchdog detector functions */ -bool is_hardlockup(void) -{ - unsigned long hrint = __this_cpu_read(hrtimer_interrupts); - - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) - return true; - - __this_cpu_write(hrtimer_interrupts_saved, hrint); - return false; -} - static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -237,21 +284,21 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static void watchdog_interrupt_count(void) +/* watchdog detector functions */ +bool is_hardlockup(void) { - __this_cpu_inc(hrtimer_interrupts); -} + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -/* - * These two functions are mostly architecture specific - * defining them as weak here. - */ -int __weak watchdog_nmi_enable(unsigned int cpu) -{ - return 0; + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return true; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return false; } -void __weak watchdog_nmi_disable(unsigned int cpu) + +static void watchdog_interrupt_count(void) { + __this_cpu_inc(hrtimer_interrupts); } static int watchdog_enable_all_cpus(void); @@ -502,57 +549,6 @@ static void watchdog_unpark_threads(void) kthread_unpark(per_cpu(softlockup_watchdog, cpu)); } -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. - */ -int lockup_detector_suspend(void) -{ - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - mutex_unlock(&watchdog_proc_mutex); - - return ret; -} - -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. - */ -void lockup_detector_resume(void) -{ - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); -} - static int update_watchdog_all_cpus(void) { int ret; @@ -604,6 +600,81 @@ static void watchdog_disable_all_cpus(void) } } +#else /* SOFTLOCKUP */ +static int watchdog_park_threads(void) +{ + return 0; +} + +static void watchdog_unpark_threads(void) +{ +} + +static int watchdog_enable_all_cpus(void) +{ + return 0; +} + +static void watchdog_disable_all_cpus(void) +{ +} + +static void set_sample_period(void) +{ +} +#endif /* SOFTLOCKUP */ + +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int lockup_detector_suspend(void) +{ + int ret = 0; + + get_online_cpus(); + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related code in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + else { + watchdog_disable_all_cpus(); + pr_err("Failed to suspend lockup detectors, disabled\n"); + watchdog_enabled = 0; + } + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void lockup_detector_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. + */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); + put_online_cpus(); +} + #ifdef CONFIG_SYSCTL /* @@ -810,9 +881,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ +#ifdef CONFIG_SOFTLOCKUP_DETECTOR if (smpboot_update_cpumask_percpu_thread( &watchdog_threads, &watchdog_cpumask) != 0) pr_err("cpumask update failed\n"); +#endif } } out: diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 90d688df6ce1..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -22,39 +22,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); void arch_touch_nmi_watchdog(void) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f28f4252e54a..b0d01c6d4e03 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -778,34 +778,45 @@ config DEBUG_SHIRQ menu "Debug Lockups and Hangs" config LOCKUP_DETECTOR - bool "Detect Hard and Soft Lockups" + bool + +config SOFTLOCKUP_DETECTOR + bool "Detect Soft Lockups" depends on DEBUG_KERNEL && !S390 + select LOCKUP_DETECTOR help Say Y here to enable the kernel to act as a watchdog to detect - hard and soft lockups. + soft lockups. Softlockups are bugs that cause the kernel to loop in kernel mode for more than 20 seconds, without giving other tasks a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. +config HARDLOCKUP_DETECTOR_PERF + bool + select SOFTLOCKUP_DETECTOR + +# +# arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard +# lockup detector rather than the perf based detector. +# +config HARDLOCKUP_DETECTOR + bool "Detect Hard Lockups" + depends on DEBUG_KERNEL && !S390 + depends on HAVE_HARDLOCKUP_DETECTOR_PERF || HAVE_HARDLOCKUP_DETECTOR_ARCH + select LOCKUP_DETECTOR + select HARDLOCKUP_DETECTOR_PERF if HAVE_HARDLOCKUP_DETECTOR_PERF + select HARDLOCKUP_DETECTOR_ARCH if HAVE_HARDLOCKUP_DETECTOR_ARCH + help + Say Y here to enable the kernel to act as a watchdog to detect + hard lockups. + Hardlockups are bugs that cause the CPU to loop in kernel mode for more than 10 seconds, without letting other interrupts have a chance to run. The current stack trace is displayed upon detection and the system will stay locked up. - The overhead should be minimal. A periodic hrtimer runs to - generate interrupts and kick the watchdog task every 4 seconds. - An NMI is generated every 10 seconds or so to check for hardlockups. - - The frequency of hrtimer and NMI events and the soft and hard lockup - thresholds can be controlled through the sysctl watchdog_thresh. - -config HARDLOCKUP_DETECTOR - def_bool y - depends on LOCKUP_DETECTOR && !HAVE_NMI_WATCHDOG - depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI - config BOOTPARAM_HARDLOCKUP_PANIC bool "Panic (Reboot) On Hard Lockups" depends on HARDLOCKUP_DETECTOR @@ -826,7 +837,7 @@ config BOOTPARAM_HARDLOCKUP_PANIC_VALUE config BOOTPARAM_SOFTLOCKUP_PANIC bool "Panic (Reboot) On Soft Lockups" - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to panic on "soft lockups", which are bugs that cause the kernel to loop in kernel @@ -843,7 +854,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE int - depends on LOCKUP_DETECTOR + depends on SOFTLOCKUP_DETECTOR range 0 1 default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC @@ -851,7 +862,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL - default LOCKUP_DETECTOR + default SOFTLOCKUP_DETECTOR help Say Y here to enable the kernel to detect "hung tasks", which are bugs that cause the task to be stuck in -- cgit v1.3-14-g43fede From a10a842ff81a7e3810817b3b04e4c432b6191e21 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 12 Jul 2017 14:35:49 -0700 Subject: kernel/watchdog: provide watchdog_nmi_reconfigure() for arch watchdogs After reconfiguring watchdog sysctls etc., architecture specific watchdogs may not get all their parameters updated. watchdog_nmi_reconfigure() can be implemented to pull the new values in and set the arch NMI watchdog. [npiggin@gmail.com: add code comments] Link: http://lkml.kernel.org/r/20170617125933.774d3858@roar.ozlabs.ibm.com [arnd@arndb.de: hide unused function] Link: http://lkml.kernel.org/r/20170620204854.966601-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170616065715.18390-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Signed-off-by: Arnd Bergmann Reviewed-by: Don Zickus Tested-by: Babu Moger [sparc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1fba9c3d66dc..cabe3e9fb620 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended; /* * These functions can be overridden if an architecture implements its * own hardlockup detector. + * + * watchdog_nmi_enable/disable can be implemented to start and stop when + * softlockup watchdog threads start and stop. The arch must select the + * SOFTLOCKUP_DETECTOR Kconfig. */ int __weak watchdog_nmi_enable(unsigned int cpu) { @@ -123,6 +127,22 @@ void __weak watchdog_nmi_disable(unsigned int cpu) { } +/* + * watchdog_nmi_reconfigure can be implemented to be notified after any + * watchdog configuration change. The arch hardlockup watchdog should + * respond to the following variables: + * - nmi_watchdog_enabled + * - watchdog_thresh + * - watchdog_cpumask + * - sysctl_hardlockup_all_cpu_backtrace + * - hardlockup_panic + * - watchdog_suspended + */ +void __weak watchdog_nmi_reconfigure(void) +{ +} + + #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* Helper for online, unparked cpus. */ @@ -600,6 +620,14 @@ static void watchdog_disable_all_cpus(void) } } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return smpboot_update_cpumask_percpu_thread( + &watchdog_threads, &watchdog_cpumask); +} +#endif + #else /* SOFTLOCKUP */ static int watchdog_park_threads(void) { @@ -619,6 +647,13 @@ static void watchdog_disable_all_cpus(void) { } +#ifdef CONFIG_SYSCTL +static int watchdog_update_cpus(void) +{ + return 0; +} +#endif + static void set_sample_period(void) { } @@ -651,6 +686,8 @@ int lockup_detector_suspend(void) watchdog_enabled = 0; } + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); return ret; @@ -671,6 +708,8 @@ void lockup_detector_resume(void) if (watchdog_running && !watchdog_suspended) watchdog_unpark_threads(); + watchdog_nmi_reconfigure(); + mutex_unlock(&watchdog_proc_mutex); put_online_cpus(); } @@ -696,6 +735,8 @@ static int proc_watchdog_update(void) else watchdog_disable_all_cpus(); + watchdog_nmi_reconfigure(); + return err; } @@ -881,12 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, * a temporary cpumask, so we are likely not in a * position to do much else to make things better. */ -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask) != 0) + if (watchdog_update_cpus() != 0) pr_err("cpumask update failed\n"); -#endif } + + watchdog_nmi_reconfigure(); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.3-14-g43fede From e2ae8ab4b571e2e4094a28acb60649bc2732c67f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jul 2017 14:35:58 -0700 Subject: kexec_file: adjust declaration of kexec_purgatory Defining kexec_purgatory as a zero-length char array upsets compile time size checking. Since this is built on a per-arch basis, define it as an unsized char array (like is done for other similar things, e.g. linker sections). This silences the warning generated by the future CONFIG_FORTIFY_SOURCE, which did not like the memcmp() of a "0 byte" array. This drops the __weak and uses an extern instead, since both users define kexec_purgatory. Link: http://lkml.kernel.org/r/1497903987-21002-4-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: "Eric W. Biederman" Cc: Daniel Micay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 7 ------- kernel/kexec_internal.h | 2 ++ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c8f7f77e9fa9..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,13 +26,6 @@ #include #include "kexec_internal.h" -/* - * Declare these symbols weak so that if architecture provides a purgatory, - * these will be overridden. - */ -char __weak kexec_purgatory[0]; -size_t __weak kexec_purgatory_size = 0; - static int kexec_calculate_store_digests(struct kimage *image); /* Architectures can provide this probe function */ diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE #include void kimage_file_post_load_cleanup(struct kimage *image); +extern char kexec_purgatory[]; +extern size_t kexec_purgatory_size; #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } #endif /* CONFIG_KEXEC_FILE */ -- cgit v1.3-14-g43fede From 7cd815bce828220deffd1654265f0ef891567774 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 12 Jul 2017 14:36:20 -0700 Subject: fork,random: use get_random_canary() to set tsk->stack_canary Use the ascii-armor canary to prevent unterminated C string overflows from being able to successfully overwrite the canary, even if they somehow obtain the canary value. Inspired by execshield ascii-armor and Daniel Micay's linux-hardened tree. Link: http://lkml.kernel.org/r/20170524155751.424-3-riel@redhat.com Signed-off-by: Rik van Riel Acked-by: Kees Cook Cc: Daniel Micay Cc: "Theodore Ts'o" Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Catalin Marinas Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ade237a96308..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -554,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_long(); + tsk->stack_canary = get_random_canary(); #endif /* -- cgit v1.3-14-g43fede From 69f0d429c413fe96db2c187475cebcc6e3a8c7f5 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 13 Jul 2017 14:18:24 +0800 Subject: locking/rtmutex: Remove unnecessary priority adjustment We don't need to adjust priority before adding a new pi_waiter, the priority only needs to be updated after pi_waiter change or task priority change. Steven Rostedt pointed out: "Interesting, I did some git mining and this was added with the original entry of the rtmutex.c (23f78d4a03c5). Looking at even that version, I don't see the purpose of adjusting the task prio here. It is done before anything changes in the task." Signed-off-by: Alex Shi Reviewed-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mathieu Poirier Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499926704-28841-1-git-send-email-alex.shi@linaro.org [ Enhance the changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; -- cgit v1.3-14-g43fede From 0e4097c3354e2f5a5ad8affd9dc7f7f7d00bb6b9 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Jul 2017 00:40:28 -0700 Subject: sched/cputime: Don't use smp_processor_id() in preemptible context Recent kernels trigger this warning: BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181 caller is debug_smp_processor_id+0x17/0x19 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1 Call Trace: dump_stack+0x82/0xb8 check_preemption_disabled() debug_smp_processor_id() vtime_delta() task_cputime() thread_group_cputime() thread_group_cputime_adjusted() wait_consider_task() do_wait() SYSC_wait4() do_syscall_64() entry_SYSCALL64_slow_path() As Frederic pointed out: | Although those sched_clock_cpu() things seem to only matter when the | sched_clock() is unstable. And that stability is a condition for nohz_full | to work anyway. So probably sched_clock() alone would be enough. This patch fixes it by replacing sched_clock_cpu() with sched_clock() to avoid calling smp_processor_id() in a preemptible context. Reported-by: Xiaolong Ye Signed-off-by: Wanpeng Li Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1499586028-7402-1-git-send-email-wanpeng.li@hotmail.com [ Prettified the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6e3ea4ac1bda..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime) { unsigned long long clock; - clock = sched_clock_cpu(smp_processor_id()); + clock = sched_clock(); if (clock < vtime->starttime) return 0; @@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(smp_processor_id()); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); } @@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); vtime->state = VTIME_SYS; - vtime->starttime = sched_clock_cpu(cpu); + vtime->starttime = sched_clock(); write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } -- cgit v1.3-14-g43fede From 193be41e33168a3a06eb9d356d9e39c69de161d2 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:24:29 -0700 Subject: sched/deadline: Fix confusing comments about selection of top pi-waiter This comment in the code is incomplete, and I believe it begs a definition of dl_boosted to make sense of the condition that follows. Rewrite the comment and also rearrange the condition that follows to reflect the first condition "we have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a typo that follows. Signed-off-by: Joel Fernandes Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170713022429.10307-1-joelaf@google.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) struct sched_dl_entity *pi_se = &p->dl; /* - * Use the scheduling parameters of the top pi-waiter - * task if we have one and its (absolute) deadline is - * smaller than our one... OTW we keep our runtime and - * deadline. + * Use the scheduling parameters of the top pi-waiter task if: + * - we have a top pi-waiter which is a SCHED_DEADLINE task AND + * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is + * smaller than our deadline OR we are a !SCHED_DEADLINE task getting + * boosted due to a SCHED_DEADLINE pi-waiter). + * Otherwise we keep our runtime and deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { + if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceedes its + * that is going to be deboosted, but exceeds its * runtime while doing so. No point in replenishing * it, as it's going to return back to its original * scheduling class after this. -- cgit v1.3-14-g43fede From 5f92a7b0fcd627fbd06ceb1cee3bbe5d08d13356 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jul 2017 14:49:46 -0700 Subject: kernel/watchdog.c: use better pr_fmt prefix After commit 73ce0511c436 ("kernel/watchdog.c: move hardlockup detector to separate file"), 'NMI watchdog' is inappropriate in kernel/watchdog.c, using 'watchdog' only. Link: http://lkml.kernel.org/r/1499928642-48983-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Babu Moger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index cabe3e9fb620..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,7 +9,7 @@ * to those contributors as well. */ -#define pr_fmt(fmt) "NMI watchdog: " fmt +#define pr_fmt(fmt) "watchdog: " fmt #include #include -- cgit v1.3-14-g43fede From 6d7964a722afc8e4f880b947f174009063028c99 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 14 Jul 2017 14:50:11 -0700 Subject: kmod: throttle kmod thread limit If we reach the limit of modprobe_limit threads running the next request_module() call will fail. The original reason for adding a kill was to do away with possible issues with in old circumstances which would create a recursive series of request_module() calls. We can do better than just be super aggressive and reject calls once we've reached the limit by simply making pending callers wait until the threshold has been reduced, and then throttling them in, one by one. This throttling enables requests over the kmod concurrent limit to be processed once a pending request completes. Only the first item queued up to wait is woken up. The assumption here is once a task is woken it will have no other option to also kick the queue to check if there are more pending tasks -- regardless of whether or not it was successful. By throttling and processing only max kmod concurrent tasks we ensure we avoid unexpected fatal request_module() calls, and we keep memory consumption on module loading to a minimum. With x86_64 qemu, with 4 cores, 4 GiB of RAM it takes the following run time to run both tests: time ./kmod.sh -t 0008 real 0m16.366s user 0m0.883s sys 0m8.916s time ./kmod.sh -t 0009 real 0m50.803s user 0m0.791s sys 0m9.852s Link: http://lkml.kernel.org/r/20170628223155.26472-4-mcgrof@kernel.org Signed-off-by: Luis R. Rodriguez Reviewed-by: Petr Mladek Cc: Jessica Yu Cc: Shuah Khan Cc: Rusty Russell Cc: Michal Marek Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 16 +++++++--------- tools/testing/selftests/kmod/kmod.sh | 24 ++---------------------- 2 files changed, 9 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ff68198fe83b..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -68,6 +68,7 @@ static DECLARE_RWSEM(umhelper_sem); */ #define MAX_KMOD_CONCURRENT 50 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); +static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. @@ -140,7 +141,6 @@ int __request_module(bool wait, const char *fmt, ...) va_list args; char module_name[MODULE_NAME_LEN]; int ret; - static int kmod_loop_msg; /* * We don't allow synchronous module loading from async. Module @@ -164,14 +164,11 @@ int __request_module(bool wait, const char *fmt, ...) return ret; if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { - /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg < 5) { - printk(KERN_ERR - "request_module: runaway loop modprobe %s\n", - module_name); - kmod_loop_msg++; - } - return -ENOMEM; + pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", + atomic_read(&kmod_concurrent_max), + MAX_KMOD_CONCURRENT, module_name); + wait_event_interruptible(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0); } trace_module_request(module_name, wait, _RET_IP_); @@ -179,6 +176,7 @@ int __request_module(bool wait, const char *fmt, ...) ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_inc(&kmod_concurrent_max); + wake_up(&kmod_wq); return ret; } diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh index 10196a62ed09..8cecae9a8bca 100644 --- a/tools/testing/selftests/kmod/kmod.sh +++ b/tools/testing/selftests/kmod/kmod.sh @@ -59,28 +59,8 @@ ALL_TESTS="$ALL_TESTS 0004:1:1" ALL_TESTS="$ALL_TESTS 0005:10:1" ALL_TESTS="$ALL_TESTS 0006:10:1" ALL_TESTS="$ALL_TESTS 0007:5:1" - -# Disabled tests: -# -# 0008 x 150 - multithreaded - push kmod_concurrent over max_modprobes for request_module()" -# Current best-effort failure interpretation: -# Enough module requests get loaded in place fast enough to reach over the -# max_modprobes limit and trigger a failure -- before we're even able to -# start processing pending requests. -ALL_TESTS="$ALL_TESTS 0008:150:0" - -# 0009 x 150 - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()" -# Current best-effort failure interpretation: -# -# get_fs_type() requests modules using aliases as such the optimization in -# place today to look for already loaded modules will not take effect and -# we end up requesting a new module to load, this bumps the kmod_concurrent, -# and in certain circumstances can lead to pushing the kmod_concurrent over -# the max_modprobe limit. -# -# This test fails much easier than test 0008 since the alias optimizations -# are not in place. -ALL_TESTS="$ALL_TESTS 0009:150:0" +ALL_TESTS="$ALL_TESTS 0008:150:1" +ALL_TESTS="$ALL_TESTS 0009:150:1" test_modprobe() { -- cgit v1.3-14-g43fede From a696712c3dd54eb58d2c5a807b4aaa27782d80d6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 17 Jul 2017 19:47:02 +0200 Subject: genirq/PM: Properly pretend disabled state when force resuming interrupts Interrupts with the IRQF_FORCE_RESUME flag set have also the IRQF_NO_SUSPEND flag set. They are not disabled in the suspend path, but must be forcefully resumed. That's used by XEN to keep IPIs enabled beyond the suspension of device irqs. Force resume works by pretending that the interrupt was disabled and then calling __irq_enable(). Incrementing the disabled depth counter was enough to do that, but with the recent changes which use state flags to avoid unnecessary hardware access, this is not longer sufficient. If the state flags are not set, then the hardware callbacks are not invoked and the interrupt line stays disabled in "hardware". Set the disabled and masked state when pretending that an interrupt got disabled by suspend. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Suggested-by: Thomas Gleixner Signed-off-by: Juergen Gross Signed-off-by: Thomas Gleixner Cc: xen-devel@lists.xenproject.org Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/20170717174703.4603-2-jgross@suse.com --- kernel/irq/chip.c | 10 ---------- kernel/irq/internals.h | 10 ++++++++++ kernel/irq/pm.c | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d171bc57e1e0..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } -static void irq_state_set_disabled(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); -} - static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } -static void irq_state_set_masked(struct irq_desc *desc) -{ - irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); -} - static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dbfba9933ed2..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +static inline void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static inline void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + #undef __irqd_to_state static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) /* Pretend that it got disabled ! */ desc->depth++; + irq_state_set_disabled(desc); + irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); -- cgit v1.3-14-g43fede From 848618857d2535176037bdc085f8d012d907071f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Wed, 12 Jul 2017 19:14:16 -0700 Subject: tracing/ring_buffer: Try harder to allocate ftrace can fail to allocate per-CPU ring buffer on systems with a large number of CPUs coupled while large amounts of cache happening in the page cache. Currently the ring buffer allocation doesn't retry in the VM implementation even if direct-reclaim made some progress but still wasn't able to find a free page. On retrying I see that the allocations almost always succeed. The retry doesn't happen because __GFP_NORETRY is used in the tracer to prevent the case where we might OOM, however if we drop __GFP_NORETRY, we risk destabilizing the system if OOM killer is triggered. To prevent this situation, use the __GFP_RETRY_MAYFAIL flag introduced recently [1]. Tested the following still succeeds without destabilizing a system with 1GB memory. echo 300000 > /sys/kernel/debug/tracing/buffer_size_kb [1] https://marc.info/?l=linux-mm&m=149820805124906&w=2 Link: http://lkml.kernel.org/r/20170713021416.8897-1-joelaf@google.com Cc: Tim Murray Cc: Ingo Molnar Cc: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) for (i = 0; i < nr_pages; i++) { struct page *page; /* - * __GFP_NORETRY flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is - * not destabilized. + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, + GFP_KERNEL | __GFP_RETRY_MAYFAIL, cpu_to_node(cpu)); if (!bpage) goto free_pages; @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) list_add(&bpage->list, pages); page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); + GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); if (!page) goto free_pages; bpage->page = page_address(page); -- cgit v1.3-14-g43fede From b0659ae5e30074ede1dc08f2c6d64f0c11d64e0f Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 18 Jul 2017 14:37:24 +0800 Subject: audit: fix memleak in auditd_send_unicast_skb. Found this issue by kmemleak report, auditd_send_unicast_skb did not free skb if rcu_dereference(auditd_conn) returns null. unreferenced object 0xffff88082568ce00 (size 256): comm "auditd", pid 1119, jiffies 4294708499 backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_node+0xcc/0x210 [] __alloc_skb+0x5d/0x290 [] audit_make_reply+0x54/0xd0 [] audit_receive_msg+0x967/0xd70 ---------------- (gdb) list *audit_receive_msg+0x967 0xffffffff8113dff7 is in audit_receive_msg (kernel/audit.c:1133). 1132 skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); --------------- [] audit_receive+0x52/0xa0 [] netlink_unicast+0x181/0x240 [] netlink_sendmsg+0x2c2/0x3b0 [] sock_sendmsg+0x38/0x50 [] SYSC_sendto+0x102/0x190 [] SyS_sendto+0xe/0x10 [] entry_SYSCALL_64_fastpath+0x1a/0xa5 [] 0xffffffffffffffff Signed-off-by: Shu Wang Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7cad70214b81..07def5e49cc9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -641,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) ac = rcu_dereference(auditd_conn); if (!ac) { rcu_read_unlock(); + kfree_skb(skb); rc = -ECONNREFUSED; goto err; } -- cgit v1.3-14-g43fede From 3bda69c1c3993a2bddbae01397d12bfef6054011 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 18 Jul 2017 14:08:34 +0300 Subject: perf/core: Fix scheduling regression of pinned groups Vince Weaver reported: > I was tracking down some regressions in my perf_event_test testsuite. > Some of the tests broke in the 4.11-rc1 timeframe. > > I've bisected one of them, this report is about > tests/overflow/simul_oneshot_group_overflow > This test creates an event group containing two sampling events, set > to overflow to a signal handler (which disables and then refreshes the > event). > > On a good kernel you get the following: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 946 (perf::instructions/1000000) > fd 4 overflows: 473 (perf::instructions/2000000) > Ending counts: > Count 0: 946379875 > Count 1: 946365218 > > With the broken kernels you get: > Event perf::instructions with period 1000000 > Event perf::instructions with period 2000000 > fd 3 overflows: 938 (perf::instructions/1000000) > fd 4 overflows: 318 (perf::instructions/2000000) > Ending counts: > Count 0: 946373080 > Count 1: 653373058 The root cause of the bug is that the following commit: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") erronously assumed that event's 'pinned' setting determines whether the event belongs to a pinned group or not, but in fact, it's the group leader's pinned state that matters. This was discovered by Vince in the test case described above, where two instruction counters are grouped, the group leader is pinned, but the other event is not; in the regressed case the counters were off by 33% (the difference between events' periods), but should be the same within the error margin. Fix the problem by looking at the group leader's pinning. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 487f05e18a ("perf/core: Optimize event rescheduling on active contexts") Link: http://lkml.kernel.org/r/87lgnmvw7h.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9747e422ab20..c9cdbd396770 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1452,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) lockdep_assert_held(&ctx->lock); + /* + * It's 'group type', really, because if our group leader is + * pinned, so are we. + */ + if (event->group_leader != event) + event = event->group_leader; + event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; -- cgit v1.3-14-g43fede From db9108e054700c96322b0f0028546aa4e643cf0b Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Thu, 20 Jul 2017 18:36:09 +0800 Subject: tracing: Fix kmemleak in instance_rmdir Hit the kmemleak when executing instance_rmdir, it forgot releasing mem of tracing_cpumask. With this fix, the warn does not appear any more. unreferenced object 0xffff93a8dfaa7c18 (size 8): comm "mkdir", pid 1436, jiffies 4294763622 (age 9134.308s) hex dump (first 8 bytes): ff ff ff ff ff ff ff ff ........ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc_node+0xf1/0x280 [] alloc_cpumask_var_node+0x23/0x30 [] alloc_cpumask_var+0xe/0x10 [] instance_mkdir+0x90/0x240 [] tracefs_syscall_mkdir+0x40/0x70 [] vfs_mkdir+0x109/0x1b0 [] SyS_mkdir+0xd0/0x100 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1500546969-12594-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: ccfe9e42e451 ("tracing: Make tracing_cpumask available for all instances") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d0ffcc49dba..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7774,6 +7774,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); -- cgit v1.3-14-g43fede From f86f418059b94aa01f9342611a272ca60c583e89 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 7 Jun 2017 16:12:51 +0800 Subject: trace: fix the errors caused by incompatible type of RCU variables The variables which are processed by RCU functions should be annotated as RCU, otherwise sparse will report the errors like below: "error: incompatible types in comparison expression (different address spaces)" Link: http://lkml.kernel.org/r/1496823171-7758-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang [ Updated to not be 100% 80 column strict ] Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- include/linux/trace_events.h | 2 +- kernel/trace/ftrace.c | 41 +++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 +++--- 4 files changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5857390ac35a..6383115e9d2c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,8 +145,8 @@ enum { #ifdef CONFIG_DYNAMIC_FTRACE /* The hash used to know what functions callbacks trace */ struct ftrace_ops_hash { - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_hash __rcu *notrace_hash; + struct ftrace_hash __rcu *filter_hash; struct mutex regex_lock; }; @@ -168,7 +168,7 @@ static inline void ftrace_free_init_mem(void) { } */ struct ftrace_ops { ftrace_func_t func; - struct ftrace_ops *next; + struct ftrace_ops __rcu *next; unsigned long flags; void *private; ftrace_func_t saved_func; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index f73cedfa2e0b..536c80ff7ad9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -338,7 +338,7 @@ enum { struct trace_event_file { struct list_head list; struct trace_event_call *event_call; - struct event_filter *filter; + struct event_filter __rcu *filter; struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53f6b6401cf0..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) mutex_lock(&ftrace_lock); - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) + for (ops = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + ops != &ftrace_list_end; + ops = rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock))) cnt++; mutex_unlock(&ftrace_lock); @@ -275,10 +278,11 @@ static void update_ftrace_function(void) * If there's only one ftrace_ops registered, the ftrace_ops_list * will point to the ops we want. */ - set_function_trace_op = ftrace_ops_list; + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); /* If there's no ftrace_ops registered, just call the stub function */ - if (ftrace_ops_list == &ftrace_list_end) { + if (set_function_trace_op == &ftrace_list_end) { func = ftrace_stub; /* @@ -286,7 +290,8 @@ static void update_ftrace_function(void) * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - } else if (ftrace_ops_list->next == &ftrace_list_end) { + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { func = ftrace_ops_get_list_func(ftrace_ops_list); } else { @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) return ftrace_trace_function == ftrace_ops_list_func; } -static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { - ops->next = *list; + rcu_assign_pointer(ops->next, *list); + /* * We are entering ops into the list but another * CPU might be walking that list. We need to make sure @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) rcu_assign_pointer(*list, ops); } -static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) { struct ftrace_ops **p; @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) * If we are removing the last function, then simply point * to the ftrace_stub. */ - if (*list == ops && ops->next == &ftrace_list_end) { + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { *list = &ftrace_list_end; return 0; } @@ -1569,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); if (hash_contains_ip(ip, &hash)) ret = 1; @@ -2840,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If there's no more ops registered with ftrace, run a * sanity check to make sure all rec flags are cleared. */ - if (ftrace_ops_list == &ftrace_list_end) { + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -6453,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) update_ftrace_function(); ftrace_startup_sysctl(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6ade1c55cc3a..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1210,9 +1210,9 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred *preds; - struct filter_pred *root; - char *filter_string; + struct filter_pred __rcu *preds; + struct filter_pred __rcu *root; + char *filter_string; }; struct event_subsystem { -- cgit v1.3-14-g43fede From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Jul 2017 00:00:21 +0200 Subject: bpf: fix mixed signed/unsigned derived min/max value bounds Edward reported that there's an issue in min/max value bounds tracking when signed and unsigned compares both provide hints on limits when having unknown variables. E.g. a program such as the following should have been rejected: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff8a94cda93400 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 11: (65) if r1 s> 0x1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1 R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 14: (b7) r0 = 0 15: (95) exit What happens is that in the first part ... 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = -1 10: (2d) if r1 > r2 goto pc+3 ... r1 carries an unsigned value, and is compared as unsigned against a register carrying an immediate. Verifier deduces in reg_set_min_max() that since the compare is unsigned and operation is greater than (>), that in the fall-through/false case, r1's minimum bound must be 0 and maximum bound must be r2. Latter is larger than the bound and thus max value is reset back to being 'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now 'R1=inv,min_value=0'. The subsequent test ... 11: (65) if r1 s> 0x1 goto pc+2 ... is a signed compare of r1 with immediate value 1. Here, verifier deduces in reg_set_min_max() that since the compare is signed this time and operation is greater than (>), that in the fall-through/false case, we can deduce that r1's maximum bound must be 1, meaning with prior test, we result in r1 having the following state: R1=inv,min_value=0,max_value=1. Given that the actual value this holds is -8, the bounds are wrongly deduced. When this is being added to r0 which holds the map_value(_adj) type, then subsequent store access in above case will go through check_mem_access() which invokes check_map_access_adj(), that will then probe whether the map memory is in bounds based on the min_value and max_value as well as access size since the actual unknown value is min_value <= x <= max_value; commit fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{, _adj} register types") provides some more explanation on the semantics. It's worth to note in this context that in the current code, min_value and max_value tracking are used for two things, i) dynamic map value access via check_map_access_adj() and since commit 06c1c049721a ("bpf: allow helpers access to variable memory") ii) also enforced at check_helper_mem_access() when passing a memory address (pointer to packet, map value, stack) and length pair to a helper and the length in this case is an unknown value defining an access range through min_value/max_value in that case. The min_value/max_value tracking is /not/ used in the direct packet access case to track ranges. However, the issue also affects case ii), for example, the following crafted program based on the same principle must be rejected as well: 0: (b7) r2 = 0 1: (bf) r3 = r10 2: (07) r3 += -512 3: (7a) *(u64 *)(r10 -16) = -8 4: (79) r4 = *(u64 *)(r10 -16) 5: (b7) r6 = -1 6: (2d) if r4 > r6 goto pc+5 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 7: (65) if r4 s> 0x1 goto pc+4 R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512 R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp 8: (07) r4 += 1 9: (b7) r5 = 0 10: (6a) *(u16 *)(r10 -512) = 0 11: (85) call bpf_skb_load_bytes#26 12: (b7) r0 = 0 13: (95) exit Meaning, while we initialize the max_value stack slot that the verifier thinks we access in the [1,2] range, in reality we pass -7 as length which is interpreted as u32 in the helper. Thus, this issue is relevant also for the case of helper ranges. Resetting both bounds in check_reg_overflow() in case only one of them exceeds limits is also not enough as similar test can be created that uses values which are within range, thus also here learned min value in r1 is incorrect when mixed with later signed test to create a range: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff880ad081fa00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+7 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+3 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (65) if r1 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 12: (0f) r0 += r1 13: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4 R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 14: (b7) r0 = 0 15: (95) exit This leaves us with two options for fixing this: i) to invalidate all prior learned information once we switch signed context, ii) to track min/max signed and unsigned boundaries separately as done in [0]. (Given latter introduces major changes throughout the whole verifier, it's rather net-next material, thus this patch follows option i), meaning we can derive bounds either from only signed tests or only unsigned tests.) There is still the case of adjust_reg_min_max_vals(), where we adjust bounds on ALU operations, meaning programs like the following where boundaries on the reg get mixed in context later on when bounds are merged on the dst reg must get rejected, too: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (18) r1 = 0xffff89b2bf87ce00 5: (85) call bpf_map_lookup_elem#1 6: (15) if r0 == 0x0 goto pc+6 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp 7: (7a) *(u64 *)(r10 -16) = -8 8: (79) r1 = *(u64 *)(r10 -16) 9: (b7) r2 = 2 10: (3d) if r2 >= r1 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp 11: (b7) r7 = 1 12: (65) if r7 s> 0x0 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp 13: (b7) r0 = 0 14: (95) exit from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp 15: (0f) r7 += r1 16: (65) if r7 s> 0x4 goto pc+2 R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 17: (0f) r0 += r7 18: (72) *(u8 *)(r0 +0) = 0 R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp 19: (b7) r0 = 0 20: (95) exit Meaning, in adjust_reg_min_max_vals() we must also reset range values on the dst when src/dst registers have mixed signed/ unsigned derived min/max value bounds with one unbounded value as otherwise they can be added together deducing false boundaries. Once both boundaries are established from either ALU ops or compare operations w/o mixing signed/unsigned insns, then they can safely be added to other regs also having both boundaries established. Adding regs with one unbounded side to a map value where the bounded side has been learned w/o mixing ops is possible, but the resulting map value won't recover from that, meaning such op is considered invalid on the time of actual access. Invalid bounds are set on the dst reg in case i) src reg, or ii) in case dst reg already had them. The only way to recover would be to perform i) ALU ops but only 'add' is allowed on map value types or ii) comparisons, but these are disallowed on pointers in case they span a range. This is fine as only BPF_JEQ and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers which potentially turn them into PTR_TO_MAP_VALUE type depending on the branch, so only here min/max value cannot be invalidated for them. In terms of state pruning, value_from_signed is considered as well in states_equal() when dealing with adjusted map values. With regards to breaking existing programs, there is a small risk, but use-cases are rather quite narrow where this could occur and mixing compares probably unlikely. Joint work with Josef and Edward. [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Edward Cree Signed-off-by: Daniel Borkmann Signed-off-by: Edward Cree Signed-off-by: Josef Bacik Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 621076f56251..8e5d31f6faef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,7 @@ struct bpf_reg_state { u32 min_align; u32 aux_off; u32 aux_off_align; + bool value_from_signed; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].value_from_signed = false; regs[regno].min_align = 0; } @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } -static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +static bool __is_pointer_value(bool allow_ptr_leaks, + const struct bpf_reg_state *reg) { - if (env->allow_ptr_leaks) + if (allow_ptr_leaks) return false; - switch (env->cur_state.regs[regno].type) { + switch (reg->type) { case UNKNOWN_VALUE: case CONST_IMM: return false; @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } } +static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +{ + return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); +} + static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { @@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_align = dst_reg->min_align; /* We don't know anything about what was done to this register, mark it - * as unknown. + * as unknown. Also, if both derived bounds came from signed/unsigned + * mixed compares and one side is unbounded, we cannot really do anything + * with them as boundaries cannot be trusted. Thus, arithmetic of two + * regs of such kind will get invalidated bounds on the dst side. */ - if (min_val == BPF_REGISTER_MIN_RANGE && - max_val == BPF_REGISTER_MAX_RANGE) { + if ((min_val == BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (BPF_SRC(insn->code) == BPF_X && + ((min_val != BPF_REGISTER_MIN_RANGE && + max_val == BPF_REGISTER_MAX_RANGE) || + (min_val == BPF_REGISTER_MIN_RANGE && + max_val != BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && + dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || + (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && + dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && + regs[insn->dst_reg].value_from_signed != + regs[insn->src_reg].value_from_signed)) { reset_reg_range_values(regs, insn->dst_reg); return; } @@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; regs[insn->dst_reg].min_align = calc_align(insn->imm); + regs[insn->dst_reg].value_from_signed = false; } } else if (opcode > BPF_END) { @@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum val is val, * otherwise we know the min val is val+1. */ false_reg->max_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val + 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - false_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + false_reg->min_value = 0; + } /* If this is false then we know the maximum value is val - 1, * otherwise we know the mimimum value is val. */ false_reg->max_value = val - 1; + false_reg->value_from_signed = value_from_signed; true_reg->min_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg @@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, u8 opcode) { + bool value_from_signed = true; + bool is_range = true; + switch (opcode) { case BPF_JEQ: /* If this is false then we know nothing Jon Snow, but if it is * true then we know for sure. */ true_reg->max_value = true_reg->min_value = val; + is_range = false; break; case BPF_JNE: /* If this is true we know nothing Jon Snow, but if it is false * we know the value for sure; */ false_reg->max_value = false_reg->min_value = val; + is_range = false; break; case BPF_JGT: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGT: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGT) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* * If this is false, then the val is <= the register, if it is * true the register <= to the val. */ false_reg->min_value = val; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val - 1; + true_reg->value_from_signed = value_from_signed; break; case BPF_JGE: - /* Unsigned comparison, the minimum value is 0. */ - true_reg->min_value = 0; + value_from_signed = false; /* fallthrough */ case BPF_JSGE: + if (true_reg->value_from_signed != value_from_signed) + reset_reg_range_values(true_reg, 0); + if (false_reg->value_from_signed != value_from_signed) + reset_reg_range_values(false_reg, 0); + if (opcode == BPF_JGE) { + /* Unsigned comparison, the minimum value is 0. */ + true_reg->min_value = 0; + } /* If this is false then constant < register, if it is true then * the register < constant. */ false_reg->min_value = val + 1; + false_reg->value_from_signed = value_from_signed; true_reg->max_value = val; + true_reg->value_from_signed = value_from_signed; break; default: break; @@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(false_reg); check_reg_overflow(true_reg); + if (is_range) { + if (__is_pointer_value(false, false_reg)) + reset_reg_range_values(false_reg, 0); + if (__is_pointer_value(false, true_reg)) + reset_reg_range_values(true_reg, 0); + } } static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, -- cgit v1.3-14-g43fede From 2aeb1883547626d82c597cce2c99f0b9c62e2425 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Jul 2017 16:14:55 +0200 Subject: perf/core: Fix locking for children siblings group read We're missing ctx lock when iterating children siblings within the perf_read path for group reading. Following race and crash can happen: User space doing read syscall on event group leader: T1: perf_read lock event->ctx->mutex perf_read_group lock leader->child_mutex __perf_read_group_add(child) list_for_each_entry(sub, &leader->sibling_list, group_entry) ----> sub might be invalid at this point, because it could get removed via perf_event_exit_task_context in T2 Child exiting and cleaning up its events: T2: perf_event_exit_task_context lock ctx->mutex list_for_each_entry_safe(child_event, next, &child_ctx->event_list,... perf_event_exit_event(child) lock ctx->lock perf_group_detach(child) unlock ctx->lock ----> child is removed from sibling_list without any sync with T1 path above ... free_event(child) Before the child is removed from the leader's child_list, (and thus is omitted from perf_read_group processing), we need to ensure that perf_read_group touches child's siblings under its ctx->lock. Peter further notes: | One additional note; this bug got exposed by commit: | | ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") | | which made it possible to actually trigger this code-path. Tested-by: Andi Kleen Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: ba5213ae6b88 ("perf/core: Correct event creation with PERF_FORMAT_GROUP") Link: http://lkml.kernel.org/r/20170720141455.2106-1-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd396770..c17c0881fd36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4372,7 +4372,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { + struct perf_event_context *ctx = leader->ctx; struct perf_event *sub; + unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -4402,12 +4404,15 @@ static int __perf_read_group_add(struct perf_event *leader, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_for_each_entry(sub, &leader->sibling_list, group_entry) { values[n++] += perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); } + raw_spin_unlock_irqrestore(&ctx->lock, flags); return 0; } -- cgit v1.3-14-g43fede