From b421b22b00b0011f6a2ce3561176c4e79e640c49 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:14:13 +0200 Subject: x86/tsc, sched/clock, clocksource: Use clocksource watchdog to provide stable sync points Currently we keep sched_clock_tick() active for stable TSC in order to keep the per-CPU state semi up-to-date. The (obvious) problem is that by the time we detect TSC is borked, our per-CPU state is also borked. So hook into the clocksource watchdog and call a method after we've found it to still be stable. There's the obvious race where the TSC goes wonky between finding it stable and us running the callback, but closing that is too much work and not really worth it, since we're already detecting TSC wobbles after the fact, so we cannot, per definition, fully avoid funny clock values. And since the watchdog runs less often than the tick, this is also an optimization. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/time') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) continue; } + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { -- cgit v1.2.3-59-g8ed1b From ac1e843f0900bea92fcb47f6205e1f9ffb0d469c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Apr 2017 12:26:23 +0200 Subject: sched/clock: Remove unused argument to sched_clock_idle_wakeup_event() The argument to sched_clock_idle_wakeup_event() has not been used in a long time. Remove it. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- include/linux/sched/clock.h | 4 ++-- kernel/sched/clock.c | 4 ++-- kernel/time/tick-sched.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/time') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c1b16b328abe..a3b544264360 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -182,7 +182,7 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n->data[1] = data; done: - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); local_irq_restore(flags); } diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 978cbb0af5f3..9c36f0722966 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -39,7 +39,7 @@ static inline void sched_clock_idle_sleep_event(void) { } -static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +static inline void sched_clock_idle_wakeup_event(void) { } @@ -66,7 +66,7 @@ extern u64 __sched_clock_offset; extern void sched_clock_tick(void); extern void sched_clock_tick_stable(void); extern void sched_clock_idle_sleep_event(void); -extern void sched_clock_idle_wakeup_event(u64 delta_ns); +extern void sched_clock_idle_wakeup_event(void); /* * As outlined in clock.c, provides a fast, high resolution, nanosecond diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index f861637f7fdc..750a92c9db7e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -410,9 +410,9 @@ void sched_clock_idle_sleep_event(void) EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); /* - * We just idled delta nanoseconds (called with irqs disabled): + * We just idled; resync with ktime. 
(called with irqs disabled): */ -void sched_clock_idle_wakeup_event(u64 delta_ns) +void sched_clock_idle_wakeup_event(void) { if (timekeeping_suspended) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..9c2dc64e31d8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -554,7 +554,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); + sched_clock_idle_wakeup_event(); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) -- cgit v1.2.3-59-g8ed1b From 3c85d6db5e5f05ae6c3d7f5a0ceceb43746a5ca7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:00 +0200 Subject: sched/loadavg: Generalize "_idle" naming to "_nohz" The loadavg naming code still assumes that nohz == idle whereas its code is actually handling well both nohz idle and nohz full. So let's fix the naming according to what the code actually does, to unconfuse the reader. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- Documentation/trace/ftrace.txt | 2 +- include/linux/sched/nohz.h | 8 +++---- kernel/sched/loadavg.c | 51 +++++++++++++++++++++--------------------- kernel/time/tick-sched.c | 4 ++-- 4 files changed, 33 insertions(+), 32 deletions(-) (limited to 'kernel/time') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 94a987bd2bc5..fff8ff6d4893 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1609,7 +1609,7 @@ Doing the same with chrt -r 5 and function-trace set. 
-0 3dN.2 14us : sched_avg_update <-__cpu_load_update -0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz -0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock - -0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit + -0 3dN.1 15us : calc_load_nohz_stop <-tick_nohz_idle_exit -0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit -0 3dN.1 15us : hrtimer_cancel <-tick_nohz_idle_exit -0 3dN.1 15us : hrtimer_try_to_cancel <-hrtimer_cancel diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 4995b717500b..7d3f75db23e5 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -23,11 +23,11 @@ static inline void set_cpu_sd_state_idle(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON -void calc_load_enter_idle(void); -void calc_load_exit_idle(void); +void calc_load_nohz_start(void); +void calc_load_nohz_stop(void); #else -static inline void calc_load_enter_idle(void) { } -static inline void calc_load_exit_idle(void) { } +static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * load-average relies on per-cpu sampling from the tick, it is affected by * NO_HZ. * - * The basic idea is to fold the nr_active delta into a global idle-delta upon + * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon * entering NO_HZ state such that we can include this as an 'extra' cpu delta * when we read the global state. * @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * - When we go NO_HZ idle during the window, we can negate our sample * contribution, causing under-accounting. 
* - * We avoid this by keeping two idle-delta counters and flipping them + * We avoid this by keeping two NO_HZ-delta counters and flipping them * when the window starts, thus separating old and new NO_HZ load. * * The only trick is the slight shift in index flip for read vs write. @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) * r:0 0 1 1 0 0 1 1 0 * w:0 1 1 0 0 1 1 0 0 * - * This ensures we'll fold the old idle contribution in this window while + * This ensures we'll fold the old NO_HZ contribution in this window while * accumlating the new one. * - * - When we wake up from NO_HZ idle during the window, we push up our + * - When we wake up from NO_HZ during the window, we push up our * contribution, since we effectively move our sample point to a known * busy state. * * This is solved by pushing the window forward, and thus skipping the - * sample, for this cpu (effectively using the idle-delta for this cpu which + * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which * was in effect at the time the window opened). This also solves the issue - * of having to deal with a cpu having been in NOHZ idle for multiple - * LOAD_FREQ intervals. + * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ + * intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_idle[2]; +static atomic_long_t calc_load_nohz[2]; static int calc_load_idx; static inline int calc_load_write_idx(void) @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) /* * If the folding window started, make sure we start writing in the - * next idle-delta. + * next NO_HZ-delta. 
*/ if (!time_before(jiffies, READ_ONCE(calc_load_update))) idx++; @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_enter_idle(void) +void calc_load_nohz_start(void) { struct rq *this_rq = this_rq(); long delta; /* - * We're going into NOHZ mode, if there's any pending delta, fold it - * into the pending idle delta. + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. */ delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); - atomic_long_add(delta, &calc_load_idle[idx]); + atomic_long_add(delta, &calc_load_nohz[idx]); } } -void calc_load_exit_idle(void) +void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_fold_idle(void) +static long calc_load_nohz_fold(void) { int idx = calc_load_read_idx(); long delta = 0; - if (atomic_long_read(&calc_load_idle[idx])) - delta = atomic_long_xchg(&calc_load_idle[idx], 0); + if (atomic_long_read(&calc_load_nohz[idx])) + delta = atomic_long_xchg(&calc_load_nohz[idx], 0); return delta; } @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, /* * NO_HZ can leave us missing all per-cpu ticks calling - * calc_load_account_active(), but since an idle CPU folds its delta into - * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold - * in the pending idle delta if our idle period crossed a load cycle boundary. + * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into + * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold + * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. * * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. 
@@ -330,7 +330,7 @@ static void calc_global_nohz(void) } /* - * Flip the idle index... + * Flip the NO_HZ index... * * Make sure we first write the new time then flip the index, so that * calc_load_write_idx() will see the new time when it reads the new @@ -341,7 +341,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_fold_idle(void) { return 0; } +static inline long calc_load_nohz_fold(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) return; /* - * Fold the 'old' idle-delta to include all NO_HZ cpus. + * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. */ - delta = calc_load_fold_idle(); + delta = calc_load_nohz_fold(); if (delta) atomic_long_add(delta, &calc_load_tasks); @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); /* - * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. + * In case we went to NO_HZ for multiple LOAD_FREQ intervals + * catch up in bulk. 
*/ calc_global_nohz(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9c2dc64e31d8..b1b58a07e042 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -783,7 +783,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, */ if (!ts->tick_stopped) { nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + calc_load_nohz_start(); cpu_load_update_nohz_start(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -823,7 +823,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) */ timer_clear_idle(); - calc_load_exit_idle(); + calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3-59-g8ed1b From a0db971e4eb69fc84eb3d7ef94f718b483550b4a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 19 Jun 2017 04:12:01 +0200 Subject: nohz: Move idle balancer registration to the idle path The idle load balancing registration path assumes that we only stop the tick when the CPU is idle, ignoring the nohz full case. As a result, a nohz full CPU that is running a task may be chosen to perform idle load balancing. Let's make sure that only CPUs in dynticks idle mode can be picked as idle load balancers. Signed-off-by: Frederic Weisbecker Acked-by: Rik van Riel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1497838322-10913-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel/time') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b1b58a07e042..db023e9cbb25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -782,7 +782,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, * the scheduler tick in nohz_restart_sched_tick. 
*/ if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); calc_load_nohz_start(); cpu_load_update_nohz_start(); @@ -923,8 +922,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ts->idle_expires = expires; } - if (!was_stopped && ts->tick_stopped) + if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; + nohz_balance_enter_idle(cpu); + } } } -- cgit v1.2.3-59-g8ed1b