From d689fe222a858c767cb8594faf280048e532b53f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Nov 2013 21:01:57 +0100 Subject: NOHZ: Check for nohz active instead of nohz enabled RCU and the fine grained idle time accounting functions check tick_nohz_enabled. But that variable is merily telling that NOHZ has been enabled in the config and not been disabled on the command line. But it does not tell anything about nohz being active. That's what all this should check for. Matthew reported, that the idle accounting on his old P1 machine showed bogus values, when he enabled NOHZ in the config and did not disable it on the kernel command line. The reason is that his machine uses (refined) jiffies as a clocksource which explains why the "fine" grained accounting went into lala land, because it depends on when the system goes and leaves idle relative to the jiffies increment. Provide a tick_nohz_active indicator and let RCU and the accounting code use this instead of tick_nohz_enable. Reported-and-tested-by: Matthew Whitehead Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Reviewed-by: Paul E. McKenney Cc: john.stultz@linaro.org Cc: mwhitehe@redhat.com Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311132052240.30673@ionos.tec.linutronix.de --- kernel/rcu/tree_plugin.h | 4 ++-- kernel/time/tick-sched.c | 21 +++++++++------------ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6abb03dff5c0..08a765232432 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1632,7 +1632,7 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_enabled; +extern int tick_nohz_active; /* * Try to advance callbacks for all flavors of RCU on the current CPU, but @@ -1729,7 +1729,7 @@ static void rcu_prepare_for_idle(int cpu) int tne; /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); + tne = ACCESS_ONCE(tick_nohz_active); if (tne != rdtp->tick_nohz_enabled_snap) { if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..a12df5abde0b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -361,8 +361,8 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -int tick_nohz_enabled __read_mostly = 1; - +static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -465,7 +465,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, idle; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -506,7 +506,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, iowait; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return -1; now = ktime_get(); @@ -799,11 +799,6 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); - /* - * set ts->inidle unconditionally. even if the system did not - * switch to nohz mode the cpu frequency governers rely on the - * update of the idle time accounting in tick_nohz_start_idle(). - */ ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -973,7 +968,7 @@ static void tick_nohz_switch_to_nohz(void) struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t next; - if (!tick_nohz_enabled) + if (!tick_nohz_active) return; local_irq_disable(); @@ -981,7 +976,7 @@ static void tick_nohz_switch_to_nohz(void) local_irq_enable(); return; } - + tick_nohz_active = 1; ts->nohz_mode = NOHZ_MODE_LOWRES; /* @@ -1139,8 +1134,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + tick_nohz_active = 1; + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.3-8-gc7d7 From da554eba2e68c8ec051977db5ee1f42d384a01ed Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 15 Nov 2013 14:15:31 -0800 Subject: timer: Convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...) Use the helper function instead of __GFP_ZERO. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 6582b82fa966..accfd241b9e5 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1518,9 +1518,8 @@ static int init_timers_cpu(int cpu) /* * The APs use this path later in boot */ - base = kmalloc_node(sizeof(*base), - GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + base = kzalloc_node(sizeof(*base), GFP_KERNEL, + cpu_to_node(cpu)); if (!base) return -ENOMEM; -- cgit v1.3-8-gc7d7 From 050ded1bbaea3331745cf2782315f5bc2582d083 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2013 14:15:33 -0800 Subject: tick: Document tick_do_timer_cpu Taken straight from a tglx email ;) Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0e..162b03ab0ad2 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,6 +33,21 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; + +/* + * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR + * which is responsible for calling do_timer(), i.e. the timekeeping stuff. This + * variable has two functions: + * + * 1) Prevent a thundering herd issue of a gazillion of CPUs trying to grab the + * timekeeping lock all at once. Only the CPU which is assigned to do the + * update is handling it. + * + * 2) Hand off the duty in the NOHZ idle case by setting the value to + * TICK_DO_TIMER_NONE, i.e. a non existing CPU. So the next cpu which looks + * at it will take over and keep the time keeping alive. The handover + * procedure also covers cpu hotplug. + */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; /* -- cgit v1.3-8-gc7d7 From d5b5f391d434c5cc8bcb1ab2d759738797b85f52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 16:23:04 +0100 Subject: ftrace, perf: Avoid infinite event generation loop Vince's perf-trinity fuzzer found yet another 'interesting' problem. When we sample the irq_work_exit tracepoint with period==1 (or PERF_SAMPLE_PERIOD) and we add an fasync SIGNAL handler we create an infinite event generation loop: ,-> | irq_work_exit() -> | trace_irq_work_exit() -> | ... | __perf_event_overflow() -> (due to fasync) | irq_work_queue() -> (irq_work_list must be empty) '--------- arch_irq_work_raise() Similar things can happen due to regular poll() wakeups if we exceed the ring-buffer wakeup watermark, or have an event_limit. To avoid this, dis-allow sampling this particular tracepoint. In order to achieve this, create a special perf_perm function pointer for each event and call this (when set) on trying to create a tracepoint perf event. [ roasted: use expr... to allow for ',' in your expression ] Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Dave Jones Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20131114152304.GC5364@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/trace/irq_vectors.h | 11 +++++++++++ include/linux/ftrace_event.h | 16 ++++++++++++++++ include/linux/tracepoint.h | 4 ++++ include/trace/ftrace.h | 7 +++++++ kernel/trace/trace_event_perf.c | 6 ++++++ 5 files changed, 44 insertions(+) (limited to 'kernel') diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 2874df24e7a4..4cab890007a7 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -71,6 +71,17 @@ DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); */ DEFINE_IRQ_VECTOR_EVENT(irq_work); +/* + * We must dis-allow sampling irq_work_exit() because perf event sampling + * itself can cause irq_work, which would lead to an infinite loop; + * + * 1) irq_work_exit happens + * 2) generates perf sample + * 3) generates irq_work + * 4) goto 1 + */ +TRACE_EVENT_PERF_PERM(irq_work_exit, is_sampling_event(p_event) ? -EPERM : 0); + /* * call_function - called when entering/exiting a call function interrupt * vector handler diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 9abbe630c456..8c9b7a1c4138 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -248,6 +248,9 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + + int (*perf_perm)(struct ftrace_event_call *, + struct perf_event *); #endif }; @@ -317,6 +320,19 @@ struct ftrace_event_file { } \ early_initcall(trace_init_flags_##name); +#define __TRACE_EVENT_PERF_PERM(name, expr...) \ + static int perf_perm_##name(struct ftrace_event_call *tp_event, \ + struct perf_event *p_event) \ + { \ + return ({ expr; }); \ + } \ + static int __init trace_init_perf_perm_##name(void) \ + { \ + event_##name.perf_perm = &perf_perm_##name; \ + return 0; \ + } \ + early_initcall(trace_init_perf_perm_##name); + #define PERF_MAX_TRACE_SIZE 2048 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index ebeab360d851..f16dc0a40049 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -267,6 +267,8 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FLAGS(event, flag) +#define TRACE_EVENT_PERF_PERM(event, expr...) + #endif /* DECLARE_TRACE */ #ifndef TRACE_EVENT @@ -399,4 +401,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FLAGS(event, flag) +#define TRACE_EVENT_PERF_PERM(event, expr...) + #endif /* ifdef TRACE_EVENT (see note above) */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 52594b20179e..6b852f60f8ae 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -90,6 +90,10 @@ #define TRACE_EVENT_FLAGS(name, value) \ __TRACE_EVENT_FLAGS(name, value) +#undef TRACE_EVENT_PERF_PERM +#define TRACE_EVENT_PERF_PERM(name, expr...) \ + __TRACE_EVENT_PERF_PERM(name, expr) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) @@ -140,6 +144,9 @@ #undef TRACE_EVENT_FLAGS #define TRACE_EVENT_FLAGS(event, flag) +#undef TRACE_EVENT_PERF_PERM +#define TRACE_EVENT_PERF_PERM(event, expr...) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 78e27e3b52ac..630889f68b1d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,12 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) -- cgit v1.3-8-gc7d7 From 06db0b21712f878b808480ef31097637013bbf0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Sep 2013 13:14:47 +0200 Subject: perf: Remove fragile swevent hlist optimization Currently we only allocate a single cpu hashtable for per-cpu swevents; do away with this optimization for it is fragile in the face of things like perf_pmu_migrate_context(). The easiest thing is to make sure all CPUs are consistent wrt state. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130913111447.GN31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d724e7757cd1..72348dc192c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); -- cgit v1.3-8-gc7d7 From 0022cedd4a7d8a87841351e2b018bb6794cf2e67 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 15 Nov 2013 12:39:45 -0500 Subject: perf/trace: Properly use u64 to hold event_id The 64-bit attr.config value for perf trace events was being copied into an "int" before doing a comparison, meaning the top 32 bits were being truncated. As far as I can tell this didn't cause any errors, but it did mean it was possible to create valid aliases for all the tracepoint ids which I don't think was intended. (For example, 0xffffffff00000018 and 0x18 both enable the same tracepoint). Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1311151236100.11932@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 630889f68b1d..e854f420e033 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -179,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); -- cgit v1.3-8-gc7d7 From 9abf24d465180f5f2eb26a43545348262f16b771 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 12 Nov 2013 22:11:26 +0530 Subject: sched: Check sched_domain before computing group power After commit 863bffc80898 ("sched/fair: Fix group power_orig computation"), we can dereference rq->sd before it is set. Fix this by falling back to power_of() in this case and add a comment explaining things. Signed-off-by: Srikar Dronamraju [ Added comment and tweaked patch. ] Signed-off-by: Peter Zijlstra Cc: mikey@neuling.org Link: http://lkml.kernel.org/r/20131113151718.GN21461@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8b652ebe027..fd773ade1a31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5379,10 +5379,31 @@ void update_group_power(struct sched_domain *sd, int cpu) */ for_each_cpu(cpu, sched_group_cpus(sdg)) { - struct sched_group *sg = cpu_rq(cpu)->sd->groups; + struct sched_group_power *sgp; + struct rq *rq = cpu_rq(cpu); - power_orig += sg->sgp->power_orig; - power += sg->sgp->power; + /* + * build_sched_domains() -> init_sched_groups_power() + * gets here before we've attached the domains to the + * runqueues. + * + * Use power_of(), which is set irrespective of domains + * in update_cpu_power(). + * + * This avoids power/power_orig from being 0 and + * causing divide-by-zero issues on boot. + * + * Runtime updates will correct power_orig. + */ + if (unlikely(!rq->sd)) { + power_orig += power_of(cpu); + power += power_of(cpu); + continue; + } + + sgp = rq->sd->groups->sgp; + power_orig += sgp->power_orig; + power += sgp->power; } } else { /* -- cgit v1.3-8-gc7d7 From 42eb088ed246a5a817bb45a8b32fe234cf1c0f8b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Nov 2013 16:41:49 +0100 Subject: sched: Avoid NULL dereference on sd_busy Commit 37dc6b50cee9 ("sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus") forgot to clear 'sd_busy' under some conditions leading to a possible NULL deref in set_cpu_sd_state_idle(). Reported-by: Anton Blanchard Cc: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131118113701.GF3866@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1808606ee5f..a1591ca7eb5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4910,8 +4910,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); + sd = sd->parent; /* sd_busy */ } + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; -- cgit v1.3-8-gc7d7 From 0515973ffb16c2852a1bb1df2ca1456556faaaa5 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sun, 17 Nov 2013 12:12:36 +0900 Subject: sched: Fix a trivial typo in comments Fix a trivial typo in rq_attach_root(). Signed-off-by: Shigeru Yoshida Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131117.121236.1990617639803941055.shigeru.yoshida@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a1591ca7eb5a..718730dd0480 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4762,7 +4762,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpumask_clear_cpu(rq->cpu, old_rd->span); /* - * If we dont want to free the old_rt yet then + * If we dont want to free the old_rd yet then * set old_rd to NULL to skip the freeing later * in this function: */ -- cgit v1.3-8-gc7d7 From 4be77398ac9d948773116b6be4a3c91b3d6ea18c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 22 Nov 2013 11:44:51 -0800 Subject: time: Fix 1ns/tick drift w/ GENERIC_TIME_VSYSCALL_OLD Since commit 1e75fa8be9f (time: Condense timekeeper.xtime into xtime_sec - merged in v3.6), there has been an problem with the error accounting in the timekeeping code, such that when truncating to nanoseconds, we round up to the next nsec, but the balancing adjustment to the ntp_error value was dropped. This causes 1ns per tick drift forward of the clock. In 3.7, this logic was isolated to only GENERIC_TIME_VSYSCALL_OLD architectures (s390, ia64, powerpc). The fix is simply to balance the accounting and to subtract the added nanosecond from ntp_error. This allows the internal long-term clock steering to keep the clock accurate. While this fix removes the regression added in 1e75fa8be9f, the ideal solution is to move away from GENERIC_TIME_VSYSCALL_OLD and use the new VSYSCALL method, which avoids entirely the nanosecond granular rounding, and the resulting short-term clock adjustment oscillation needed to keep long term accurate time. [ jstultz: Many thanks to Martin for his efforts identifying this subtle bug, and providing the fix. ] Originally-from: Martin Schwidefsky Cc: Tony Luck Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Andy Lutomirski Cc: Paul Turner Cc: Steven Rostedt Cc: Richard Cochran Cc: Prarit Bhargava Cc: Fenghua Yu Cc: Thomas Gleixner Cc: stable #v3.6+ Link: http://lkml.kernel.org/r/1385149491-20307-1-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf53418b67..87b4f00284c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1347,7 +1347,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) tk->xtime_nsec -= remainder; tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; - + tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) -- cgit v1.3-8-gc7d7 From ac01810c9d2814238f08a227062e66a35a0e1ea2 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Mon, 25 Nov 2013 19:39:47 +0530 Subject: irq: Enable all irqs unconditionally in irq_resume When the system enters suspend, it disables all interrupts in suspend_device_irqs(), including the interrupts marked EARLY_RESUME. On the resume side things are different. The EARLY_RESUME interrupts are reenabled in sys_core_ops->resume and the non EARLY_RESUME interrupts are reenabled in the normal system resume path. When suspend_noirq() failed or suspend is aborted for any other reason, we might omit the resume side call to sys_core_ops->resume() and therefor the interrupts marked EARLY_RESUME are not reenabled and stay disabled forever. To solve this, enable all irqs unconditionally in irq_resume() regardless whether interrupts marked EARLY_RESUMEhave been already enabled or not. This might try to reenable already enabled interrupts in the non failure case, but the only affected platform is XEN and it has been confirmed that it does not cause any side effects. [ tglx: Massaged changelog. ] Signed-off-by: Laxman Dewangan Acked-by-and-tested-by: Konrad Rzeszutek Wilk Acked-by: Heiko Stuebner Reviewed-by: Pavel Machek Cc: Cc: Cc: Cc: Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1385388587-16442-1-git-send-email-ldewangan@nvidia.com Signed-off-by: Thomas Gleixner --- kernel/irq/pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cb228bf21760..abcd6ca86cb7 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -50,7 +50,7 @@ static void resume_irqs(bool want_early) bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; - if (is_early != want_early) + if (!is_early && want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); -- cgit v1.3-8-gc7d7 From 32e475d76a3e40879cd9ee4f69b19615062280d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Nov 2013 12:41:44 +0100 Subject: sched: Expose preempt_schedule_irq() Tony reported that aa0d53260596 ("ia64: Use preempt_schedule_irq") broke PREEMPT=n builds on ia64. Ok, wrapped my brain around it. I tripped over the magic asm foo which has a single need_resched check and schedule point for both sys call return and interrupt return. So you need the schedule_preempt_irq() for kernel preemption from interrupt return while on a normal syscall preemption a schedule would be sufficient. But using schedule_preempt_irq() is not harmful here in any way. It just sets the preempt_active bit also in cases where it would not be required. Even on preempt=n kernels adding the preempt_active bit is completely harmless. So instead of having an extra function, moving the existing one out of the ifdef PREEMPT looks like the sanest thing to do. It would also allow getting rid of various other sti/schedule/cli asm magic in other archs. Reported-and-Tested-by: Tony Luck Fixes: aa0d53260596 ("ia64: Use preempt_schedule_irq") Signed-off-by: Thomas Gleixner [slightly edited Changelog] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1311211230030.30673@ionos.tec.linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 718730dd0480..e85cda20ab2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,6 +2660,7 @@ asmlinkage void __sched notrace preempt_schedule(void) } while (need_resched()); } EXPORT_SYMBOL(preempt_schedule); +#endif /* CONFIG_PREEMPT */ /* * this is the entry point to schedule() from kernel preemption @@ -2693,8 +2694,6 @@ asmlinkage void __sched preempt_schedule_irq(void) exception_exit(prev_state); } -#endif /* CONFIG_PREEMPT */ - int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { -- cgit v1.3-8-gc7d7 From 0e576acbc1d9600cf2d9b4a141a2554639959d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Nov 2013 12:18:13 +0100 Subject: nohz: Fix another inconsistency between CONFIG_NO_HZ=n and nohz=off If CONFIG_NO_HZ=n tick_nohz_get_sleep_length() returns NSEC_PER_SEC/HZ. If CONFIG_NO_HZ=y and the nohz functionality is disabled via the command line option "nohz=off" or not enabled due to missing hardware support, then tick_nohz_get_sleep_length() returns 0. That happens because ts->sleep_length is never set in that case. Set it to NSEC_PER_SEC/HZ when the NOHZ mode is inactive. Reported-by: Michal Hocko Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a12df5abde0b..ea20f7d1ac2c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -711,8 +711,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) { + ts->sleep_length = (ktime_t) { .tv64 = NSEC_PER_SEC/HZ }; return false; + } if (need_resched()) return false; -- cgit v1.3-8-gc7d7 From 3ccb01239201af06a07482ec686b14cd148102a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Dec 2013 12:41:20 -0500 Subject: tracing: Only run synchronize_sched() at instance deletion time It has been reported that boot up with FTRACE_SELFTEST enabled can take a very long time. There can be stalls of over a minute. This was tracked down to the synchronize_sched() called when a system call event is disabled. As the self tests enable and disable thousands of events, this makes the synchronize_sched() get called thousands of times. The synchornize_sched() was added with d562aff93bfb53 "tracing: Add support for SOFT_DISABLE to syscall events" which caused this regression (added in 3.13-rc1). The synchronize_sched() is to protect against the events being accessed when a tracer instance is being deleted. When an instance is being deleted all the events associated to it are unregistered. The synchronize_sched() makes sure that no more users are running when it finishes. Instead of calling synchronize_sched() for all syscall events, we only need to call it once, after the events are unregistered and before the instance is deleted. The event_mutex is held during this action to prevent new users from enabling events. Link: http://lkml.kernel.org/r/20131203124120.427b9661@gandalf.local.home Reported-by: Petr Mladek Acked-by: Tom Zanussi Acked-by: Petr Mladek Tested-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_syscalls.c | 10 ---------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f919a2e21bf3..a11800ae96de 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2314,6 +2314,9 @@ int event_trace_del_tracer(struct trace_array *tr) /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + /* Access to events are within rcu_read_lock_sched() */ + synchronize_sched(); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e4b6d11bdf78..ea90eb5f6f17 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -431,11 +431,6 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -474,11 +469,6 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); - /* - * Callers expect the event to be completely disabled on - * return, so wait for current handlers to finish. - */ - synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) -- cgit v1.3-8-gc7d7