From be96b316deff35e119760982c43af74e606fa143 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 28 Oct 2017 09:49:37 -0700 Subject: perf/cgroup: Fix perf cgroup hierarchy support The following commit: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") made list_update_cgroup_event() skip setting cpuctx->cgrp if no cgroup event targets %current's cgroup. This breaks perf_event's hierarchical support because events which target one of the ancestors get ignored. Fix it by using cgroup_is_descendant() test instead of equality. Signed-off-by: Tejun Heo Acked-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: David Carrillo-Cisneros Cc: Linus Torvalds Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: stable@vger.kernel.org # v4.9+ Fixes: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") Link: http://lkml.kernel.org/r/20171028164237.GA972780@devbig577.frc2.facebook.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9d93db81fa36..10cdb9c26b5d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -901,9 +901,11 @@ list_update_cgroup_event(struct perf_event *event, cpuctx_entry = &cpuctx->cgrp_cpuctx_entry; /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/ if (add) { + struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); + list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list)); - if (perf_cgroup_from_task(current, ctx) == event->cgrp) - cpuctx->cgrp = event->cgrp; + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) + cpuctx->cgrp = cgrp; } else { list_del(cpuctx_entry); cpuctx->cgrp = NULL; -- cgit v1.2.3-59-g8ed1b From cef572ad9bd7f85035ba8272e5352040e8be0152 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Sat, 28 Oct 2017 11:07:28 +0800 Subject: workqueue: Fix NULL pointer dereference When queue_work() is used in irq (not in task context), there is a potential case that trigger NULL pointer dereference. ---------------------------------------------------------------- worker_thread() |-spin_lock_irq() |-process_one_work() |-worker->current_pwq = pwq |-spin_unlock_irq() |-worker->current_func(work) |-spin_lock_irq() |-worker->current_pwq = NULL |-spin_unlock_irq() //interrupt here |-irq_handler |-__queue_work() //assuming that the wq is draining |-is_chained_work(wq) |-current_wq_worker() //Here, 'current' is the interrupted worker! |-current->current_pwq is NULL here! |-schedule() ---------------------------------------------------------------- Avoid it by checking for task context in current_wq_worker(), and if not in task context, we shouldn't use the 'current' to check the condition. Reported-by: Xiaofei Tan Signed-off-by: Li Bin Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Fixes: 8d03ecfe4718 ("workqueue: reimplement is_chained_work() using current_wq_worker()") Cc: stable@vger.kernel.org # v3.9+ --- kernel/workqueue_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417c587b..29fa81f0f51a 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include #include +#include struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } -- cgit v1.2.3-59-g8ed1b From 153fbd1226fb30b8630802aa5047b8af5ef53c9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Oct 2017 11:18:53 +0100 Subject: futex: Fix more put_pi_state() vs. exit_pi_state_list() races Dmitry (through syzbot) reported being able to trigger the WARN in get_pi_state() and a use-after-free on: raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); Both are due to this race: exit_pi_state_list() put_pi_state() lock(&curr->pi_lock) while() { pi_state = list_first_entry(head); hb = hash_futex(&pi_state->key); unlock(&curr->pi_lock); dec_and_test(&pi_state->refcount); lock(&hb->lock) lock(&pi_state->pi_mutex.wait_lock) // uaf if pi_state free'd lock(&curr->pi_lock); .... unlock(&curr->pi_lock); get_pi_state(); // WARN; refcount==0 The problem is we take the reference count too late, and don't allow it being 0. Fix it by using inc_not_zero() and simply retrying the loop when we fail to get a refcount. In that case put_pi_state() should remove the entry from the list. Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Gratian Crisan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: dvhart@infradead.org Cc: syzbot Cc: syzkaller-bugs@googlegroups.com Cc: Fixes: c74aef2d06a9 ("futex: Fix pi_state->owner serialization") Link: http://lkml.kernel.org/r/20171031101853.xpfh72y643kdfhjs@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/futex.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 0518a0bfc746..ca5bb9cba5cf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -903,11 +903,27 @@ void exit_pi_state_list(struct task_struct *curr) */ raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { - next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. + */ + if (!atomic_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); @@ -918,8 +934,10 @@ void exit_pi_state_list(struct task_struct *curr) * task still owns the PI-state: */ if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); + put_pi_state(pi_state); continue; } @@ -927,9 +945,8 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - raw_spin_unlock(&curr->pi_lock); - get_pi_state(pi_state); + raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); -- cgit v1.2.3-59-g8ed1b From 9c388a5ed1960b2ebbebd3dbe7553092b0c15ec1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Oct 2017 22:32:00 +0100 Subject: watchdog/harclockup/perf: Revert a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Guenter reported a crash in the watchdog/perf code, which is caused by cleanup() and enable() running concurrently. The reason for this is: The watchdog functions are serialized via the watchdog_mutex and cpu hotplug locking, but the enable of the perf based watchdog happens in context of the unpark callback of the smpboot thread. But that unpark function is not synchronous inside the locking. The unparking of the thread just wakes it up and leaves so there is no guarantee when the thread is executing. If it starts running _before_ the cleanup happened then it will create a event and overwrite the dead event pointer. The new event is then cleaned up because the event is marked dead. lock(watchdog_mutex); lockup_detector_reconfigure(); cpus_read_lock(); stop(); park() update(); start(); unpark() cpus_read_unlock(); thread runs() overwrite dead event ptr cleanup(); free new event, which is active inside perf.... unlock(watchdog_mutex); The park side is safe as that actually waits for the thread to reach parked state. Commit a33d44843d45 removed the protection against this kind of scenario under the stupid assumption that the hotplug serialization and the watchdog_mutex cover everything. Bring it back. Reverts: a33d44843d45 ("watchdog/hardlockup/perf: Simplify deferred event destroy") Reported-and-tested-by: Guenter Roeck Signed-off-by: Thomas Feels-stupid Gleixner Cc: Peter Zijlstra Cc: Don Zickus Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1710312145190.1942@nanos --- kernel/watchdog_hld.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71a62ceacdc8..a7f137c1933a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,6 +21,7 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; @@ -203,6 +204,8 @@ void hardlockup_detector_perf_disable(void) if (event) { perf_event_disable(event); + this_cpu_write(watchdog_ev, NULL); + this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); watchdog_cpus--; } @@ -218,7 +221,7 @@ void hardlockup_detector_perf_cleanup(void) int cpu; for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = per_cpu(dead_event, cpu); /* * Required because for_each_cpu() reports unconditionally @@ -226,7 +229,7 @@ void hardlockup_detector_perf_cleanup(void) */ if (event) perf_event_release_kernel(event); - per_cpu(watchdog_ev, cpu) = NULL; + per_cpu(dead_event, cpu) = NULL; } cpumask_clear(&dead_events_mask); } -- cgit v1.2.3-59-g8ed1b From 42f930da7f00c0ab23df4c7aed36137f35988980 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 1 Nov 2017 14:11:27 -0400 Subject: watchdog/hardlockup/perf: Use atomics to track in-use cpu counter Guenter reported: There is still a problem. When running echo 6 > /proc/sys/kernel/watchdog_thresh echo 5 > /proc/sys/kernel/watchdog_thresh repeatedly, the message NMI watchdog: Enabled. Permanently consumes one hw-PMU counter. stops after a while (after ~10-30 iterations, with fluctuations). Maybe watchdog_cpus needs to be atomic ? That's correct as this again is affected by the asynchronous nature of the smpboot thread unpark mechanism. CPU 0 CPU1 CPU2 write(watchdog_thresh, 6) stop() park() update() start() unpark() thread->unpark() cnt++; write(watchdog_thresh, 5) thread->unpark() stop() park() thread->park() cnt--; cnt++; update() start() unpark() That's not a functional problem, it just affects the informational message. Convert watchdog_cpus to atomic_t to prevent the problem Reported-and-tested-by: Guenter Roeck Signed-off-by: Don Zickus Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: https://lkml.kernel.org/r/20171101181126.j727fqjmdthjz4xk@redhat.com --- kernel/watchdog_hld.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index a7f137c1933a..a84b205fac9a 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt #include +#include #include #include @@ -25,7 +26,7 @@ static DEFINE_PER_CPU(struct perf_event *, dead_event); static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; -static unsigned int watchdog_cpus; +static atomic_t watchdog_cpus = ATOMIC_INIT(0); void arch_touch_nmi_watchdog(void) { @@ -189,7 +190,8 @@ void hardlockup_detector_perf_enable(void) if (hardlockup_detector_event_create()) return; - if (!watchdog_cpus++) + /* use original value for check */ + if (!atomic_fetch_inc(&watchdog_cpus)) pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); perf_event_enable(this_cpu_read(watchdog_ev)); @@ -207,7 +209,7 @@ void hardlockup_detector_perf_disable(void) this_cpu_write(watchdog_ev, NULL); this_cpu_write(dead_event, event); cpumask_set_cpu(smp_processor_id(), &dead_events_mask); - watchdog_cpus--; + atomic_dec(&watchdog_cpus); } } -- cgit v1.2.3-59-g8ed1b From d62d813c0d714a2d0aaf3d796a7a51ae60bf5470 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 3 Nov 2017 13:36:42 +0000 Subject: cpufreq: schedutil: Examine the correct CPU when we update util After commit 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) we stopped to always read the utilization for the CPU we are running the governor on, and instead we read it for the CPU which we've been told has updated utilization. This is stored in sugov_cpu->cpu. The value is set in sugov_register() but we clear it in sugov_start() which leads to always looking at the utilization of CPU0 instead of the correct one. Fix this by consolidating the initialization code into sugov_start(). Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks) Signed-off-by: Chris Redpath Reviewed-by: Patrick Bellasi Reviewed-by: Brendan Jackman Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9209d83ecdcf..ba0da243fdd8 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -649,6 +649,7 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; @@ -714,11 +715,6 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(sugov_cpu, cpu).cpu = cpu; - return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); -- cgit v1.2.3-59-g8ed1b