From d9c0ffcabd6aae7ff1e34e8078354c13bb9f1183 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2018 18:29:41 +0200 Subject: sched/nohz: Skip remote tick on idle task entirely Some people have reported that the warning in sched_tick_remote() occasionally triggers, especially in favour of some RCU-Torture pressure: WARNING: CPU: 11 PID: 906 at kernel/sched/core.c:3138 sched_tick_remote+0xb6/0xc0 Modules linked in: CPU: 11 PID: 906 Comm: kworker/u32:3 Not tainted 4.18.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Workqueue: events_unbound sched_tick_remote RIP: 0010:sched_tick_remote+0xb6/0xc0 Code: e8 0f 06 b8 00 c6 03 00 fb eb 9d 8b 43 04 85 c0 75 8d 48 8b 83 e0 0a 00 00 48 85 c0 75 81 eb 88 48 89 df e8 bc fe ff ff eb aa <0f> 0b eb +c5 66 0f 1f 44 00 00 bf 17 00 00 00 e8 b6 2e fe ff 0f b6 Call Trace: process_one_work+0x1df/0x3b0 worker_thread+0x44/0x3d0 kthread+0xf3/0x130 ? set_worker_desc+0xb0/0xb0 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x35/0x40 This happens when the remote tick applies on an idle task. Usually the idle_cpu() check avoids that, but it is performed before we lock the runqueue and it is therefore racy. It was intended to be that way in order to prevent from useless runqueue locks since idle task tick callback is a no-op. Now if the racy check slips out of our hands and we end up remotely ticking an idle task, the empty task_tick_idle() is harmless. Still it won't pass the WARN_ON_ONCE() test that ensures rq_clock_task() is not too far from curr->se.exec_start because update_curr_idle() doesn't update the exec_start value like other scheduler policies. Hence the reported false positive. So let's have another check, while the rq is locked, to make sure we don't remote tick on an idle task. The lockless idle_cpu() still applies to avoid unecessary rq lock contention. Reported-by: Jacek Tomaka Reported-by: Paul E. McKenney Reported-by: Anna-Maria Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1530203381-31234-1-git-send-email-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78d8facba456..22fce36426c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3113,7 +3113,9 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); + struct task_struct *curr; struct rq_flags rf; + u64 delta; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3122,24 +3124,28 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { - struct task_struct *curr; - u64 delta; + if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + goto out_requeue; - rq_lock_irq(rq, &rf); - update_rq_clock(rq); - curr = rq->curr; - delta = rq_clock_task(rq) - curr->se.exec_start; + rq_lock_irq(rq, &rf); + curr = rq->curr; + if (is_idle_task(curr)) + goto out_unlock; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); - curr->sched_class->task_tick(rq, curr, 0); - rq_unlock_irq(rq, &rf); - } + update_rq_clock(rq); + delta = rq_clock_task(rq) - curr->se.exec_start; + + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + curr->sched_class->task_tick(rq, curr, 0); + +out_unlock: + rq_unlock_irq(rq, &rf); +out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough -- cgit v1.2.3-59-g8ed1b From 296b2ffe7fa9ed756c41415c6b1512bc4ad687b1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Jun 2018 15:53:22 +0200 Subject: sched/rt: Fix call to cpufreq_update_util() With commit: 8f111bc357aa ("cpufreq/schedutil: Rewrite CPUFREQ_RT support") the schedutil governor uses rq->rt.rt_nr_running to detect whether an RT task is currently running on the CPU and to set frequency to max if necessary. cpufreq_update_util() is called in enqueue/dequeue_top_rt_rq() but rq->rt.rt_nr_running has not been updated yet when dequeue_top_rt_rq() is called so schedutil still considers that an RT task is running when the last task is dequeued. The update of rq->rt.rt_nr_running happens later in dequeue_rt_stack(). In fact, we can take advantage of the sequence that the dequeue then re-enqueue rt entities when a rt task is enqueued or dequeued; As a result enqueue_top_rt_rq() is always called when a task is enqueued or dequeued and also when groups are throttled or unthrottled. The only place that not use enqueue_top_rt_rq() is when root rt_rq is throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: efault@gmx.de Cc: juri.lelli@redhat.com Cc: patrick.bellasi@arm.com Cc: viresh.kumar@linaro.org Fixes: 8f111bc357aa ('cpufreq/schedutil: Rewrite CPUFREQ_RT support') Link: http://lkml.kernel.org/r/1530021202-21695-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/rt.c | 16 ++++++++++------ kernel/sched/sched.h | 5 +++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3cde46483f0a..c907fde01eaa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - if (rq->rt.rt_nr_running) + if (rt_rq_is_runnable(&rq->rt)) return sg_cpu->max; /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 47556b0c9a95..572567078b60 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) rt_se = rt_rq->tg->rt_se[cpu]; - if (!rt_se) + if (!rt_se) { dequeue_top_rt_rq(rt_rq); + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_rt_rq(rt_rq), 0); + } else if (on_rt_rq(rt_se)) dequeue_rt_entity(rt_se, 0); } @@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq) sub_nr_running(rq, rt_rq->rt_nr_running); rt_rq->rt_queued = 0; - /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, 0); } static void @@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) if (rt_rq->rt_queued) return; - if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) + + if (rt_rq_throttled(rt_rq)) return; - add_nr_running(rq, rt_rq->rt_nr_running); - rt_rq->rt_queued = 1; + if (rt_rq->rt_nr_running) { + add_nr_running(rq, rt_rq->rt_nr_running); + rt_rq->rt_queued = 1; + } /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6601baf2361c..27ddec334601 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -609,6 +609,11 @@ struct rt_rq { #endif }; +static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq) +{ + return rt_rq->rt_queued && rt_rq->rt_nr_running; +} + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.2.3-59-g8ed1b From 512ac999d2755d2b7109e996a76b6fb8b888631d Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition I noticed that cgroup task groups constantly get throttled even if they have low CPU usage, this causes some jitters on the response time to some of our business containers when enabling CPU quotas. It's very simple to reproduce: mkdir /sys/fs/cgroup/cpu/test cd /sys/fs/cgroup/cpu/test echo 100000 > cpu.cfs_quota_us echo $$ > tasks then repeat: cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily After some analysis, we found that cfs_rq::runtime_remaining will be cleared by expire_cfs_rq_runtime() due to two equal but stale "cfs_{b|q}->runtime_expires" after period timer is re-armed. The current condition to judge clock drift in expire_cfs_rq_runtime() is wrong, the two runtime_expires are actually the same when clock drift happens, so this condtion can never hit. The orginal design was correctly done by this commit: a9cf55b28610 ("sched: Expire invalid runtime") ... but was changed to be the current implementation due to its locking bug. This patch introduces another way, it adds a new field in both structures cfs_rq and cfs_bandwidth to record the expiration update sequence, and uses them to figure out if clock drift happens (true if they are equal). Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++------ kernel/sched/sched.h | 6 ++++-- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1866e64792a7..791707c56886 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4590,6 +4590,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4612,6 +4613,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4628,6 +4630,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -4637,8 +4640,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -4664,12 +4669,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 27ddec334601..c7742dcc136c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -334,9 +334,10 @@ struct cfs_bandwidth { u64 runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle; - int period_active; + short idle; + short period_active; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -551,6 +552,7 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; -- cgit v1.2.3-59-g8ed1b From f1d1be8aee6c461652aea8f58bedebaa73d7f4d3 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:34 +0800 Subject: sched/fair: Advance global expiration when period timer is restarted When period gets restarted after some idle time, start_cfs_bandwidth() doesn't update the expiration information, expire_cfs_rq_runtime() will see cfs_rq->runtime_expires smaller than rq clock and go to the clock drift logic, wasting needless CPU cycles on the scheduler hot path. Update the global expiration in start_cfs_bandwidth() to avoid frequent expire_cfs_rq_runtime() calls once a new period begins. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180620101834.24455-2-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 791707c56886..840b92ee6f89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5204,13 +5204,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + u64 overrun; + lockdep_assert_held(&cfs_b->lock); - if (!cfs_b->period_active) { - cfs_b->period_active = 1; - hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); - hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); - } + if (cfs_b->period_active) + return; + + cfs_b->period_active = 1; + overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -- cgit v1.2.3-59-g8ed1b From 3482d98bbc730758b63a5d1cf41d05ea17481412 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 14 Jun 2018 12:33:00 +0200 Subject: sched/util_est: Fix util_est_dequeue() for throttled cfs_rq When a cfs_rq is throttled, parent cfs_rq->nr_running is decreased and everything happens at cfs_rq level. Currently util_est stays unchanged in such case and it keeps accounting the utilization of throttled tasks. This can somewhat make sense as we don't dequeue tasks but only throttled cfs_rq. If a task of another group is enqueued/dequeued and root cfs_rq becomes idle during the dequeue, util_est will be cleared whereas it was accounting util_est of throttled tasks before. So the behavior of util_est is not always the same regarding throttled tasks and depends of side activity. Furthermore, util_est will not be updated when the cfs_rq is unthrottled as everything happens at cfs_rq level. Main results is that util_est will stay null whereas we now have running tasks. We have to wait for the next dequeue/enqueue of the previously throttled tasks to get an up to date util_est. Remove the assumption that cfs_rq's estimated utilization of a CPU is 0 if there is no running task so the util_est of a task remains until the latter is dequeued even if its cfs_rq has been throttled. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Patrick Bellasi Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT") Link: http://lkml.kernel.org/r/1528972380-16268-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 840b92ee6f89..2f0a0be4d344 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) if (!sched_feat(UTIL_EST)) return; - /* - * Update root cfs_rq's estimated utilization - * - * If *p is the last task then the root cfs_rq's estimated utilization - * of a CPU is 0 by definition. - */ - ue.enqueued = 0; - if (cfs_rq->nr_running) { - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, - (_task_util_est(p) | UTIL_AVG_UNCHANGED)); - } + /* Update root cfs_rq's estimated utilization */ + ue.enqueued = cfs_rq->avg.util_est.enqueued; + ue.enqueued -= min_t(unsigned int, ue.enqueued, + (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); /* -- cgit v1.2.3-59-g8ed1b From 1cef1150ef40ec52f507436a14230cbc2623299c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Jun 2018 11:45:49 +0200 Subject: kthread, sched/core: Fix kthread_parkme() (again...) Gaurav reports that commit: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") isn't working for him. Because of the following race: > controller Thread CPUHP Thread > takedown_cpu > kthread_park > kthread_parkme > Set KTHREAD_SHOULD_PARK > smpboot_thread_fn > set Task interruptible > > > wake_up_process > if (!(p->state & state)) > goto out; > > Kthread_parkme > SET TASK_PARKED > schedule > raw_spin_lock(&rq->lock) > ttwu_remote > waiting for __task_rq_lock > context_switch > > finish_lock_switch > > > > Case TASK_PARKED > kthread_park_complete > > > SET Running Furthermore, Oleg noticed that the whole scheduler TASK_PARKED handling is buggered because the TASK_DEAD thing is done with preemption disabled, the current code can still complete early on preemption :/ So basically revert that earlier fix and go with a variant of the alternative mentioned in the commit. Promote TASK_PARKED to special state to avoid the store-store issue on task->state leading to the WARN in kthread_unpark() -> __kthread_bind(). But in addition, add wait_task_inactive() to kthread_park() to ensure the task really is PARKED when we return from kthread_park(). This avoids the whole kthread still gets migrated nonsense -- although it would be really good to get this done differently. Reported-by: Gaurav Kohli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue") Signed-off-by: Ingo Molnar --- include/linux/kthread.h | 1 - include/linux/sched.h | 2 +- kernel/kthread.c | 30 ++++++++++++++++++++++++------ kernel/sched/core.c | 31 +++++++++++-------------------- 4 files changed, 36 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 2803264c512f..c1961761311d 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); -void kthread_park_complete(struct task_struct *k); int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9256118bd40c..43731fe51c97 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -118,7 +118,7 @@ struct task_group; * the comment with set_special_state(). */ #define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD)) + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #define __set_current_state(state_value) \ do { \ diff --git a/kernel/kthread.c b/kernel/kthread.c index 481951bf091d..750cb8082694 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { for (;;) { - set_current_state(TASK_PARKED); + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; + + complete_all(&self->parked); schedule(); } __set_current_state(TASK_RUNNING); @@ -191,11 +202,6 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -void kthread_park_complete(struct task_struct *k) -{ - complete_all(&to_kthread(k)->parked); -} - static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k) reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. + */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k) set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); + /* + * Wait for __kthread_parkme() to complete(), this means we + * _will_ have TASK_PARKED and are about to call schedule(). + */ wait_for_completion(&kthread->parked); + /* + * Now wait for that schedule() to complete and the task to + * get scheduled out. + */ + WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22fce36426c0..fe365c9a08e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,7 +7,6 @@ */ #include "sched.h" -#include #include #include @@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev) membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); } - if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) { - switch (prev_state) { - case TASK_DEAD: - if (prev->sched_class->task_dead) - prev->sched_class->task_dead(prev); + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(prev); - put_task_struct(prev); - break; + /* Task is done with its stack. */ + put_task_stack(prev); - case TASK_PARKED: - kthread_park_complete(prev); - break; - } + put_task_struct(prev); } tick_nohz_task_switch(); -- cgit v1.2.3-59-g8ed1b From ed2b82c03dc187018307c7c6bf9299705f3db383 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Fri, 29 Jun 2018 14:48:20 +0200 Subject: bpf: hash map: decrement counter on error Decrement the number of elements in the map in case the allocation of a new node fails. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: Mauricio Vasquez B Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ca2198a6d22..513d9dfcf4ee 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, * old element will be freed immediately. * Otherwise return an error */ - atomic_dec(&htab->count); - return ERR_PTR(-E2BIG); + l_new = ERR_PTR(-E2BIG); + goto dec_count; } l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, htab->map.numa_node); - if (!l_new) - return ERR_PTR(-ENOMEM); + if (!l_new) { + l_new = ERR_PTR(-ENOMEM); + goto dec_count; + } } memcpy(l_new->key, key, key_size); @@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); - return ERR_PTR(-ENOMEM); + l_new = ERR_PTR(-ENOMEM); + goto dec_count; } } @@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new->hash = hash; return l_new; +dec_count: + atomic_dec(&htab->count); + return l_new; } static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, -- cgit v1.2.3-59-g8ed1b From cf4d418e653afc84c9c873236033e06be5d58f1c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Mar 2018 16:09:10 +0200 Subject: tracing: Avoid string overflow 'err' is used as a NUL-terminated string, but using strncpy() with the length equal to the buffer size may result in lack of the termination: kernel/trace/trace_events_hist.c: In function 'hist_err_event': kernel/trace/trace_events_hist.c:396:3: error: 'strncpy' specified bound 256 equals destination size [-Werror=stringop-truncation] strncpy(err, var, MAX_FILTER_STR_VAL); This changes it to use the safer strscpy() instead. Link: http://lkml.kernel.org/r/20180328140920.2842153-1-arnd@arndb.de Cc: stable@vger.kernel.org Fixes: f404da6e1d46 ("tracing: Add 'last error' error facility for hist triggers") Acked-by: Tom Zanussi Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 046c716a6536..aae18af94c94 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -393,7 +393,7 @@ static void hist_err_event(char *str, char *system, char *event, char *var) else if (system) snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); else - strncpy(err, var, MAX_FILTER_STR_VAL); + strscpy(err, var, MAX_FILTER_STR_VAL); hist_err(str, err); } -- cgit v1.2.3-59-g8ed1b From f90658725ba7ebb031054866aff4cda0d099a3b1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Jul 2018 11:41:38 -0400 Subject: tracing: Make create_filter() code match the comments The comment in create_filter() states that the passed in filter pointer (filterp) will either be NULL or contain an error message stating why the filter failed. But it also expects the filter pointer to point to NULL when passed in. If it is not, the function create_filter_start() will warn and return an error message without updating the filter pointer. This is not what the comment states. As we always expect the pointer to point to NULL, if it is not, trigger a WARN_ON(), set it to NULL, and then continue the path as the rest will work as the comment states. Also update the comment to state it must point to NULL. Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0dceb77d1d42..893a206bcba4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1701,6 +1701,7 @@ static void create_filter_finish(struct filter_parse_error *pe) * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter * @filterp: out param for created filter (always updated on return) + * Must be a pointer that references a NULL pointer. * * Creates a filter for @call with @filter_str. If @set_str is %true, * @filter_str is copied and recorded in the new filter. @@ -1718,6 +1719,10 @@ static int create_filter(struct trace_event_call *call, struct filter_parse_error *pe = NULL; int err; + /* filterp must point to NULL */ + if (WARN_ON(*filterp)) + *filterp = NULL; + err = create_filter_start(filter_string, set_str, &pe, filterp); if (err) return err; -- cgit v1.2.3-59-g8ed1b From f26808ba7227a921e0e8549c7d3c52332b920085 Mon Sep 17 00:00:00 2001 From: yuan linyu Date: Sun, 8 Apr 2018 19:36:31 +0800 Subject: tracing: Optimize trace_buffer_iter() logic Simplify and optimize the logic in trace_buffer_iter() to use a conditional operation instead of an if conditional. Link: http://lkml.kernel.org/r/20180408113631.3947-1-cugyly@163.com Signed-off-by: yuan linyu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 630c5a24b2b2..f8f86231ad90 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -583,9 +583,7 @@ static __always_inline void trace_clear_recursion(int bit) static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { - if (iter->buffer_iter && iter->buffer_iter[cpu]) - return iter->buffer_iter[cpu]; - return NULL; + return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); -- cgit v1.2.3-59-g8ed1b From 26b68dd2f48fe7699a89f0cfbb9f4a650dc1c837 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 8 Mar 2018 21:58:43 +0100 Subject: tracing: Use __printf markup to silence compiler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Silence warnings (triggered at W=1) by adding relevant __printf attributes. CC kernel/trace/trace.o kernel/trace/trace.c: In function ‘__trace_array_vprintk’: kernel/trace/trace.c:2979:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format] len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); ^~~ AR kernel/trace/built-in.o Link: http://lkml.kernel.org/r/20180308205843.27447-1-malat@debian.org Signed-off-by: Mathieu Malaterre Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0079b4c7a49..f054bd6a1c66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2953,6 +2953,7 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); +__printf(3, 0) static int __trace_array_vprintk(struct ring_buffer *buffer, unsigned long ip, const char *fmt, va_list args) @@ -3007,12 +3008,14 @@ out_nobuffer: return len; } +__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); } +__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3028,6 +3031,7 @@ int trace_array_printk(struct trace_array *tr, return ret; } +__printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3043,6 +3047,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer, return ret; } +__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); -- cgit v1.2.3-59-g8ed1b From 5ccba64a560fa6ca06008d4001f5d46ebeb34b41 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 2 Feb 2018 10:14:49 +0800 Subject: ftrace: Nuke clear_ftrace_function clear_ftrace_function is not used outside of ftrace.c and is not help to use a function, so nuke it per Steve's suggestion. Link: http://lkml.kernel.org/r/1517537689-34947-1-git-send-email-xieyisheng1@huawei.com Suggested-by: Steven Rostedt Signed-off-by: Yisheng Xie Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 -- kernel/trace/ftrace.c | 13 +------------ 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8154f4920fcb..ebb77674be90 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,7 +223,6 @@ extern enum ftrace_tracing_type_t ftrace_tracing_type; */ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); -void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op, struct pt_regs *regs); @@ -239,7 +238,6 @@ static inline int ftrace_nr_registered_ops(void) { return 0; } -static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_free_init_mem(void) { } static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index efed9c1cfb7e..caf9cbf35816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -192,17 +192,6 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, regs); } -/** - * clear_ftrace_function - reset the ftrace function - * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag - */ -void clear_ftrace_function(void) -{ - ftrace_trace_function = ftrace_stub; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -6689,7 +6678,7 @@ void ftrace_kill(void) { ftrace_disabled = 1; ftrace_enabled = 0; - clear_ftrace_function(); + ftrace_trace_function = ftrace_stub; } /** -- cgit v1.2.3-59-g8ed1b From 1fe4293f4b8de75824935f8d8e9a99c7fc6873da Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 31 Jan 2018 23:48:49 +0800 Subject: tracing: Fix missing return symbol in function_graph output The function_graph tracer does not show the interrupt return marker for the leaf entry. On leaf entries, we see an unbalanced interrupt marker (the interrupt was entered, but nevern left). Before: 1) | SyS_write() { 1) | __fdget_pos() { 1) 0.061 us | __fget_light(); 1) 0.289 us | } 1) | vfs_write() { 1) 0.049 us | rw_verify_area(); 1) + 15.424 us | __vfs_write(); 1) ==========> | 1) 6.003 us | smp_apic_timer_interrupt(); 1) 0.055 us | __fsnotify_parent(); 1) 0.073 us | fsnotify(); 1) + 23.665 us | } 1) + 24.501 us | } After: 0) | SyS_write() { 0) | __fdget_pos() { 0) 0.052 us | __fget_light(); 0) 0.328 us | } 0) | vfs_write() { 0) 0.057 us | rw_verify_area(); 0) | __vfs_write() { 0) ==========> | 0) 8.548 us | smp_apic_timer_interrupt(); 0) <========== | 0) + 36.507 us | } /* __vfs_write */ 0) 0.049 us | __fsnotify_parent(); 0) 0.066 us | fsnotify(); 0) + 50.064 us | } 0) + 50.952 us | } Link: http://lkml.kernel.org/r/1517413729-20411-1-git-send-email-changbin.du@intel.com Cc: stable@vger.kernel.org Fixes: f8b755ac8e0cc ("tracing/function-graph-tracer: Output arrows signal on hardirq call/return") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions_graph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 23c0b0cb5fb9..169b3c44ee97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; + int cpu = iter->cpu; int i; graph_ret = &ret_entry->ret; @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { struct fgraph_cpu_data *cpu_data; - int cpu = iter->cpu; cpu_data = per_cpu_ptr(data->cpu_data, cpu); @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter, trace_seq_printf(s, "%ps();\n", (void *)call->func); + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, + cpu, iter->ent->pid, flags); + return trace_handle_return(s); } -- cgit v1.2.3-59-g8ed1b From 547b3aa451ae2739585547db9fbdee11a43ff999 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:05:56 -0700 Subject: bpf: sockmap, error path can not release psock in multi-map case The current code, in the error path of sock_hash_ctx_update_elem, checks if the sock has a psock in the user data and if so decrements the reference count of the psock. However, if the error happens early in the error path we may have never incremented the psock reference count and if the psock exists because the sock is in another map then we may inadvertently decrement the reference count. Fix this by making the error path only call smap_release_sock if the error happens after the increment. Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf7b6a6dbd1f..3847a7ce7dae 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1896,7 +1896,7 @@ static int __sock_map_ctx_update_elem(struct bpf_map *map, e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); if (!e) { err = -ENOMEM; - goto out_progs; + goto out_free; } } @@ -2342,7 +2342,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (err) goto err; - /* bpf_map_update_elem() can be called in_irq() */ + /* psock is valid here because otherwise above *ctx_update_elem would + * have thrown an error. It is safe to skip error check. + */ + psock = smap_psock_sk(sock); raw_spin_lock_bh(&b->lock); l_old = lookup_elem_raw(head, hash, key, key_size); if (l_old && map_flags == BPF_NOEXIST) { @@ -2360,12 +2363,6 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto bucket_err; } - psock = smap_psock_sk(sock); - if (unlikely(!psock)) { - err = -EINVAL; - goto bucket_err; - } - rcu_assign_pointer(e->hash_link, l_new); rcu_assign_pointer(e->htab, container_of(map, struct bpf_htab, map)); @@ -2388,12 +2385,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, raw_spin_unlock_bh(&b->lock); return 0; bucket_err: + smap_release_sock(psock, sock); raw_spin_unlock_bh(&b->lock); err: kfree(e); - psock = smap_psock_sk(sock); - if (psock) - smap_release_sock(psock, sock); return err; } -- cgit v1.2.3-59-g8ed1b From 1d1ef005dbc6de673c62cbd2562290ada3090463 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:06:01 -0700 Subject: bpf: sockmap, hash table is RCU so readers do not need locks This removes locking from readers of RCU hash table. Its not necessary. Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 3847a7ce7dae..00fb2e328d1b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2467,10 +2467,8 @@ struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - raw_spin_lock_bh(&b->lock); l = lookup_elem_raw(head, hash, key, key_size); sk = l ? l->sk : NULL; - raw_spin_unlock_bh(&b->lock); return sk; } -- cgit v1.2.3-59-g8ed1b From 99ba2b5aba24e022683a7db63204f9e306fe7ab9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:04 -0700 Subject: bpf: sockhash, disallow bpf_tcp_close and update in parallel After latest lock updates there is no longer anything preventing a close and recvmsg call running in parallel. Additionally, we can race update with close if we close a socket and simultaneously update if via the BPF userspace API (note the cgroup ops are already run with sock_lock held). To resolve this take sock_lock in close and update paths. Reported-by: syzbot+b680e42077a0d7c9a0c4@syzkaller.appspotmail.com Fixes: e9db4ef6bf4c ("bpf: sockhash fix omitted bucket lock in sock_close") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 15 +++++++++++++++ kernel/bpf/syscall.c | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 00fb2e328d1b..9c67e96fe336 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -312,10 +312,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) struct smap_psock *psock; struct sock *osk; + lock_sock(sk); rcu_read_lock(); psock = smap_psock_sk(sk); if (unlikely(!psock)) { rcu_read_unlock(); + release_sock(sk); return sk->sk_prot->close(sk, timeout); } @@ -371,6 +373,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) e = psock_map_pop(sk, psock); } rcu_read_unlock(); + release_sock(sk); close_fun(sk, timeout); } @@ -2069,7 +2072,13 @@ static int sock_map_update_elem(struct bpf_map *map, return -EOPNOTSUPP; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_map_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } @@ -2410,7 +2419,13 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); + preempt_disable(); + rcu_read_lock(); err = sock_hash_ctx_update_elem(&skops, map, key, flags); + rcu_read_unlock(); + preempt_enable(); + release_sock(skops.sk); fput(socket->file); return err; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d10ecd78105f..a31a1ba0f8ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -735,7 +735,9 @@ static int map_update_elem(union bpf_attr *attr) if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_update_elem(map, key, value, attr->flags); goto out; - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH || + map->map_type == BPF_MAP_TYPE_SOCKMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } -- cgit v1.2.3-59-g8ed1b From 7ebc14d507b4b55105da8d1a1eda323381529cc7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:10 -0700 Subject: bpf: sockmap, consume_skb in close path Currently, when a sock is closed and the bpf_tcp_close() callback is used we remove memory but do not free the skb. Call consume_skb() if the skb is attached to the buffer. Reported-by: syzbot+d464d2c20c717ef5a6a8@syzkaller.appspotmail.com Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/sockmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 9c67e96fe336..dfc8a8a07c1f 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -571,7 +571,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) while (sg[i].length) { free += sg[i].length; sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); + if (!md->skb) + put_page(sg_page(&sg[i])); sg[i].length = 0; sg[i].page_link = 0; sg[i].offset = 0; @@ -580,6 +581,8 @@ static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) if (i == MAX_SKB_FRAGS) i = 0; } + if (md->skb) + consume_skb(md->skb); return free; } -- cgit v1.2.3-59-g8ed1b From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in it entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This was done here, e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") When it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case. Even though it happens to work here because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later this has potential to create some subtle issues. But, even more concretely the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offset_of() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use correct structures its unclear how long this might actually work for until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 +++ kernel/bpf/sockmap.c | 4 +-- net/core/filter.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/net/tcp.h b/include/net/tcp.h index 800582b5dd54..af3ec72d5d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -828,6 +828,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index dfc8a8a07c1f..98fb7938beea 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index 3095f1ba7015..470268024a40 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); @@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 new_len = skb->len + head_room; @@ -2942,8 +2998,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2955,6 +3019,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -4618,9 +4699,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4872,11 +4956,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: -- cgit v1.2.3-59-g8ed1b From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 300baad62c88..c73dd7396886 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 470268024a40..5fa66a33927f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; -- cgit v1.2.3-59-g8ed1b From 5b5ccbc2b041f98f26b984e013d303b7f9e6fb8e Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 9 Jul 2018 16:45:35 +0100 Subject: Revert "tick: Prefer a lower rating device only if it's CPU local device" This reverts commit 1332a90558013ae4242e3dd7934bdcdeafb06c0d. The original issue was not because of incorrect checking of cpumask for both new and old tick device. It was incorrectly analysed was due to the misunderstanding of the comment and misinterpretation of the return value from tick_check_preferred. The main issue is with the clockevent driver that sets the cpumask to cpu_all_mask instead of cpu_possible_mask. Signed-off-by: Sudeep Holla Signed-off-by: Thomas Gleixner Tested-by: Kevin Hilman Tested-by: Martin Blumenstingl Cc: linux-arm-kernel@lists.infradead.org Cc: Marc Zyngier Link: https://lkml.kernel.org/r/1531151136-18297-1-git-send-email-sudeep.holla@arm.com --- kernel/time/tick-common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7005dd21ec1..14de3727b18e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -277,8 +277,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, */ return !curdev || newdev->rating > curdev->rating || - (!cpumask_equal(curdev->cpumask, newdev->cpumask) && - !tick_check_percpu(curdev, newdev, smp_processor_id())); + !cpumask_equal(curdev->cpumask, newdev->cpumask); } /* -- cgit v1.2.3-59-g8ed1b From e96d71359e9bbea846a2111e4469a03a055dfa6f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:50 -0400 Subject: rseq: Use __u64 for rseq_cs fields, validate user inputs Change the rseq ABI so rseq_cs start_ip, post_commit_offset and abort_ip fields are seen as 64-bit fields by both 32-bit and 64-bit kernels rather that ignoring the 32 upper bits on 32-bit kernels. This ensures we have a consistent behavior for a 32-bit binary executed on 32-bit kernels and in compat mode on 64-bit kernels. Validating the value of abort_ip field to be below TASK_SIZE ensures the kernel don't return to an invalid address when returning to userspace after an abort. I don't fully trust each architecture code to consistently deal with invalid return addresses. Validating the value of the start_ip and post_commit_offset fields prevents overflow on arithmetic performed on those values, used to check whether abort_ip is within the rseq critical section. If validation fails, the process is killed with a segmentation fault. When the signature encountered before abort_ip does not match the expected signature, return -EINVAL rather than -EPERM to be consistent with other input validation return codes from rseq_get_rseq_cs(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-2-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 6 +++--- kernel/rseq.c | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index d620fa43756c..519ad6e176d1 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -52,10 +52,10 @@ struct rseq_cs { __u32 version; /* enum rseq_cs_flags */ __u32 flags; - LINUX_FIELD_u32_u64(start_ip); + __u64 start_ip; /* Offset from start_ip. */ - LINUX_FIELD_u32_u64(post_commit_offset); - LINUX_FIELD_u32_u64(abort_ip); + __u64 post_commit_offset; + __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); /* diff --git a/kernel/rseq.c b/kernel/rseq.c index 22b6acf1ad63..16b38c5342f9 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -130,14 +130,20 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) urseq_cs = (struct rseq_cs __user *)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; - if (rseq_cs->version > 0) - return -EINVAL; + if (rseq_cs->start_ip >= TASK_SIZE || + rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || + rseq_cs->abort_ip >= TASK_SIZE || + rseq_cs->version > 0) + return -EINVAL; + /* Check for overflow. */ + if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) + return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; - usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; @@ -146,7 +152,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); - return -EPERM; + return -EINVAL; } return 0; } -- cgit v1.2.3-59-g8ed1b From 8f28177014925f968baf45fc833c25848faf8c1c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:51 -0400 Subject: rseq: Use get_user/put_user rather than __get_user/__put_user __get_user()/__put_user() is used to read values for address ranges that were already checked with access_ok() on rseq registration. It has been recognized that __get_user/__put_user are optimizing the wrong thing. Replace them by get_user/put_user across rseq instead. If those end up showing up in benchmarks, the proper approach would be to use user_access_begin() / unsafe_{get,put}_user() / user_access_end() anyway. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r/20180709195155.7654-3-mathieu.desnoyers@efficios.com --- kernel/rseq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rseq.c b/kernel/rseq.c index 16b38c5342f9..2c8463acb50d 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -85,9 +85,9 @@ static int rseq_update_cpu_id(struct task_struct *t) { u32 cpu_id = raw_smp_processor_id(); - if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + if (put_user(cpu_id, &t->rseq->cpu_id_start)) return -EFAULT; - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; trace_rseq_update(t); return 0; @@ -100,14 +100,14 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) /* * Reset cpu_id_start to its initial state (0). */ - if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ - if (__put_user(cpu_id, &t->rseq->cpu_id)) + if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; return 0; } @@ -120,7 +120,7 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; - ret = __get_user(ptr, &t->rseq->rseq_cs); + ret = get_user(ptr, &t->rseq->rseq_cs); if (ret) return ret; if (!ptr) { @@ -163,7 +163,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) int ret; /* Get thread flags. */ - ret = __get_user(flags, &t->rseq->flags); + ret = get_user(flags, &t->rseq->flags); if (ret) return ret; @@ -203,7 +203,7 @@ static int clear_rseq_cs(struct task_struct *t) * * Set rseq_cs to NULL with single-copy atomicity. */ - return __put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &t->rseq->rseq_cs); } /* -- cgit v1.2.3-59-g8ed1b From 0fb9a1abc8c97f858997e962694eb36b4517144e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:52 -0400 Subject: rseq: uapi: Update uapi comments Update rseq uapi header comments to reflect that user-space need to do thread-local loads/stores from/to the struct rseq fields. As a consequence of this added requirement, the kernel does not need to perform loads/stores with single-copy atomicity. Update the comment associated to the "flags" fields to describe more accurately that it's only useful to facilitate single-stepping through rseq critical sections with debuggers. Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-4-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 69 ++++++++++++++++++++++++----------------------- kernel/rseq.c | 2 +- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 519ad6e176d1..bf4188c13bec 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -67,28 +67,30 @@ struct rseq_cs { struct rseq { /* * Restartable sequences cpu_id_start field. Updated by the - * kernel, and read by user-space with single-copy atomicity - * semantics. Aligned on 32-bit. Always contains a value in the - * range of possible CPUs, although the value may not be the - * actual current CPU (e.g. if rseq is not initialized). This - * CPU number value should always be compared against the value - * of the cpu_id field before performing a rseq commit or - * returning a value read from a data structure indexed using - * the cpu_id_start value. + * kernel. Read by user-space with single-copy atomicity + * semantics. This field should only be read by the thread which + * registered this data structure. Aligned on 32-bit. Always + * contains a value in the range of possible CPUs, although the + * value may not be the actual current CPU (e.g. if rseq is not + * initialized). This CPU number value should always be compared + * against the value of the cpu_id field before performing a rseq + * commit or returning a value read from a data structure indexed + * using the cpu_id_start value. */ __u32 cpu_id_start; /* - * Restartable sequences cpu_id field. Updated by the kernel, - * and read by user-space with single-copy atomicity semantics. - * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and - * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the - * former means "rseq uninitialized", and latter means "rseq - * initialization failed". This value is meant to be read within - * rseq critical sections and compared with the cpu_id_start - * value previously read, before performing the commit instruction, - * or read and compared with the cpu_id_start value before returning - * a value loaded from a data structure indexed using the - * cpu_id_start value. + * Restartable sequences cpu_id field. Updated by the kernel. + * Read by user-space with single-copy atomicity semantics. This + * field should only be read by the thread which registered this + * data structure. Aligned on 32-bit. Values + * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED + * have a special semantic: the former means "rseq uninitialized", + * and latter means "rseq initialization failed". This value is + * meant to be read within rseq critical sections and compared + * with the cpu_id_start value previously read, before performing + * the commit instruction, or read and compared with the + * cpu_id_start value before returning a value loaded from a data + * structure indexed using the cpu_id_start value. */ __u32 cpu_id; /* @@ -105,27 +107,28 @@ struct rseq { * targeted by the rseq_cs. Also needs to be set to NULL by user-space * before reclaiming memory that contains the targeted struct rseq_cs. * - * Read and set by the kernel with single-copy atomicity semantics. - * Set by user-space with single-copy atomicity semantics. Aligned - * on 64-bit. + * Read and set by the kernel. Set by user-space with single-copy + * atomicity semantics. This field should only be updated by the + * thread which registered this data structure. Aligned on 64-bit. */ LINUX_FIELD_u32_u64(rseq_cs); /* - * - RSEQ_DISABLE flag: + * Restartable sequences flags field. + * + * This field should only be updated by the thread which + * registered this data structure. Read by the kernel. + * Mainly used for single-stepping through rseq critical sections + * with debuggers. * - * Fallback fast-track flag for single-stepping. - * Set by user-space if lack of progress is detected. - * Cleared by user-space after rseq finish. - * Read by the kernel. * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart and event - * counter increment on preemption for this thread. + * Inhibit instruction sequence block restart on preemption + * for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart and event - * counter increment on signal delivery for this thread. + * Inhibit instruction sequence block restart on signal + * delivery for this thread. * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart and event - * counter increment on migration for this thread. + * Inhibit instruction sequence block restart on migration for + * this thread. */ __u32 flags; } __attribute__((aligned(4 * sizeof(__u64)))); diff --git a/kernel/rseq.c b/kernel/rseq.c index 2c8463acb50d..2a7748675be7 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -201,7 +201,7 @@ static int clear_rseq_cs(struct task_struct *t) * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * - * Set rseq_cs to NULL with single-copy atomicity. + * Set rseq_cs to NULL. */ return put_user(0UL, &t->rseq->rseq_cs); } -- cgit v1.2.3-59-g8ed1b From ec9c82e03a744e5698bd95eab872855861a821fa Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 9 Jul 2018 15:51:53 -0400 Subject: rseq: uapi: Declare rseq_cs field as union, update includes Declaring the rseq_cs field as a union between __u64 and two __u32 allows both 32-bit and 64-bit kernels to read the full __u64, and therefore validate that a 32-bit user-space cleared the upper 32 bits, thus ensuring a consistent behavior between native 32-bit kernels and 32-bit compat tasks on 64-bit kernels. Check that the rseq_cs value read is < TASK_SIZE. The asm/byteorder.h header needs to be included by rseq.h, now that it is not using linux/types_32_64.h anymore. Considering that only __32 and __u64 types are declared in linux/rseq.h, the linux/types.h header should always be included for both kernel and user-space code: including stdint.h is just for u64 and u32, which are not used in this header at all. Use copy_from_user()/clear_user() to interact with a 64-bit field, because arm32 does not implement 64-bit __get_user, and ppc32 does not 64-bit get_user. Considering that the rseq_cs pointer does not need to be loaded/stored with single-copy atomicity from the kernel anymore, we can simply use copy_from_user()/clear_user(). Signed-off-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Cc: linux-api@vger.kernel.org Cc: Peter Zijlstra Cc: "Paul E . McKenney" Cc: Boqun Feng Cc: Andy Lutomirski Cc: Dave Watson Cc: Paul Turner Cc: Andrew Morton Cc: Russell King Cc: "H . Peter Anvin" Cc: Andi Kleen Cc: Chris Lameter Cc: Ben Maurer Cc: Steven Rostedt Cc: Josh Triplett Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Kerrisk Cc: Joel Fernandes Cc: "Paul E. McKenney" Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180709195155.7654-5-mathieu.desnoyers@efficios.com --- include/uapi/linux/rseq.h | 27 +++++++++++++++++++-------- kernel/rseq.c | 15 +++++++++------ tools/testing/selftests/rseq/rseq.h | 11 ++++++++++- 3 files changed, 38 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index bf4188c13bec..9a402fdb60e9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -10,13 +10,8 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#ifdef __KERNEL__ -# include -#else -# include -#endif - -#include +#include +#include enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, @@ -111,7 +106,23 @@ struct rseq { * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. */ - LINUX_FIELD_u32_u64(rseq_cs); + union { + __u64 ptr64; +#ifdef __LP64__ + __u64 ptr; +#else + struct { +#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) + __u32 padding; /* Initialized to zero. */ + __u32 ptr32; +#else /* LITTLE */ + __u32 ptr32; + __u32 padding; /* Initialized to zero. */ +#endif /* ENDIAN */ + } ptr; +#endif + } rseq_cs; + /* * Restartable sequences flags field. * diff --git a/kernel/rseq.c b/kernel/rseq.c index 2a7748675be7..c6242d8594dc 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -115,19 +115,20 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; - unsigned long ptr; + u64 ptr; u32 __user *usig; u32 sig; int ret; - ret = get_user(ptr, &t->rseq->rseq_cs); - if (ret) - return ret; + if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + return -EFAULT; if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } - urseq_cs = (struct rseq_cs __user *)ptr; + if (ptr >= TASK_SIZE) + return -EINVAL; + urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; @@ -203,7 +204,9 @@ static int clear_rseq_cs(struct task_struct *t) * * Set rseq_cs to NULL. */ - return put_user(0UL, &t->rseq->rseq_cs); + if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + return -EFAULT; + return 0; } /* diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index a4684112676c..f2073cfa4448 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -133,6 +133,15 @@ static inline uint32_t rseq_current_cpu(void) return cpu; } +static inline void rseq_clear_rseq_cs(void) +{ +#ifdef __LP64__ + __rseq_abi.rseq_cs.ptr = 0; +#else + __rseq_abi.rseq_cs.ptr.ptr32 = 0; +#endif +} + /* * rseq_prepare_unload() should be invoked by each thread using rseq_finish*() * at least once between their last rseq_finish*() and library unload of the @@ -143,7 +152,7 @@ static inline uint32_t rseq_current_cpu(void) */ static inline void rseq_prepare_unload(void) { - __rseq_abi.rseq_cs = 0; + rseq_clear_rseq_cs(); } #endif /* RSEQ_H_ */ -- cgit v1.2.3-59-g8ed1b From 0fc8c3581dd42bc8f530314ca86db2d861485731 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 9 Jul 2018 16:19:06 +0200 Subject: tracing/kprobe: Release kprobe print_fmt properly We don't release tk->tp.call.print_fmt when destroying local uprobe. Also there's missing print_fmt kfree in create_local_trace_kprobe error path. Link: http://lkml.kernel.org/r/20180709141906.2390-1-jolsa@kernel.org Cc: stable@vger.kernel.org Fixes: e12f03d7031a ("perf/core: Implement the 'perf_kprobe' PMU") Acked-by: Song Liu Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index daa81571b22a..21f718472942 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1480,8 +1480,10 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, } ret = __register_trace_kprobe(tk); - if (ret < 0) + if (ret < 0) { + kfree(tk->tp.call.print_fmt); goto error; + } return &tk->tp.call; error: @@ -1501,6 +1503,8 @@ void destroy_local_trace_kprobe(struct trace_event_call *event_call) } __unregister_trace_kprobe(tk); + + kfree(tk->tp.call.print_fmt); free_trace_kprobe(tk); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3-59-g8ed1b From b65f370d0671c4980ffe866c41e327b88893245c Mon Sep 17 00:00:00 2001 From: Okash Khawaja Date: Tue, 10 Jul 2018 14:33:07 -0700 Subject: bpf: btf: Fix bitfield extraction for big endian When extracting bitfield from a number, btf_int_bits_seq_show() builds a mask and accesses least significant byte of the number in a way specific to little-endian. This patch fixes that by checking endianness of the machine and then shifting left and right the unneeded bits. Thanks to Martin Lau for the help in navigating potential pitfalls when dealing with endianess and for the final solution. Fixes: b00b8daec828 ("bpf: btf: Add pretty print capability for data with BTF type info") Signed-off-by: Okash Khawaja Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/btf.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2d49d18b793a..e016ac3afa24 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -991,16 +991,13 @@ static void btf_int_bits_seq_show(const struct btf *btf, void *data, u8 bits_offset, struct seq_file *m) { + u16 left_shift_bits, right_shift_bits; u32 int_data = btf_type_int(t); u16 nr_bits = BTF_INT_BITS(int_data); u16 total_bits_offset; u16 nr_copy_bytes; u16 nr_copy_bits; - u8 nr_upper_bits; - union { - u64 u64_num; - u8 u8_nums[8]; - } print_num; + u64 print_num; total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1008,21 +1005,20 @@ static void btf_int_bits_seq_show(const struct btf *btf, nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num.u64_num = 0; - memcpy(&print_num.u64_num, data, nr_copy_bytes); + print_num = 0; + memcpy(&print_num, data, nr_copy_bytes); - /* Ditch the higher order bits */ - nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); - if (nr_upper_bits) { - /* We need to mask out some bits of the upper byte. */ - u8 mask = (1 << nr_upper_bits) - 1; +#ifdef __BIG_ENDIAN_BITFIELD + left_shift_bits = bits_offset; +#else + left_shift_bits = BITS_PER_U64 - nr_copy_bits; +#endif + right_shift_bits = BITS_PER_U64 - nr_bits; - print_num.u8_nums[nr_copy_bytes - 1] &= mask; - } - - print_num.u64_num >>= bits_offset; + print_num <<= left_shift_bits; + print_num >>= right_shift_bits; - seq_printf(m, "0x%llx", print_num.u64_num); + seq_printf(m, "0x%llx", print_num); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, -- cgit v1.2.3-59-g8ed1b From c7a897843224a92209f306c984975b704969b89d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Jul 2018 21:44:28 +0200 Subject: bpf: don't leave partial mangled prog in jit_subprogs error path syzkaller managed to trigger the following bug through fault injection: [...] [ 141.043668] verifier bug. No program starts at insn 3 [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.044648] WARNING: CPU: 3 PID: 4072 at kernel/bpf/verifier.c:1613 bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.047355] CPU: 3 PID: 4072 Comm: a.out Not tainted 4.18.0-rc4+ #51 [ 141.048446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS 1.10.2-1 04/01/2014 [ 141.049877] Call Trace: [ 141.050324] __dump_stack lib/dump_stack.c:77 [inline] [ 141.050324] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 [ 141.050950] ? dump_stack_print_info.cold.2+0x52/0x52 lib/dump_stack.c:60 [ 141.051837] panic+0x238/0x4e7 kernel/panic.c:184 [ 141.052386] ? add_taint.cold.5+0x16/0x16 kernel/panic.c:385 [ 141.053101] ? __warn.cold.8+0x148/0x1ba kernel/panic.c:537 [ 141.053814] ? __warn.cold.8+0x117/0x1ba kernel/panic.c:530 [ 141.054506] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.054506] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.054506] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [ 141.055163] __warn.cold.8+0x163/0x1ba kernel/panic.c:538 [ 141.055820] ? get_callee_stack_depth kernel/bpf/verifier.c:1612 [inline] [ 141.055820] ? fixup_call_args kernel/bpf/verifier.c:5587 [inline] [ 141.055820] ? bpf_check+0x525e/0x5e60 kernel/bpf/verifier.c:5952 [...] What happens in jit_subprogs() is that kcalloc() for the subprog func buffer is failing with NULL where we then bail out. Latter is a plain return -ENOMEM, and this is definitely not okay since earlier in the loop we are walking all subprogs and temporarily rewrite insn->off to remember the subprog id as well as insn->imm to temporarily point the call to __bpf_call_base + 1 for the initial JIT pass. Thus, bailing out in such state and handing this over to the interpreter is troublesome since later/subsequent e.g. find_subprog() lookups are based on wrong insn->imm. Therefore, once we hit this point, we need to jump to out_free path where we undo all changes from earlier loop, so that interpreter can work on unmodified insn->{off,imm}. Another point is that should find_subprog() fail in jit_subprogs() due to a verifier bug, then we also should not simply defer the program to the interpreter since also here we did partial modifications. Instead we should just bail out entirely and return an error to the user who is trying to load the program. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Reported-by: syzbot+7d427828b2ea6e592804@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e2bf834f13a..63aaac52a265 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5430,6 +5430,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; + /* Upon error here we cannot fall back to interpreter but + * need a hard reject of the program. Thus -EFAULT is + * propagated in any case. + */ subprog = find_subprog(env, i + insn->imm + 1); if (subprog < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", @@ -5450,7 +5454,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) - return -ENOMEM; + goto out_undo_insn; for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; @@ -5515,7 +5519,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -EFAULT; + err = -ENOTSUPP; goto out_free; } cond_resched(); @@ -5552,6 +5556,7 @@ out_free: if (func[i]) bpf_jit_free(func[i]); kfree(func); +out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5578,6 +5583,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) err = jit_subprogs(env); if (err == 0) return 0; + if (err == -EFAULT) + return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON for (i = 0; i < prog->len; i++, insn++) { -- cgit v1.2.3-59-g8ed1b From f8494fa3dd10b52eab47a9666a8bc34719a129aa Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 25 Jun 2018 17:08:22 -0700 Subject: tracing: Reorder display of TGID to be after PID Currently ftrace displays data in trace output like so: _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID CPU TGID |||| TIMESTAMP FUNCTION | | | | |||| | | bash-1091 [000] ( 1091) d..2 28.313544: sched_switch: However Android's trace visualization tools expect a slightly different format due to an out-of-tree patch patch that was been carried for a decade, notice that the TGID and CPU fields are reversed: _-----=> irqs-off / _----=> need-resched | / _---=> hardirq/softirq || / _--=> preempt-depth ||| / delay TASK-PID TGID CPU |||| TIMESTAMP FUNCTION | | | | |||| | | bash-1091 ( 1091) [002] d..2 64.965177: sched_switch: From kernel v4.13 onwards, during which TGID was introduced, tracing with systrace on all Android kernels will break (most Android kernels have been on 4.9 with Android patches, so this issues hasn't been seen yet). From v4.13 onwards things will break. The chrome browser's tracing tools also embed the systrace viewer which uses the legacy TGID format and updates to that are known to be difficult to make. Considering this, I suggest we make this change to the upstream kernel and backport it to all Android kernels. I believe this feature is merged recently enough into the upstream kernel that it shouldn't be a problem. Also logically, IMO it makes more sense to group the TGID with the TASK-PID and the CPU after these. Link: http://lkml.kernel.org/r/20180626000822.113931-1-joel@joelfernandes.org Cc: jreck@google.com Cc: tkjos@google.com Cc: stable@vger.kernel.org Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_output.c | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f054bd6a1c66..87cf25171fb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3365,8 +3365,8 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, print_event_info(buf, m); - seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); - seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, @@ -3386,9 +3386,9 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file tgid ? tgid_space : space); seq_printf(m, "# %s||| / delay\n", tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", + seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", tgid ? " TGID " : space); - seq_printf(m, "# | | | %s|||| | |\n", + seq_printf(m, "# | | %s | |||| | |\n", tgid ? " | " : space); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 90db994ac900..1c8e30fda46a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -594,8 +594,7 @@ int trace_print_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { unsigned int tgid = trace_find_tgid(entry->pid); @@ -606,6 +605,8 @@ int trace_print_context(struct trace_iterator *iter) trace_seq_printf(s, "(%5d) ", tgid); } + trace_seq_printf(s, "[%03d] ", iter->cpu); + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); -- cgit v1.2.3-59-g8ed1b From 3c53776e29f81719efcf8f7a6e30cdf753bee94d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jan 2018 11:51:04 -0800 Subject: Mark HI and TASKLET softirq synchronous Way back in 4.9, we committed 4cd13c21b207 ("softirq: Let ksoftirqd do its job"), and ever since we've had small nagging issues with it. For example, we've had: 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred") 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run") 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") all of which worked around some of the effects of that commit. The DVB people have also complained that the commit causes excessive USB URB latencies, which seems to be due to the USB code using tasklets to schedule USB traffic. This seems to be an issue mainly when already living on the edge, but waiting for ksoftirqd to handle it really does seem to cause excessive latencies. Now Hanna Hawa reports that this issue isn't just limited to USB URB and DVB, but also causes timeout problems for the Marvell SoC team: "I'm facing kernel panic issue while running raid 5 on sata disks connected to Macchiatobin (Marvell community board with Armada-8040 SoC with 4 ARMv8 cores of CA72) Raid 5 built with Marvell DMA engine and async_tx mechanism (ASYNC_TX_DMA [=y]); the DMA driver (mv_xor_v2) uses a tasklet to clean the done descriptors from the queue" The latency problem causes a panic: mv_xor_v2 f0400000.xor: dma_sync_wait: timeout! Kernel panic - not syncing: async_tx_quiesce: DMA error waiting for transaction We've discussed simply just reverting the original commit entirely, and also much more involved solutions (with per-softirq threads etc). This patch is intentionally stupid and fairly limited, because the issue still remains, and the other solutions either got sidetracked or had other issues. We should probably also consider the timer softirqs to be synchronous and not be delayed to ksoftirqd (since they were the issue with the earlier watchdog problems), but that should be done as a separate patch. This does only the tasklet cases. Reported-and-tested-by: Hanna Hawa Reported-and-tested-by: Josef Griebichler Reported-by: Mauro Carvalho Chehab Cc: Alan Stern Cc: Greg Kroah-Hartman Cc: Eric Dumazet Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/softirq.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 900dcfee542c..75ffc1d1a2e0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -79,12 +79,16 @@ static void wakeup_softirqd(void) /* * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness. + * right now. Let ksoftirqd handle this at its own rate, to get fairness, + * unless we're doing some of the synchronous softirqs. */ -static bool ksoftirqd_running(void) +#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) +static bool ksoftirqd_running(unsigned long pending) { struct task_struct *tsk = __this_cpu_read(ksoftirqd); + if (pending & SOFTIRQ_NOW_MASK) + return false; return tsk && (tsk->state == TASK_RUNNING); } @@ -328,7 +332,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running()) + if (pending && !ksoftirqd_running(pending)) do_softirq_own_stack(); local_irq_restore(flags); @@ -355,7 +359,7 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running()) + if (ksoftirqd_running(local_softirq_pending())) return; if (!force_irqthreads) { -- cgit v1.2.3-59-g8ed1b