From 84ec3a0787086fcd25f284f59b3aa01fd6fc0a5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Jun 2019 09:52:38 -0700 Subject: time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint time/tick-broadcast: Fix tick_broadcast_offline() lockdep complaint The TASKS03 and TREE04 rcutorture scenarios produce the following lockdep complaint: WARNING: inconsistent lock state 5.2.0-rc1+ #513 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. migration/1/14 [HC0[0]:SC0[0]:HE1:SE1] takes: (____ptrval____) (tick_broadcast_lock){?...}, at: tick_broadcast_offline+0xf/0x70 {IN-HARDIRQ-W} state was registered at: lock_acquire+0xb0/0x1c0 _raw_spin_lock_irqsave+0x3c/0x50 tick_broadcast_switch_to_oneshot+0xd/0x40 tick_switch_to_oneshot+0x4f/0xd0 hrtimer_run_queues+0xf3/0x130 run_local_timers+0x1c/0x50 update_process_times+0x1c/0x50 tick_periodic+0x26/0xc0 tick_handle_periodic+0x1a/0x60 smp_apic_timer_interrupt+0x80/0x2a0 apic_timer_interrupt+0xf/0x20 _raw_spin_unlock_irqrestore+0x4e/0x60 rcu_nocb_gp_kthread+0x15d/0x590 kthread+0xf3/0x130 ret_from_fork+0x3a/0x50 irq event stamp: 171 hardirqs last enabled at (171): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (170): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (0): [] copy_process.part.56+0x650/0x1cb0 softirqs last disabled at (0): [<0000000000000000>] 0x0 [...] To reproduce, run the following rcutorture test: $ tools/testing/selftests/rcutorture/bin/kvm.sh --duration 5 --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y" --configs "TASKS03 TREE04" It turns out that tick_broadcast_offline() was an innocent bystander. After all, interrupts are supposed to be disabled throughout take_cpu_down(), and therefore should have been disabled upon entry to tick_offline_cpu() and thus to tick_broadcast_offline(). This suggests that one of the CPU-hotplug notifiers was incorrectly enabling interrupts, and leaving them enabled on return. Some debugging code showed that the culprit was sched_cpu_dying(). It had irqs enabled after return from sched_tick_stop(). Which in turn had irqs enabled after return from cancel_delayed_work_sync(). Which is a wrapper around __cancel_work_timer(). Which can sleep in the case where something else is concurrently trying to cancel the same delayed work, and as Thomas Gleixner pointed out on IRC, sleeping is a decidedly bad idea when you are invoked from take_cpu_down(), regardless of the state you leave interrupts in upon return. Code inspection located no reason why the delayed work absolutely needed to be canceled from sched_tick_stop(): The work is not bound to the outgoing CPU by design, given that the whole point is to collect statistics without disturbing the outgoing CPU. This commit therefore simply drops the cancel_delayed_work_sync() from sched_tick_stop(). Instead, a new ->state field is added to the tick_work structure so that the delayed-work handler function sched_tick_remote() can avoid reposting itself. A cpu_is_offline() check is also added to sched_tick_remote() to avoid mucking with the state of an offlined CPU (though it does appear safe to do so). The sched_tick_start() and sched_tick_stop() functions also update ->state, and sched_tick_start() also schedules the delayed work if ->state indicates that it is not already in flight. Signed-off-by: Paul E. McKenney [ paulmck: Apply Peter Zijlstra and Frederic Weisbecker atomics feedback. ] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190625165238.GJ26519@linux.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..0b22e55cebe8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3486,8 +3486,36 @@ void scheduler_tick(void) struct tick_work { int cpu; + atomic_t state; struct delayed_work work; }; +/* Values for ->state, see diagram below. */ +#define TICK_SCHED_REMOTE_OFFLINE 0 +#define TICK_SCHED_REMOTE_OFFLINING 1 +#define TICK_SCHED_REMOTE_RUNNING 2 + +/* + * State diagram for ->state: + * + * + * TICK_SCHED_REMOTE_OFFLINE + * | ^ + * | | + * | | sched_tick_remote() + * | | + * | | + * +--TICK_SCHED_REMOTE_OFFLINING + * | ^ + * | | + * sched_tick_start() | | sched_tick_stop() + * | | + * V | + * TICK_SCHED_REMOTE_RUNNING + * + * + * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() + * and sched_tick_start() are happy to leave the state in RUNNING. + */ static struct tick_work __percpu *tick_work_cpu; @@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) struct task_struct *curr; struct rq_flags rf; u64 delta; + int os; /* * Handle the tick only if it appears the remote CPU is running in full @@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr)) + if (is_idle_task(curr) || cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); @@ -3533,13 +3562,18 @@ out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough - * to keep scheduler internal stats reasonably up to date. + * to keep scheduler internal stats reasonably up to date. But + * first update state to reflect hotplug activity if required. */ - queue_delayed_work(system_unbound_wq, dwork, HZ); + os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); + if (os == TICK_SCHED_REMOTE_RUNNING) + queue_delayed_work(system_unbound_wq, dwork, HZ); } static void sched_tick_start(int cpu) { + int os; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) @@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - twork->cpu = cpu; - INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); + WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); + if (os == TICK_SCHED_REMOTE_OFFLINE) { + twork->cpu = cpu; + INIT_DELAYED_WORK(&twork->work, sched_tick_remote); + queue_delayed_work(system_unbound_wq, &twork->work, HZ); + } } #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) { struct tick_work *twork; + int os; if (housekeeping_cpu(cpu, HK_FLAG_TICK)) return; @@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); - cancel_delayed_work_sync(&twork->work); + /* There cannot be competing actions, but don't rely on stop-machine. */ + os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); + WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); + /* Don't cancel, as this would mess up the state machine. */ } #endif /* CONFIG_HOTPLUG_CPU */ @@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) { tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); - return 0; } -- cgit v1.2.3-59-g8ed1b From 4b211f2b129dd1f6a6956bbc76e2f232c1ec3ad8 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:54 +0200 Subject: sched/core: Streamle calls to task_rq_unlock() Calls to task_rq_unlock() are done several times in the __sched_setscheduler() function. This is fine when only the rq lock needs to be handled but not so much when other locks come into play. This patch streamlines the release of the rq lock so that only one location need to be modified when dealing with more than one lock. No change of functionality is introduced by this patch. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-3-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b22e55cebe8..1af3d2dc6b29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4712,8 +4712,8 @@ recheck: * Changing the policy of the stop threads its a very bad idea: */ if (p == rq->stop) { - task_rq_unlock(rq, p, &rf); - return -EINVAL; + retval = -EINVAL; + goto unlock; } /* @@ -4731,8 +4731,8 @@ recheck: goto change; p->sched_reset_on_fork = reset_on_fork; - task_rq_unlock(rq, p, &rf); - return 0; + retval = 0; + goto unlock; } change: @@ -4745,8 +4745,8 @@ change: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } #endif #ifdef CONFIG_SMP @@ -4761,8 +4761,8 @@ change: */ if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { - task_rq_unlock(rq, p, &rf); - return -EPERM; + retval = -EPERM; + goto unlock; } } #endif @@ -4781,8 +4781,8 @@ change: * is available. */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { - task_rq_unlock(rq, p, &rf); - return -EBUSY; + retval = -EBUSY; + goto unlock; } p->sched_reset_on_fork = reset_on_fork; @@ -4840,6 +4840,10 @@ change: preempt_enable(); return 0; + +unlock: + task_rq_unlock(rq, p, &rf); + return retval; } static int _sched_setscheduler(struct task_struct *p, int policy, -- cgit v1.2.3-59-g8ed1b From 710da3c8ea7dfbd327920afd3831d8c82c42789d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 16:00:00 +0200 Subject: sched/core: Prevent race condition between cpuset and __sched_setscheduler() No synchronisation mechanism exists between the cpuset subsystem and calls to function __sched_setscheduler(). As such, it is possible that new root domains are created on the cpuset side while a deadline acceptance test is carried out in __sched_setscheduler(), leading to a potential oversell of CPU bandwidth. Grab cpuset_rwsem read lock from core scheduler, so to prevent situations such as the one described above from happening. The only exception is normalize_rt_tasks() which needs to work under tasklist_lock and can't therefore grab cpuset_rwsem. We are fine with this, as this function is only called by sysrq and, if that gets triggered, DEADLINE guarantees are already gone out of the window anyway. Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-9-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 5 +++++ kernel/cgroup/cpuset.c | 11 +++++++++++ kernel/sched/core.c | 20 +++++++++++++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 7f1478c26a33..04c20de66afc 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -55,6 +55,8 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); +extern void cpuset_read_lock(void); +extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -176,6 +178,9 @@ static inline void cpuset_update_active_cpus(void) static inline void cpuset_wait_for_hotplug(void) { } +static inline void cpuset_read_lock(void) { } +static inline void cpuset_read_unlock(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5c5014caa23c..c52bc91f882b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -334,6 +334,17 @@ static struct cpuset top_cpuset = { */ DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); + +void cpuset_read_lock(void) +{ + percpu_down_read(&cpuset_rwsem); +} + +void cpuset_read_unlock(void) +{ + percpu_up_read(&cpuset_rwsem); +} + static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1af3d2dc6b29..1bceb22dac18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4698,6 +4698,9 @@ recheck: return retval; } + if (pi) + cpuset_read_lock(); + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4772,6 +4775,8 @@ change: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); goto recheck; } @@ -4832,8 +4837,10 @@ change: preempt_disable(); task_rq_unlock(rq, p, &rf); - if (pi) + if (pi) { + cpuset_read_unlock(); rt_mutex_adjust_pi(p); + } /* Run balance callbacks after we've adjusted the PI chain: */ balance_callback(rq); @@ -4843,6 +4850,8 @@ change: unlock: task_rq_unlock(rq, p, &rf); + if (pi) + cpuset_read_unlock(); return retval; } @@ -4927,10 +4936,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setscheduler(p, policy, &lparam); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); + } + return retval; } -- cgit v1.2.3-59-g8ed1b From a07db5c0865799ebed1f88be0df50c581fb65029 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 08:34:55 +0200 Subject: sched/core: Fix CPU controller for !RT_GROUP_SCHED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On !CONFIG_RT_GROUP_SCHED configurations it is currently not possible to move RT tasks between cgroups to which CPU controller has been attached; but it is oddly possible to first move tasks around and then make them RT (setschedule to FIFO/RR). E.g.: # mkdir /sys/fs/cgroup/cpu,cpuacct/group1 # chrt -fp 10 $$ # echo $$ > /sys/fs/cgroup/cpu,cpuacct/group1/tasks bash: echo: write error: Invalid argument # chrt -op 0 $$ # echo $$ > /sys/fs/cgroup/cpu,cpuacct/group1/tasks # chrt -fp 10 $$ # cat /sys/fs/cgroup/cpu,cpuacct/group1/tasks 2345 2598 # chrt -p 2345 pid 2345's current scheduling policy: SCHED_FIFO pid 2345's current scheduling priority: 10 Also, as Michal noted, it is currently not possible to enable CPU controller on unified hierarchy with !CONFIG_RT_GROUP_SCHED (if there are any kernel RT threads in root cgroup, they can't be migrated to the newly created CPU controller's root in cgroup_update_dfl_csses()). Existing code comes with a comment saying the "we don't support RT-tasks being in separate groups". Such comment is however stale and belongs to pre-RT_GROUP_SCHED times. Also, it doesn't make much sense for !RT_GROUP_ SCHED configurations, since checks related to RT bandwidth are not performed at all in these cases. Make moving RT tasks between CPU controller groups viable by removing special case check for RT (and DEADLINE) tasks. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutný Reviewed-by: Daniel Bristot de Oliveira Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Link: https://lkml.kernel.org/r/20190719063455.27328-1-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1bceb22dac18..042c736b2b73 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6995,10 +6995,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its -- cgit v1.2.3-59-g8ed1b From a1dc0446d64966dc0ae756aebdc449f335742c13 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 19 Jul 2019 21:23:19 -0400 Subject: sched/core: Silence a warning in sched_init() Compiling a kernel with both FAIR_GROUP_SCHED=n and RT_GROUP_SCHED=n will generate a compiler warning: kernel/sched/core.c: In function 'sched_init': kernel/sched/core.c:5906:32: warning: variable 'ptr' set but not used It is unnecessary to have both "alloc_size" and "ptr", so just combine them. Signed-off-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: valentin.schneider@arm.com Link: https://lkml.kernel.org/r/20190720012319.884-1-cai@lca.pw Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 042c736b2b73..46f3ca9e392a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6430,19 +6430,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - unsigned long alloc_size = 0, ptr; + unsigned long ptr = 0; int i; wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #ifdef CONFIG_RT_GROUP_SCHED - alloc_size += 2 * nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); #endif - if (alloc_size) { - ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **)ptr; -- cgit v1.2.3-59-g8ed1b From c1a280b68d4e6b6db4a65aa7865c22d8789ddf09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:37 +0200 Subject: sched/preempt: Use CONFIG_PREEMPTION where appropriate CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the preemption code, scheduler and init task over to use CONFIG_PREEMPTION. That's the first step towards RT in that area. The more complex changes are coming separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.117528401@linutronix.de Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 4 ++-- include/linux/preempt.h | 6 +++--- include/linux/sched.h | 6 +++--- init/init_task.c | 2 +- init/main.c | 2 +- kernel/sched/core.c | 14 +++++++------- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index c3046c920063..d683f5e6d791 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -78,11 +78,11 @@ static __always_inline bool should_resched(int preempt_offset) tif_need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() extern asmlinkage void preempt_schedule_notrace(void); #define __preempt_schedule_notrace() preempt_schedule_notrace() -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd92b1a93919..bbb68dba37cc 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -182,7 +182,7 @@ do { \ #define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ @@ -203,7 +203,7 @@ do { \ __preempt_schedule(); \ } while (0) -#else /* !CONFIG_PREEMPT */ +#else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ @@ -217,7 +217,7 @@ do { \ } while (0) #define preempt_check_resched() do { } while (0) -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..6947516a2d3e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,7 +1767,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION extern int _cond_resched(void); #else static inline int _cond_resched(void) { return 0; } @@ -1796,12 +1796,12 @@ static inline void cond_resched_rcu(void) /* * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; diff --git a/init/init_task.c b/init/init_task.c index 7ab773b9b3cd..bfe06c53b14e 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -174,7 +174,7 @@ struct task_struct init_task #ifdef CONFIG_FUNCTION_GRAPH_TRACER .ret_stack = NULL, #endif -#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) +#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION) .trace_recursion = 0, #endif #ifdef CONFIG_LIVEPATCH diff --git a/init/main.c b/init/main.c index 96f8d5af52d6..653693da8da6 100644 --- a/init/main.c +++ b/init/main.c @@ -433,7 +433,7 @@ noinline void __ref rest_init(void) /* * Enable might_sleep() and smp_processor_id() checks. - * They cannot be enabled earlier because with CONFIG_PREEMPT=y + * They cannot be enabled earlier because with CONFIG_PREEMPTION=y * kernel_thread() would trigger might_sleep() splats. With * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled * already, but it's stuck on the kthreadd_done completion. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..604a5e137efe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3581,7 +3581,7 @@ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) /* * If the value passed in is equal to the current preempt count @@ -3782,7 +3782,7 @@ again: * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets * called on the nearest possible occasion: * - * - If the kernel is preemptible (CONFIG_PREEMPT=y): + * - If the kernel is preemptible (CONFIG_PREEMPTION=y): * * - in syscall or exception context, at the next outmost * preempt_enable(). (this might be as soon as the wake_up()'s @@ -3791,7 +3791,7 @@ again: * - in IRQ context, return from interrupt-handler to * preemptible context * - * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) + * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) * then at the next: * * - cond_resched() call @@ -4033,7 +4033,7 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * this is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -4105,7 +4105,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * this is the entry point to schedule() from kernel preemption @@ -5416,7 +5416,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION int __sched _cond_resched(void) { if (should_resched(0)) { @@ -5433,7 +5433,7 @@ EXPORT_SYMBOL(_cond_resched); * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..aff9d76d8d65 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7430,7 +7430,7 @@ static int detach_tasks(struct lb_env *env) detached++; env->imbalance -= load; -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is detached to minimize diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..f2ce6ba1c5d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1943,7 +1943,7 @@ unsigned long arch_scale_freq_capacity(int cpu) #endif #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -1995,7 +1995,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) return ret; } -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. -- cgit v1.2.3-59-g8ed1b From 139d025cda1da5484e7287b35c019fe1dcf9b650 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Jul 2019 16:05:15 +0200 Subject: sched: Clean up active_mm reference counting The current active_mm reference counting is confusing and sub-optimal. Rewrite the code to explicitly consider the 4 separate cases: user -> user When switching between two user tasks, all we need to consider is switch_mm(). user -> kernel When switching from a user task to a kernel task (which doesn't have an associated mm) we retain the last mm in our active_mm. Increment a reference count on active_mm. kernel -> kernel When switching between kernel threads, all we need to do is pass along the active_mm reference. kernel -> user When switching between a kernel and user task, we must switch from the last active_mm to the next mm, hoping of course that these are the same. Decrement a reference on the active_mm. The code keeps a different order, because as you'll note, both 'to user' cases require switch_mm(). And where the old code would increment/decrement for the 'kernel -> kernel' case, the new code observes this is a neutral operation and avoids touching the reference count. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Reviewed-by: Mathieu Desnoyers Cc: luto@kernel.org --- kernel/sched/core.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 46f3ca9e392a..b4a44bc84749 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3214,12 +3214,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { - struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, prev, next); - mm = next->mm; - oldmm = prev->active_mm; /* * For paravirt, this is coupled with an exit in switch_to to * combine the page table reload and the switch backend into @@ -3228,22 +3224,37 @@ context_switch(struct rq *rq, struct task_struct *prev, arch_start_context_switch(prev); /* - * If mm is non-NULL, we pass through switch_mm(). If mm is - * NULL, we will pass through mmdrop() in finish_task_switch(). - * Both of these contain the full memory barrier required by - * membarrier after storing to rq->curr, before returning to - * user-space. + * kernel -> kernel lazy + transfer active + * user -> kernel lazy + mmgrab() active + * + * kernel -> user switch + mmdrop() active + * user -> user switch */ - if (!mm) { - next->active_mm = oldmm; - mmgrab(oldmm); - enter_lazy_tlb(oldmm, next); - } else - switch_mm_irqs_off(oldmm, mm, next); - - if (!prev->mm) { - prev->active_mm = NULL; - rq->prev_mm = oldmm; + if (!next->mm) { // to kernel + enter_lazy_tlb(prev->active_mm, next); + + next->active_mm = prev->active_mm; + if (prev->mm) // from user + mmgrab(prev->active_mm); + else + prev->active_mm = NULL; + } else { // to user + /* + * sys_membarrier() requires an smp_mb() between setting + * rq->curr and returning to userspace. + * + * The below provides this either through switch_mm(), or in + * case 'prev->active_mm == next->mm' through + * finish_task_switch()'s mmdrop(). + */ + + switch_mm_irqs_off(prev->active_mm, next->mm, next); + + if (!prev->mm) { // from kernel + /* will mmdrop() in finish_task_switch(). */ + rq->prev_mm = prev->active_mm; + prev->active_mm = NULL; + } } rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); -- cgit v1.2.3-59-g8ed1b From 5feeb7837a448f659e0aaa19fb446b1d9a4b323a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:38 +0000 Subject: sched: Fix kerneldoc comment for ia64_set_curr_task Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/fde3a65ea3091ec6b84dac3c19639f85f452c5d1.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4a44bc84749..9a821ff68502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6772,7 +6772,7 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. * -- cgit v1.2.3-59-g8ed1b From 10e7071b2f491b0fb981717ea0a585c441906ede Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Aug 2019 15:13:17 +0200 Subject: sched: Rework CPU hotplug task selection The CPU hotplug task selection is the only place where we used put_prev_task() on a task that is not current. While looking at that, it occured to me that we can simplify all that by by using a custom pick loop. Since we don't need to put current, we can do away with the fake task too. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan --- kernel/sched/core.c | 32 ++++++++++++++------------------ kernel/sched/sched.h | 1 + 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9a821ff68502..364b6d7da2be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6082,21 +6082,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } -static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +static struct task_struct *__pick_migrate_task(struct rq *rq) { -} + const struct sched_class *class; + struct task_struct *next; -static const struct sched_class fake_sched_class = { - .put_prev_task = put_prev_task_fake, -}; + for_each_class(class) { + next = class->pick_next_task(rq, NULL, NULL); + if (next) { + next->sched_class->put_prev_task(rq, next); + return next; + } + } -static struct task_struct fake_task = { - /* - * Avoid pull_{rt,dl}_task() - */ - .prio = MAX_PRIO + 1, - .sched_class = &fake_sched_class, -}; + /* The idle class should always have a runnable task */ + BUG(); +} /* * Migrate all tasks from the rq, sleeping tasks will be migrated by @@ -6139,12 +6140,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) if (rq->nr_running == 1) break; - /* - * pick_next_task() assumes pinned rq->lock: - */ - next = pick_next_task(rq, &fake_task, rf); - BUG_ON(!next); - put_prev_task(rq, next); + next = __pick_migrate_task(rq); /* * Rules for changing task_struct::cpus_mask are holding diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ea48aa5daeee..b3449d0dd7f0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1751,6 +1751,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { + WARN_ON_ONCE(rq->curr != prev); prev->sched_class->put_prev_task(rq, prev); } -- cgit v1.2.3-59-g8ed1b From 03b7fad167efca3b7abbbb39733933f9df56e79c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:41 +0000 Subject: sched: Add task_struct pointer to sched_class::set_curr_task In preparation of further separating pick_next_task() and set_curr_task() we have to pass the actual task into it, while there, rename the thing to better pair with put_prev_task(). Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/a96d1bcdd716db4a4c5da2fece647a1456c0ed78.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 12 ++++++------ kernel/sched/deadline.c | 7 +------ kernel/sched/fair.c | 17 ++++++++++++++--- kernel/sched/idle.c | 27 +++++++++++++++------------ kernel/sched/rt.c | 7 +------ kernel/sched/sched.h | 7 ++++--- kernel/sched/stop_task.c | 17 +++++++---------- 7 files changed, 48 insertions(+), 46 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 364b6d7da2be..0c4220789092 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1494,7 +1494,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -4325,7 +4325,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4392,7 +4392,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4840,7 +4840,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -6042,7 +6042,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6919,7 +6919,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); if (running) - set_curr_task(rq, tsk); + set_next_task(rq, tsk); task_rq_unlock(rq, tsk, &rf); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc2784b196c..6eae79350303 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1844,11 +1844,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task_dl(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2466,6 +2461,7 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, @@ -2476,7 +2472,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d8043fc8317..8ce1b8893947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10150,9 +10150,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -10423,7 +10433,9 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, @@ -10436,7 +10448,6 @@ const struct sched_class fair_sched_class = { .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..54194d41035c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,14 +374,25 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void set_next_task_idle(struct rq *rq, struct task_struct *next) +{ + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); +} + static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { + struct task_struct *next = rq->idle; + put_prev_task(rq, prev); - update_idle_core(rq); - schedstat_inc(rq->sched_goidle); + set_next_task_idle(rq, next); - return rq->idle; + return next; } /* @@ -397,10 +408,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) raw_spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) -{ -} - /* * scheduler tick hitting a task of our scheduling class. * @@ -413,10 +420,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_idle(struct rq *rq) -{ -} - static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); @@ -451,13 +454,13 @@ const struct sched_class idle_sched_class = { .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_idle, .task_tick = task_tick_idle, .get_rr_interval = get_rr_interval_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 40bb71004325..f71bcbe1a00c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2354,11 +2354,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) } } -static void set_curr_task_rt(struct rq *rq) -{ - set_next_task_rt(rq, rq->curr); -} - static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* @@ -2380,6 +2375,7 @@ const struct sched_class rt_sched_class = { .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, @@ -2391,7 +2387,6 @@ const struct sched_class rt_sched_class = { .switched_from = switched_from_rt, #endif - .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3449d0dd7f0..f3c50445bf22 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1707,6 +1707,7 @@ struct sched_class { struct task_struct *prev, struct rq_flags *rf); void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); @@ -1721,7 +1722,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); #endif - void (*set_curr_task)(struct rq *rq); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); void (*task_dead)(struct task_struct *p); @@ -1755,9 +1755,10 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); } -static inline void set_curr_task(struct rq *rq, struct task_struct *curr) +static inline void set_next_task(struct rq *rq, struct task_struct *next) { - curr->sched_class->set_curr_task(rq); + WARN_ON_ONCE(rq->curr != next); + next->sched_class->set_next_task(rq, next); } #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..47a3d2a18a9a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,6 +23,11 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } +static void set_next_task_stop(struct rq *rq, struct task_struct *stop) +{ + stop->se.exec_start = rq_clock_task(rq); +} + static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { @@ -32,8 +37,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf return NULL; put_prev_task(rq, prev); - - stop->se.exec_start = rq_clock_task(rq); + set_next_task_stop(rq, stop); return stop; } @@ -86,13 +90,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void set_curr_task_stop(struct rq *rq) -{ - struct task_struct *stop = rq->stop; - - stop->se.exec_start = rq_clock_task(rq); -} - static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ @@ -128,13 +125,13 @@ const struct sched_class stop_sched_class = { .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif - .set_curr_task = set_curr_task_stop, .task_tick = task_tick_stop, .get_rr_interval = get_rr_interval_stop, -- cgit v1.2.3-59-g8ed1b From 5f2a45fc9e89e022233085e6f0f352eb6ff770bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:43 +0000 Subject: sched: Allow put_prev_task() to drop rq->lock Currently the pick_next_task() loop is convoluted and ugly because of how it can drop the rq->lock and needs to restart the picking. For the RT/Deadline classes, it is put_prev_task() where we do balancing, and we could do this before the picking loop. Make this possible. Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider Cc: Aaron Lu Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/e4519f6850477ab7f3d257062796e6425ee4ba7c.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 14 +++++++++++++- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 14 +++++++++++++- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 2 +- 7 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c4220789092..7bbe78a31ba5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6090,7 +6090,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next); + next->sched_class->put_prev_task(rq, next, NULL); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6eae79350303..2872e15a87cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,13 +1804,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); + + if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c27eda9f24..4418c1998e69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6901,7 +6901,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 54194d41035c..8d59de2e4a6e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,7 +374,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f71bcbe1a00c..dbdabd76f192 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1592,7 +1592,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_rt(rq); @@ -1604,6 +1604,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } } #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 304d98e712bf..e085cffb8004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1710,7 +1710,7 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1756,7 +1756,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47a3d2a18a9a..8f414018d5e0 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -59,7 +59,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *curr = rq->curr; u64 delta_exec; -- cgit v1.2.3-59-g8ed1b From 67692435c411e5c53a1c588ecca2037aebd81f2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 May 2019 20:36:44 +0000 Subject: sched: Rework pick_next_task() slow-path Avoid the RETRY_TASK case in the pick_next_task() slow path. By doing the put_prev_task() early, we get the rt/deadline pull done, and by testing rq->nr_running we know if we need newidle_balance(). This then gives a stable state to pick a task from. Since the fast-path is fair only; it means the other classes will always have pick_next_task(.prev=NULL, .rf=NULL) and we can simplify. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mingo@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/aa34d24b36547139248f32a30138791ac6c02bd6.1559129225.git.vpillai@digitalocean.com --- kernel/sched/core.c | 19 ++++++++++++------- kernel/sched/deadline.c | 30 ++---------------------------- kernel/sched/fair.c | 9 ++++++--- kernel/sched/idle.c | 4 +++- kernel/sched/rt.c | 29 +---------------------------- kernel/sched/sched.h | 13 ++++++++----- kernel/sched/stop_task.c | 3 ++- 7 files changed, 34 insertions(+), 73 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bbe78a31ba5..a6661852907b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3791,7 +3791,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) @@ -3800,14 +3800,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -again: +restart: + /* + * Ensure that we put DL/RT tasks before the pick loop, such that they + * can PULL higher prio tasks when we lower the RQ 'priority'. + */ + prev->sched_class->put_prev_task(rq, prev, rf); + if (!rq->nr_running) + newidle_balance(rq, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq, NULL, NULL); + if (p) return p; - } } /* The idle class should always have a runnable task: */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2872e15a87cd..0b9cbfb2b1d4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1761,39 +1761,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct dl_rq *dl_rq; - dl_rq = &rq->dl; - - if (need_pull_dl_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_dl_task() can drop (and re-acquire) rq->lock; this - * means a stop task can slip in, in which case we need to - * re-start task selection. - */ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } + WARN_ON_ONCE(prev || rf); - /* - * When prev is DL, we may throttle it in put_prev_task(). - * So, we update time before we check for dl_nr_running. - */ - if (prev->sched_class == &dl_sched_class) - update_curr_dl(rq); + dl_rq = &rq->dl; if (unlikely(!dl_rq->dl_nr_running)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4418c1998e69..19c58599e967 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6770,7 +6770,7 @@ again: goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) + if (!prev || prev->sched_class != &fair_sched_class) goto simple; /* @@ -6847,8 +6847,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); @@ -6876,6 +6876,9 @@ done: __maybe_unused; return p; idle: + if (!rf) + return NULL; + new_tasks = newidle_balance(rq, rf); /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8d59de2e4a6e..7c54550dda6a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -389,7 +389,9 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *next = rq->idle; - put_prev_task(rq, prev); + if (prev) + put_prev_task(rq, prev); + set_next_task_idle(rq, next); return next; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbdabd76f192..858c4cc6f99b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1553,38 +1553,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we're - * being very careful to re-start the picking loop. - */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this - * means a dl or stop task can slip in, in which case we need - * to re-start task selection. - */ - if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || - rq->dl.dl_nr_running)) - return RETRY_TASK; - } - - /* - * We may dequeue prev's rt_rq in put_prev_task(). - * So, we update time before rt_queued check. - */ - if (prev->sched_class == &rt_sched_class) - update_curr_rt(rq); + WARN_ON_ONCE(prev || rf); if (!rt_rq->rt_queued) return NULL; - put_prev_task(rq, prev); - p = _pick_next_task_rt(rq); set_next_task_rt(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e085cffb8004..7111e3a1eeb4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1700,12 +1700,15 @@ struct sched_class { void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); /* - * It is the responsibility of the pick_next_task() method that will - * return the next task to call put_prev_task() on the @prev task or - * something equivalent. + * Both @prev and @rf are optional and may be NULL, in which case the + * caller must already have invoked put_prev_task(rq, prev, rf). * - * May return RETRY_TASK when it finds a higher prio class has runnable - * tasks. + * Otherwise it is the responsibility of the pick_next_task() to call + * put_prev_task() on the @prev task or something equivalent, IFF it + * returns a next task. + * + * In that case (@rf != NULL) it may return RETRY_TASK when it finds a + * higher prio class has runnable tasks. */ struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 8f414018d5e0..7e1cee4e65b2 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -33,10 +33,11 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf { struct task_struct *stop = rq->stop; + WARN_ON_ONCE(prev || rf); + if (!stop || !task_on_rq_queued(stop)) return NULL; - put_prev_task(rq, prev); set_next_task_stop(rq, stop); return stop; -- cgit v1.2.3-59-g8ed1b From 2480c093130f64ac3a410504fa8b3db1fc4b87ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:06 +0100 Subject: sched/uclamp: Extend CPU's cgroup controller The cgroup CPU bandwidth controller allows to assign a specified (maximum) bandwidth to the tasks of a group. However this bandwidth is defined and enforced only on a temporal base, without considering the actual frequency a CPU is running on. Thus, the amount of computation completed by a task within an allocated bandwidth can be very different depending on the actual frequency the CPU is running that task. The amount of computation can be affected also by the specific CPU a task is running on, especially when running on asymmetric capacity systems like Arm's big.LITTLE. With the availability of schedutil, the scheduler is now able to drive frequency selections based on actual task utilization. Moreover, the utilization clamping support provides a mechanism to bias the frequency selection operated by schedutil depending on constraints assigned to the tasks currently RUNNABLE on a CPU. Giving the mechanisms described above, it is now possible to extend the cpu controller to specify the minimum (or maximum) utilization which should be considered for tasks RUNNABLE on a cpu. This makes it possible to better defined the actual computational power assigned to task groups, thus improving the cgroup CPU bandwidth controller which is currently based just on time constraints. Extend the CPU controller with a couple of new attributes uclamp.{min,max} which allow to enforce utilization boosting and capping for all the tasks in a group. Specifically: - uclamp.min: defines the minimum utilization which should be considered i.e. the RUNNABLE tasks of this group will run at least at a minimum frequency which corresponds to the uclamp.min utilization - uclamp.max: defines the maximum utilization which should be considered i.e. the RUNNABLE tasks of this group will run up to a maximum frequency which corresponds to the uclamp.max utilization These attributes: a) are available only for non-root nodes, both on default and legacy hierarchies, while system wide clamps are defined by a generic interface which does not depends on cgroups. This system wide interface enforces constraints on tasks in the root node. b) enforce effective constraints at each level of the hierarchy which are a restriction of the group requests considering its parent's effective constraints. Root group effective constraints are defined by the system wide interface. This mechanism allows each (non-root) level of the hierarchy to: - request whatever clamp values it would like to get - effectively get only up to the maximum amount allowed by its parent c) have higher priority than task-specific clamps, defined via sched_setattr(), thus allowing to control and restrict task requests. Add two new attributes to the cpu controller to collect "requested" clamp values. Allow that at each non-root level of the hierarchy. Keep it simple by not caring now about "effective" values computation and propagation along the hierarchy. Update sysctl_sched_uclamp_handler() to use the newly introduced uclamp_mutex so that we serialize system default updates with cgroup relate updates. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- Documentation/admin-guide/cgroup-v2.rst | 34 ++++++ init/Kconfig | 22 ++++ kernel/sched/core.c | 193 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 8 ++ 4 files changed, 253 insertions(+), 4 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 3b29005aa981..5f1c266131b0 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -951,6 +951,13 @@ controller implements weight and absolute bandwidth limit models for normal scheduling policy and absolute bandwidth allocation model for realtime scheduling policy. +In all the above models, cycles distribution is defined only on a temporal +base and it does not account for the frequency at which tasks are executed. +The (optional) utilization clamping support allows to hint the schedutil +cpufreq governor about the minimum desired frequency which should always be +provided by a CPU, as well as the maximum desired frequency, which should not +be exceeded by a CPU. + WARNING: cgroup2 doesn't yet support control of realtime processes and the cpu controller can only be enabled when all RT processes are in the root cgroup. Be aware that system management software may already @@ -1016,6 +1023,33 @@ All time durations are in microseconds. Shows pressure stall information for CPU. See Documentation/accounting/psi.rst for details. + cpu.uclamp.min + A read-write single value file which exists on non-root cgroups. + The default is "0", i.e. no utilization boosting. + + The requested minimum utilization (protection) as a percentage + rational number, e.g. 12.34 for 12.34%. + + This interface allows reading and setting minimum utilization clamp + values similar to the sched_setattr(2). This minimum utilization + value is used to clamp the task specific minimum utilization clamp. + + The requested minimum utilization (protection) is always capped by + the current value for the maximum utilization (limit), i.e. + `cpu.uclamp.max`. + + cpu.uclamp.max + A read-write single value file which exists on non-root cgroups. + The default is "max". i.e. no utilization capping + + The requested maximum utilization (limit) as a percentage rational + number, e.g. 98.76 for 98.76%. + + This interface allows reading and setting maximum utilization clamp + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + + Memory ------ diff --git a/init/Kconfig b/init/Kconfig index bd7d650d4a99..ac285cfa78b6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -928,6 +928,28 @@ config RT_GROUP_SCHED endif #CGROUP_SCHED +config UCLAMP_TASK_GROUP + bool "Utilization clamping per group of tasks" + depends on CGROUP_SCHED + depends on UCLAMP_TASK + default n + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks currently scheduled on that CPU. + + When this option is enabled, the user can specify a min and max + CPU bandwidth which is allowed for each single task in a group. + The max bandwidth allows to clamp the maximum frequency a task + can use, while the min bandwidth allows to define a minimum + frequency a task will always use. + + When task group based utilization clamping is enabled, an eventually + specified task-specific clamp value is constrained by the cgroup + specified clamp value. Both minimum and maximum task clamping cannot + be bigger than the corresponding clamping defined at task group level. + + If in doubt, say N. + config CGROUP_PIDS bool "PIDs controller" help diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6661852907b..c186abed5c6d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load) } #ifdef CONFIG_UCLAMP_TASK +/* + * Serializes updates of utilization clamp values + * + * The (slow-path) user-space triggers utilization clamp value updates which + * can require updates on (fast-path) scheduler's data structures used to + * support enqueue/dequeue operations. + * While the per-CPU rq lock protects fast-path update operations, user-space + * requests are serialized using a mutex to reduce the risk of conflicting + * updates or API abuses. + */ +static DEFINE_MUTEX(uclamp_mutex); + /* Max allowed minimum utilization */ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; @@ -1010,10 +1022,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, loff_t *ppos) { int old_min, old_max; - static DEFINE_MUTEX(mutex); int result; - mutex_lock(&mutex); + mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; @@ -1048,7 +1059,7 @@ undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; done: - mutex_unlock(&mutex); + mutex_unlock(&uclamp_mutex); return result; } @@ -1137,6 +1148,8 @@ static void __init init_uclamp(void) unsigned int clamp_id; int cpu; + mutex_init(&uclamp_mutex); + for_each_possible_cpu(cpu) { memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); cpu_rq(cpu)->uclamp_flags = 0; @@ -1149,8 +1162,12 @@ static void __init init_uclamp(void) /* System defaults allow max clamp values for both indexes */ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); - for_each_clamp_id(clamp_id) + for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; +#ifdef CONFIG_UCLAMP_TASK_GROUP + root_task_group.uclamp_req[clamp_id] = uc_max; +#endif + } } #else /* CONFIG_UCLAMP_TASK */ @@ -6798,6 +6815,19 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +static inline void alloc_uclamp_sched_group(struct task_group *tg, + struct task_group *parent) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + int clamp_id; + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&tg->uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } +#endif +} + static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -6821,6 +6851,8 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; + alloc_uclamp_sched_group(tg, parent); + return tg; err: @@ -7037,6 +7069,131 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) sched_move_task(task); } +#ifdef CONFIG_UCLAMP_TASK_GROUP + +/* + * Integer 10^N with a given N exponent by casting to integer the literal "1eN" + * C expression. Since there is no way to convert a macro argument (N) into a + * character constant, use two levels of macros. + */ +#define _POW10(exp) ((unsigned int)1e##exp) +#define POW10(exp) _POW10(exp) + +struct uclamp_request { +#define UCLAMP_PERCENT_SHIFT 2 +#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT)) + s64 percent; + u64 util; + int ret; +}; + +static inline struct uclamp_request +capacity_from_percent(char *buf) +{ + struct uclamp_request req = { + .percent = UCLAMP_PERCENT_SCALE, + .util = SCHED_CAPACITY_SCALE, + .ret = 0, + }; + + buf = strim(buf); + if (strcmp(buf, "max")) { + req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, + &req.percent); + if (req.ret) + return req; + if (req.percent > UCLAMP_PERCENT_SCALE) { + req.ret = -ERANGE; + return req; + } + + req.util = req.percent << SCHED_CAPACITY_SHIFT; + req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE); + } + + return req; +} + +static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off, + enum uclamp_id clamp_id) +{ + struct uclamp_request req; + struct task_group *tg; + + req = capacity_from_percent(buf); + if (req.ret) + return req.ret; + + mutex_lock(&uclamp_mutex); + rcu_read_lock(); + + tg = css_tg(of_css(of)); + if (tg->uclamp_req[clamp_id].value != req.util) + uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false); + + /* + * Because of not recoverable conversion rounding we keep track of the + * exact requested value + */ + tg->uclamp_pct[clamp_id] = req.percent; + + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); + + return nbytes; +} + +static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN); +} + +static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX); +} + +static inline void cpu_uclamp_print(struct seq_file *sf, + enum uclamp_id clamp_id) +{ + struct task_group *tg; + u64 util_clamp; + u64 percent; + u32 rem; + + rcu_read_lock(); + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + rcu_read_unlock(); + + if (util_clamp == SCHED_CAPACITY_SCALE) { + seq_puts(sf, "max\n"); + return; + } + + percent = tg->uclamp_pct[clamp_id]; + percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem); + seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem); +} + +static int cpu_uclamp_min_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MIN); + return 0; +} + +static int cpu_uclamp_max_show(struct seq_file *sf, void *v) +{ + cpu_uclamp_print(sf, UCLAMP_MAX); + return 0; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) @@ -7381,6 +7538,20 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_rt_period_read_uint, .write_u64 = cpu_rt_period_write_uint, }, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, #endif { } /* Terminate */ }; @@ -7548,6 +7719,20 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, +#endif +#ifdef CONFIG_UCLAMP_TASK_GROUP + { + .name = "uclamp.min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_min_show, + .write = cpu_uclamp_min_write, + }, + { + .name = "uclamp.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_uclamp_max_show, + .write = cpu_uclamp_max_write, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7111e3a1eeb4..ae1be61fb279 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -391,6 +391,14 @@ struct task_group { #endif struct cfs_bandwidth cfs_bandwidth; + +#ifdef CONFIG_UCLAMP_TASK_GROUP + /* The two decimal precision [%] value requested from user-space */ + unsigned int uclamp_pct[UCLAMP_CNT]; + /* Clamp values requested for a task group */ + struct uclamp_se uclamp_req[UCLAMP_CNT]; +#endif + }; #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3-59-g8ed1b From 0b60ba2dd342016e4e717dbaa4ca9af3a43f4434 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:07 +0100 Subject: sched/uclamp: Propagate parent clamps In order to properly support hierarchical resources control, the cgroup delegation model requires that attribute writes from a child group never fail but still are locally consistent and constrained based on parent's assigned resources. This requires to properly propagate and aggregate parent attributes down to its descendants. Implement this mechanism by adding a new "effective" clamp value for each task group. The effective clamp value is defined as the smaller value between the clamp value of a group and the effective clamp value of its parent. This is the actual clamp value enforced on tasks in a task group. Since it's possible for a cpu.uclamp.min value to be bigger than the cpu.uclamp.max value, ensure local consistency by restricting each "protection" (i.e. min utilization) with the corresponding "limit" (i.e. max utilization). Do that at effective clamps propagation to ensure all user-space write never fails while still always tracking the most restrictive values. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 2 files changed, 46 insertions(+) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c186abed5c6d..8855481857b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1166,6 +1166,7 @@ static void __init init_uclamp(void) uclamp_default[clamp_id] = uc_max; #ifdef CONFIG_UCLAMP_TASK_GROUP root_task_group.uclamp_req[clamp_id] = uc_max; + root_task_group.uclamp[clamp_id] = uc_max; #endif } } @@ -6824,6 +6825,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg, for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false); + tg->uclamp[clamp_id] = parent->uclamp[clamp_id]; } #endif } @@ -7070,6 +7072,45 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) } #ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys_state *top_css = css; + struct uclamp_se *uc_parent = NULL; + struct uclamp_se *uc_se = NULL; + unsigned int eff[UCLAMP_CNT]; + unsigned int clamp_id; + unsigned int clamps; + + css_for_each_descendant_pre(css, top_css) { + uc_parent = css_tg(css)->parent + ? css_tg(css)->parent->uclamp : NULL; + + for_each_clamp_id(clamp_id) { + /* Assume effective clamps matches requested clamps */ + eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value; + /* Cap effective clamps with parent's effective clamps */ + if (uc_parent && + eff[clamp_id] > uc_parent[clamp_id].value) { + eff[clamp_id] = uc_parent[clamp_id].value; + } + } + /* Ensure protection is always capped by limit */ + eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]); + + /* Propagate most restrictive effective clamps */ + clamps = 0x0; + uc_se = css_tg(css)->uclamp; + for_each_clamp_id(clamp_id) { + if (eff[clamp_id] == uc_se[clamp_id].value) + continue; + uc_se[clamp_id].value = eff[clamp_id]; + uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); + clamps |= (0x1 << clamp_id); + } + if (!clamps) + css = css_rightmost_descendant(css); + } +} /* * Integer 10^N with a given N exponent by casting to integer the literal "1eN" @@ -7138,6 +7179,9 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, */ tg->uclamp_pct[clamp_id] = req.percent; + /* Update effective clamps to track the most restrictive value */ + cpu_util_update_eff(of_css(of)); + rcu_read_unlock(); mutex_unlock(&uclamp_mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ae1be61fb279..5b343112a47b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -397,6 +397,8 @@ struct task_group { unsigned int uclamp_pct[UCLAMP_CNT]; /* Clamp values requested for a task group */ struct uclamp_se uclamp_req[UCLAMP_CNT]; + /* Effective clamp values used for a task group */ + struct uclamp_se uclamp[UCLAMP_CNT]; #endif }; -- cgit v1.2.3-59-g8ed1b From 7274a5c1bbec45f06f1fff4b8c8b5855b6cc189d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:08 +0100 Subject: sched/uclamp: Propagate system defaults to the root group The clamp values are not tunable at the level of the root task group. That's for two main reasons: - the root group represents "system resources" which are always entirely available from the cgroup standpoint. - when tuning/restricting "system resources" makes sense, tuning must be done using a system wide API which should also be available when control groups are not. When a system wide restriction is available, cgroups should be aware of its value in order to know exactly how much "system resources" are available for the subgroups. Utilization clamping supports already the concepts of: - system defaults: which define the maximum possible clamp values usable by tasks. - effective clamps: which allows a parent cgroup to constraint (maybe temporarily) its descendants without losing the information related to the values "requested" from them. Exploit these two concepts and bind them together in such a way that, whenever system default are tuned, the new values are propagated to (possibly) restrict or relax the "effective" value of nested cgroups. When cgroups are in use, force an update of all the RUNNABLE tasks. Otherwise, keep things simple and do just a lazy update next time each task will be enqueued. Do that since we assume a more strict resource control is required when cgroups are in use. This allows also to keep "effective" clamp values updated in case we need to expose them to user-space. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8855481857b5..e6800fe040ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1017,10 +1017,30 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +#ifdef CONFIG_UCLAMP_TASK_GROUP +static void cpu_util_update_eff(struct cgroup_subsys_state *css); +static void uclamp_update_root_tg(void) +{ + struct task_group *tg = &root_task_group; + + uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], + sysctl_sched_uclamp_util_min, false); + uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], + sysctl_sched_uclamp_util_max, false); + + rcu_read_lock(); + cpu_util_update_eff(&root_task_group.css); + rcu_read_unlock(); +} +#else +static void uclamp_update_root_tg(void) { } +#endif + int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + bool update_root_tg = false; int old_min, old_max; int result; @@ -1043,16 +1063,23 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); + update_root_tg = true; } if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); + update_root_tg = true; } + if (update_root_tg) + uclamp_update_root_tg(); + /* - * Updating all the RUNNABLE task is expensive, keep it simple and do - * just a lazy update at each next enqueue time. + * We update all RUNNABLE tasks only when task groups are in use. + * Otherwise, keep it simple and do just a lazy update at each next + * task enqueue time. */ + goto done; undo: -- cgit v1.2.3-59-g8ed1b From 3eac870a324728e5d17118888840dad70bcd37f3 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:09 +0100 Subject: sched/uclamp: Use TG's clamps to restrict TASK's clamps When a task specific clamp value is configured via sched_setattr(2), this value is accounted in the corresponding clamp bucket every time the task is {en,de}qeued. However, when cgroups are also in use, the task specific clamp values could be restricted by the task_group (TG) clamp values. Update uclamp_cpu_inc() to aggregate task and TG clamp values. Every time a task is enqueued, it's accounted in the clamp bucket tracking the smaller clamp between the task specific value and its TG effective value. This allows to: 1. ensure cgroup clamps are always used to restrict task specific requests, i.e. boosted not more than its TG effective protection and capped at least as its TG effective limit. 2. implement a "nice-like" policy, where tasks are still allowed to request less than what enforced by their TG effective limits and protections Do this by exploiting the concept of "effective" clamp, which is already used by a TG to track parent enforced restrictions. Apply task group clamp restrictions only to tasks belonging to a child group. While, for tasks in the root group or in an autogroup, system defaults are still enforced. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e6800fe040ea..c32ac071c203 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -873,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static inline struct uclamp_se +uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct uclamp_se uc_max; + + /* + * Tasks in autogroups or root task group will be + * restricted by system defaults. + */ + if (task_group_is_autogroup(task_group(p))) + return uc_req; + if (task_group(p) == &root_task_group) + return uc_req; + + uc_max = task_group(p)->uclamp[clamp_id]; + if (uc_req.value > uc_max.value || !uc_req.user_defined) + return uc_max; +#endif + + return uc_req; +} + /* * The effective clamp bucket index of a task depends on, by increasing * priority: * - the task specific clamp value, when explicitly requested from userspace + * - the task group effective clamp value, for tasks not either in the root + * group or in an autogroup * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) { - struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; /* System default restrictions always apply */ -- cgit v1.2.3-59-g8ed1b From babbe170e053c6ec2343751749995b7b9fd5fd2c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:10 +0100 Subject: sched/uclamp: Update CPU's refcount on TG's clamp changes On updates of task group (TG) clamp values, ensure that these new values are enforced on all RUNNABLE tasks of the task group, i.e. all RUNNABLE tasks are immediately boosted and/or capped as requested. Do that each time we update effective clamps from cpu_util_update_eff(). Use the *cgroup_subsys_state (css) to walk the list of tasks in each affected TG and update their RUNNABLE tasks. Update each task by using the same mechanism used for cpu affinity masks updates, i.e. by taking the rq lock. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c32ac071c203..55a1c07045ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1043,6 +1043,54 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void +uclamp_update_active(struct task_struct *p, unsigned int clamp_id) +{ + struct rq_flags rf; + struct rq *rq; + + /* + * Lock the task and the rq where the task is (or was) queued. + * + * We might lock the (previous) rq of a !RUNNABLE task, but that's the + * price to pay to safely serialize util_{min,max} updates with + * enqueues, dequeues and migration operations. + * This is the same locking schema used by __set_cpus_allowed_ptr(). + */ + rq = task_rq_lock(p, &rf); + + /* + * Setting the clamp bucket is serialized by task_rq_lock(). + * If the task is not yet RUNNABLE and its task_struct is not + * affecting a valid clamp bucket, the next time it's enqueued, + * it will already see the updated clamp bucket value. + */ + if (!p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } + + task_rq_unlock(rq, p, &rf); +} + +static inline void +uclamp_update_active_tasks(struct cgroup_subsys_state *css, + unsigned int clamps) +{ + struct css_task_iter it; + struct task_struct *p; + unsigned int clamp_id; + + css_task_iter_start(css, 0, &it); + while ((p = css_task_iter_next(&it))) { + for_each_clamp_id(clamp_id) { + if ((0x1 << clamp_id) & clamps) + uclamp_update_active(p, clamp_id); + } + } + css_task_iter_end(&it); +} + #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) @@ -7160,8 +7208,13 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]); clamps |= (0x1 << clamp_id); } - if (!clamps) + if (!clamps) { css = css_rightmost_descendant(css); + continue; + } + + /* Immediately update descendants RUNNABLE tasks */ + uclamp_update_active_tasks(css, clamps); } } -- cgit v1.2.3-59-g8ed1b From 0413d7f33e60751570fd6c179546bde2f7d82dcb Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 22 Aug 2019 14:28:11 +0100 Subject: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values The supported clamp indexes are defined in 'enum clamp_id', however, because of the code logic in some of the first utilization clamping series version, sometimes we needed to use 'unsigned int' to represent indices. This is not more required since the final version of the uclamp_* APIs can always use the proper enum uclamp_id type. Fix it with a bulk rename now that we have all the bits merged. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 +++++++++++++++++++------------------- kernel/sched/sched.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel/sched/core.c') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55a1c07045ff..3c7b90bcbe4e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -810,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline unsigned int uclamp_none(int clamp_id) +static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -826,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se, } static inline unsigned int -uclamp_idle_value(struct rq *rq, unsigned int clamp_id, +uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* @@ -842,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id, return uclamp_none(UCLAMP_MIN); } -static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, +static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { /* Reset max-clamp retention only on idle exit */ @@ -853,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, } static inline -unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, - unsigned int clamp_value) +enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, + unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; int bucket_id = UCLAMP_BUCKETS - 1; @@ -874,7 +874,7 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, } static inline struct uclamp_se -uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) +uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -906,7 +906,7 @@ uclamp_tg_restrict(struct task_struct *p, unsigned int clamp_id) * - the system default clamp value, defined by the sysadmin */ static inline struct uclamp_se -uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id); struct uclamp_se uc_max = uclamp_default[clamp_id]; @@ -918,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) return uc_req; } -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; @@ -942,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) * for each bucket when all its RUNNABLE tasks require the same clamp. */ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -980,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, * enforce the expected state and warn. */ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, - unsigned int clamp_id) + enum uclamp_id clamp_id) { struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; struct uclamp_se *uc_se = &p->uclamp[clamp_id]; @@ -1019,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1034,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; if (unlikely(!p->sched_class->uclamp_enabled)) return; @@ -1044,7 +1044,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) } static inline void -uclamp_update_active(struct task_struct *p, unsigned int clamp_id) +uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) { struct rq_flags rf; struct rq *rq; @@ -1077,9 +1077,9 @@ static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) { + enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; - unsigned int clamp_id; css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) { @@ -1187,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { - unsigned int clamp_id; + enum uclamp_id clamp_id; /* * On scheduling class change, reset to default clamps for tasks @@ -1224,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p, static void uclamp_fork(struct task_struct *p) { - unsigned int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1246,7 +1246,7 @@ static void uclamp_fork(struct task_struct *p) static void __init init_uclamp(void) { struct uclamp_se uc_max = {}; - unsigned int clamp_id; + enum uclamp_id clamp_id; int cpu; mutex_init(&uclamp_mutex); @@ -6921,7 +6921,7 @@ static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { #ifdef CONFIG_UCLAMP_TASK_GROUP - int clamp_id; + enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], @@ -7179,7 +7179,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_se = NULL; unsigned int eff[UCLAMP_CNT]; - unsigned int clamp_id; + enum uclamp_id clamp_id; unsigned int clamps; css_for_each_descendant_pre(css, top_css) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b343112a47b..00ff5b57e9cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2281,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ #ifdef CONFIG_UCLAMP_TASK -unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); +enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); static __always_inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, -- cgit v1.2.3-59-g8ed1b