diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 793 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 16 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 8 | ||||
-rw-r--r-- | kernel/sched/debug.c | 10 | ||||
-rw-r--r-- | kernel/sched/fair.c | 211 | ||||
-rw-r--r-- | kernel/sched/sched.h | 31 | ||||
-rw-r--r-- | kernel/sched/topology.c | 65 |
7 files changed, 901 insertions, 233 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d9ff40f4661..c4462c454ab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex); static atomic_t sched_core_count; static struct cpumask sched_core_mask; +static void sched_core_lock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t, i = 0; + + local_irq_save(*flags); + for_each_cpu(t, smt_mask) + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); +} + +static void sched_core_unlock(int cpu, unsigned long *flags) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + int t; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_restore(*flags); +} + static void __sched_core_flip(bool enabled) { - int cpu, t, i; + unsigned long flags; + int cpu, t; cpus_read_lock(); @@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled) for_each_cpu(cpu, &sched_core_mask) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - i = 0; - local_irq_disable(); - for_each_cpu(t, smt_mask) { - /* supports up to SMT8 */ - raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); - } + sched_core_lock(cpu, &flags); for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; - for_each_cpu(t, smt_mask) - raw_spin_unlock(&cpu_rq(t)->__lock); - local_irq_enable(); + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); } @@ -993,6 +1007,7 @@ int get_nohz_timer_target(void) { int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; + const struct cpumask *hk_mask; if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { if (!idle_cpu(cpu)) @@ -1000,10 +1015,11 @@ int get_nohz_timer_target(void) default_cpu = cpu; } + hk_mask = housekeeping_cpumask(HK_FLAG_TIMER); + rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu_and(i, sched_domain_span(sd), - housekeeping_cpumask(HK_FLAG_TIMER)) { + for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; @@ -1619,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void uclamp_update_active(struct task_struct *p) { @@ -1642,12 +1675,8 @@ uclamp_update_active(struct task_struct *p) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - for_each_clamp_id(clamp_id) { - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } @@ -1981,12 +2010,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -/* - * __normal_prio - return the priority that is based on the static prio - */ -static inline int __normal_prio(struct task_struct *p) +static inline int __normal_prio(int policy, int rt_prio, int nice) { - return p->static_prio; + int prio; + + if (dl_policy(policy)) + prio = MAX_DL_PRIO - 1; + else if (rt_policy(policy)) + prio = MAX_RT_PRIO - 1 - rt_prio; + else + prio = NICE_TO_PRIO(nice); + + return prio; } /* @@ -1998,15 +2033,7 @@ static inline int __normal_prio(struct task_struct *p) */ static inline int normal_prio(struct task_struct *p) { - int prio; - - if (task_has_dl_policy(p)) - prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; - else - prio = __normal_prio(p); - return prio; + return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio)); } /* @@ -2163,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) /* Non kernel threads are not allowed during either online or offline. */ if (!(p->flags & PF_KTHREAD)) - return cpu_active(cpu); + return cpu_active(cpu) && task_cpu_possible(cpu, p); /* KTHREAD_IS_PER_CPU is always allowed. */ if (kthread_is_per_cpu(p)) @@ -2470,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, + int node) +{ + if (!src->user_cpus_ptr) + return 0; + + dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); + if (!dst->user_cpus_ptr) + return -ENOMEM; + + cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + return 0; +} + +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = NULL; + + swap(p->user_cpus_ptr, user_mask); + + return user_mask; +} + +void release_user_cpus_ptr(struct task_struct *p) +{ + kfree(clear_user_cpus_ptr(p)); +} + /* * This function is wildly self concurrent; here be dragons. * @@ -2687,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } /* - * Change a given task's CPU affinity. Migrate the thread to a - * proper CPU and schedule it away if the CPU it's executing on - * is removed from the allowed bitmask. - * - * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The - * call is not atomic; no spinlocks may be held. + * Called with both p->pi_lock and rq->lock held; drops both before returning. */ -static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) +static int __set_cpus_allowed_ptr_locked(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags, + struct rq *rq, + struct rq_flags *rf) + __releases(rq->lock) + __releases(p->pi_lock) { + const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; + bool kthread = p->flags & PF_KTHREAD; + struct cpumask *user_mask = NULL; unsigned int dest_cpu; - struct rq_flags rf; - struct rq *rq; int ret = 0; - rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, * however, during cpu-hot-unplug, even these might get pushed @@ -2722,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } + if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + ret = -EINVAL; + goto out; + } + /* * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. @@ -2756,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - return affine_move_task(rq, p, &rf, dest_cpu, flags); + if (flags & SCA_USER) + user_mask = clear_user_cpus_ptr(p); + + ret = affine_move_task(rq, p, rf, dest_cpu, flags); + + kfree(user_mask); + + return ret; out: - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, rf); return ret; } +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, u32 flags) +{ + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); +} + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); +/* + * Change a given task's CPU affinity to the intersection of its current + * affinity mask and @subset_mask, writing the resulting mask to @new_mask + * and pointing @p->user_cpus_ptr to a copy of the old mask. + * If the resulting mask is empty, leave the affinity unchanged and return + * -EINVAL. + */ +static int restrict_cpus_allowed_ptr(struct task_struct *p, + struct cpumask *new_mask, + const struct cpumask *subset_mask) +{ + struct cpumask *user_mask = NULL; + struct rq_flags rf; + struct rq *rq; + int err; + + if (!p->user_cpus_ptr) { + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) + return -ENOMEM; + } + + rq = task_rq_lock(p, &rf); + + /* + * Forcefully restricting the affinity of a deadline task is + * likely to cause problems, so fail and noisily override the + * mask entirely. + */ + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + err = -EPERM; + goto err_unlock; + } + + if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + err = -EINVAL; + goto err_unlock; + } + + /* + * We're about to butcher the task affinity, so keep track of what + * the user asked for in case we're able to restore it later on. + */ + if (user_mask) { + cpumask_copy(user_mask, p->cpus_ptr); + p->user_cpus_ptr = user_mask; + } + + return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + +err_unlock: + task_rq_unlock(rq, p, &rf); + kfree(user_mask); + return err; +} + +/* + * Restrict the CPU affinity of task @p so that it is a subset of + * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * old affinity mask. If the resulting mask is empty, we warn and walk + * up the cpuset hierarchy until we find a suitable mask. + */ +void force_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + cpumask_var_t new_mask; + const struct cpumask *override_mask = task_cpu_possible_mask(p); + + alloc_cpumask_var(&new_mask, GFP_KERNEL); + + /* + * __migrate_task() can fail silently in the face of concurrent + * offlining of the chosen destination CPU, so take the hotplug + * lock to ensure that the migration succeeds. + */ + cpus_read_lock(); + if (!cpumask_available(new_mask)) + goto out_set_mask; + + if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask)) + goto out_free_mask; + + /* + * We failed to find a valid subset of the affinity mask for the + * task, so override it based on its cpuset hierarchy. + */ + cpuset_cpus_allowed(p, new_mask); + override_mask = new_mask; + +out_set_mask: + if (printk_ratelimit()) { + printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n", + task_pid_nr(p), p->comm, + cpumask_pr_args(override_mask)); + } + + WARN_ON(set_cpus_allowed_ptr(p, override_mask)); +out_free_mask: + cpus_read_unlock(); + free_cpumask_var(new_mask); +} + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); + +/* + * Restore the affinity of a task @p which was previously restricted by a + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) + * @p->user_cpus_ptr. + * + * It is the caller's responsibility to serialise this with any calls to + * force_compatible_cpus_allowed_ptr(@p). + */ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p) +{ + struct cpumask *user_mask = p->user_cpus_ptr; + unsigned long flags; + + /* + * Try to restore the old affinity mask. If this fails, then + * we free the mask explicitly to avoid it being inherited across + * a subsequent fork(). + */ + if (!user_mask || !__sched_setaffinity(p, user_mask)) + return; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + user_mask = clear_user_cpus_ptr(p); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + kfree(user_mask); +} + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3114,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* Look for allowed, online CPU in same node. */ for_each_cpu(dest_cpu, nodemask) { - if (!cpu_active(dest_cpu)) - continue; - if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) + if (is_cpu_allowed(p, dest_cpu)) return dest_cpu; } } @@ -3133,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ switch (state) { case cpuset: - if (IS_ENABLED(CONFIG_CPUSETS)) { - cpuset_cpus_allowed_fallback(p); + if (cpuset_cpus_allowed_fallback(p)) { state = possible; break; } @@ -3146,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, cpu_possible_mask); + do_set_cpus_allowed(p, task_cpu_possible_mask(p)); state = fail; break; - case fail: BUG(); break; @@ -3564,6 +3776,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } /* + * Invoked from try_to_wake_up() to check whether the task can be woken up. + * + * The caller holds p::pi_lock if p != current or has preemption + * disabled when p == current. + * + * The rules of PREEMPT_RT saved_state: + * + * The related locking code always holds p::pi_lock when updating + * p::saved_state, which means the code is fully serialized in both cases. + * + * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other + * bits set. This allows to distinguish all wakeup scenarios. + */ +static __always_inline +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) +{ + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { + WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) && + state != TASK_RTLOCK_WAIT); + } + + if (READ_ONCE(p->__state) & state) { + *success = 1; + return true; + } + +#ifdef CONFIG_PREEMPT_RT + /* + * Saved state preserves the task state across blocking on + * an RT lock. If the state matches, set p::saved_state to + * TASK_RUNNING, but do not wake the task because it waits + * for a lock wakeup. Also indicate success because from + * the regular waker's point of view this has succeeded. + * + * After acquiring the lock the task will restore p::__state + * from p::saved_state which ensures that the regular + * wakeup is not lost. The restore will also set + * p::saved_state to TASK_RUNNING so any further tests will + * not result in false positives vs. @success + */ + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + *success = 1; + } +#endif + return false; +} + +/* * Notes on Program-Order guarantees on SMP systems. * * MIGRATION @@ -3702,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto out; - success = 1; trace_sched_waking(p); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); @@ -3720,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(READ_ONCE(p->__state) & state)) + if (!ttwu_state_match(p, state, &success)) goto unlock; trace_sched_waking(p); - /* We're going to change ->state: */ - success = 1; - /* * Ensure we load p->on_rq _after_ p->state, otherwise it would * be possible to, falsely, observe p->on_rq == 0 and get stuck @@ -4099,7 +4356,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); - p->prio = p->normal_prio = __normal_prio(p); + p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); /* @@ -5662,11 +5919,9 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src, p, 0); set_task_cpu(p, this); activate_task(dst, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; resched_curr(dst); @@ -5738,35 +5993,109 @@ void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } -static inline void sched_core_cpu_starting(unsigned int cpu) +static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); - struct rq *rq, *core_rq = NULL; - int i; + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; - core_rq = cpu_rq(cpu)->core; + sched_core_lock(cpu, &flags); + + WARN_ON_ONCE(rq->core != rq); + + /* if we're the first, we'll be our own leader */ + if (cpumask_weight(smt_mask) == 1) + goto unlock; - if (!core_rq) { - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); - if (rq->core && rq->core == rq) - core_rq = rq; + /* find the leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + rq = cpu_rq(t); + if (rq->core == rq) { + core_rq = rq; + break; } + } - if (!core_rq) - core_rq = cpu_rq(cpu); + if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ + goto unlock; - for_each_cpu(i, smt_mask) { - rq = cpu_rq(i); + /* install and validate core_rq */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); - WARN_ON_ONCE(rq->core && rq->core != core_rq); + if (t == cpu) rq->core = core_rq; - } + + WARN_ON_ONCE(rq->core != core_rq); } + +unlock: + sched_core_unlock(cpu, &flags); } + +static void sched_core_cpu_deactivate(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq = cpu_rq(cpu), *core_rq = NULL; + unsigned long flags; + int t; + + sched_core_lock(cpu, &flags); + + /* if we're the last man standing, nothing to do */ + if (cpumask_weight(smt_mask) == 1) { + WARN_ON_ONCE(rq->core != rq); + goto unlock; + } + + /* if we're not the leader, nothing to do */ + if (rq->core != rq) + goto unlock; + + /* find a new leader */ + for_each_cpu(t, smt_mask) { + if (t == cpu) + continue; + core_rq = cpu_rq(t); + break; + } + + if (WARN_ON_ONCE(!core_rq)) /* impossible */ + goto unlock; + + /* copy the shared state to the new leader */ + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle = rq->core_forceidle; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + + /* install new leader */ + for_each_cpu(t, smt_mask) { + rq = cpu_rq(t); + rq->core = core_rq; + } + +unlock: + sched_core_unlock(cpu, &flags); +} + +static inline void sched_core_cpu_dying(unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->core != rq) + rq->core = rq; +} + #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) {} +static inline void sched_core_cpu_deactivate(unsigned int cpu) {} +static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) @@ -5777,6 +6106,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SCHED_CORE */ /* + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a + * preemption from blocking on an 'sleeping' spin/rwlock. Note that + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to + * optimize the AND operation out and just check for zero. + */ +#define SM_NONE 0x0 +#define SM_PREEMPT 0x1 +#define SM_RTLOCK_WAIT 0x2 + +#ifndef CONFIG_PREEMPT_RT +# define SM_MASK_PREEMPT (~0U) +#else +# define SM_MASK_PREEMPT SM_PREEMPT +#endif + +/* * __schedule() is the main scheduler function. * * The main means of driving the scheduler and thus entering this function are: @@ -5815,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(unsigned int sched_mode) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -5828,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, preempt); + schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(preempt); + rcu_note_context_switch(!!sched_mode); /* * Make sure that signal_pending_state()->signal_pending() below @@ -5868,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = READ_ONCE(prev->__state); - if (!preempt && prev_state) { + if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { @@ -5934,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt) migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(preempt, prev, next); + trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -5955,7 +6302,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(SM_NONE); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -6016,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(SM_NONE); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -6044,7 +6391,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(false); + __schedule(SM_NONE); } while (need_resched()); } @@ -6079,6 +6426,18 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +#ifdef CONFIG_PREEMPT_RT +void __sched notrace schedule_rtlock(void) +{ + do { + preempt_disable(); + __schedule(SM_RTLOCK_WAIT); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(schedule_rtlock); +#endif + static void __sched notrace preempt_schedule_common(void) { do { @@ -6097,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -6176,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(SM_PREEMPT); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -6325,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(SM_PREEMPT); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -6341,6 +6700,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag } EXPORT_SYMBOL(default_wake_function); +static void __setscheduler_prio(struct task_struct *p, int prio) +{ + if (dl_prio(prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + p->prio = prio; +} + #ifdef CONFIG_RT_MUTEXES static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) @@ -6456,22 +6827,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) } else { p->dl.pi_se = &p->dl; } - p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; - p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; - p->sched_class = &fair_sched_class; } - p->prio = prio; + __setscheduler_prio(p, prio); if (queued) enqueue_task(rq, p, queue_flag); @@ -6824,35 +7192,6 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } -/* Actually do priority change: must hold pi & rq lock. */ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr, bool keep_boost) -{ - /* - * If params can't change scheduling class changes aren't allowed - * either. - */ - if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) - return; - - __setscheduler_params(p, attr); - - /* - * Keep a potential priority boosting if called from - * sched_setscheduler(). - */ - p->prio = normal_prio(p); - if (keep_boost) - p->prio = rt_effective_prio(p, p->prio); - - if (dl_prio(p->prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(p->prio)) - p->sched_class = &rt_sched_class; - else - p->sched_class = &fair_sched_class; -} - /* * Check the target process has a UID that matches the current process's: */ @@ -6873,10 +7212,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi) { - int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : - MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, queued, running; - int new_effective_prio, policy = attr->sched_policy; + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; struct callback_head *head; struct rq_flags rf; @@ -7074,6 +7411,7 @@ change: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice); if (pi) { /* * Take priority boosted tasks into account. If the new @@ -7082,8 +7420,8 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_effective_prio(p, newprio); - if (new_effective_prio == oldprio) + newprio = rt_effective_prio(p, newprio); + if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -7096,7 +7434,10 @@ change: prev_class = p->sched_class; - __setscheduler(rq, p, attr, pi); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } __setscheduler_uclamp(p, attr); if (queued) { @@ -7320,6 +7661,16 @@ err_size: return -E2BIG; } +static void get_params(struct task_struct *p, struct sched_attr *attr) +{ + if (task_has_dl_policy(p)) + __getparam_dl(p, attr); + else if (task_has_rt_policy(p)) + attr->sched_priority = p->rt_priority; + else + attr->sched_nice = task_nice(p); +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -7381,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, rcu_read_unlock(); if (likely(p)) { + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); retval = sched_setattr(p, &attr); put_task_struct(p); } @@ -7529,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - if (task_has_dl_policy(p)) - __getparam_dl(p, &kattr); - else if (task_has_rt_policy(p)) - kattr.sched_priority = p->rt_priority; - else - kattr.sched_nice = task_nice(p); + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK /* @@ -7555,9 +7904,76 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +#ifdef CONFIG_SMP +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) +{ + int ret = 0; + + /* + * If the task isn't a deadline task or admission control is + * disabled then we don't care about affinity changes. + */ + if (!task_has_dl_policy(p) || !dl_bandwidth_enabled()) + return 0; + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, mask)) + ret = -EBUSY; + rcu_read_unlock(); + return ret; +} +#endif + +static int +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) { + int retval; cpumask_var_t cpus_allowed, new_mask; + + if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) + return -ENOMEM; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + retval = -ENOMEM; + goto out_free_cpus_allowed; + } + + cpuset_cpus_allowed(p, cpus_allowed); + cpumask_and(new_mask, mask, cpus_allowed); + + retval = dl_task_check_affinity(p, new_mask); + if (retval) + goto out_free_new_mask; +again: + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + if (retval) + goto out_free_new_mask; + + cpuset_cpus_allowed(p, cpus_allowed); + if (!cpumask_subset(new_mask, cpus_allowed)) { + /* + * We must have raced with a concurrent cpuset update. + * Just reset the cpumask to the cpuset's cpus_allowed. + */ + cpumask_copy(new_mask, cpus_allowed); + goto again; + } + +out_free_new_mask: + free_cpumask_var(new_mask); +out_free_cpus_allowed: + free_cpumask_var(cpus_allowed); + return retval; +} + +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ struct task_struct *p; int retval; @@ -7577,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = -EINVAL; goto out_put_task; } - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_put_task; - } - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { - retval = -ENOMEM; - goto out_free_cpus_allowed; - } - retval = -EPERM; + if (!check_same_owner(p)) { rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_free_new_mask; + retval = -EPERM; + goto out_put_task; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_free_new_mask; - - - cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, in_mask, cpus_allowed); - - /* - * Since bandwidth control happens on root_domain basis, - * if admission test is enabled, we only admit -deadline - * tasks allowed to run on all the CPUs in the task's - * root_domain. - */ -#ifdef CONFIG_SMP - if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { - rcu_read_lock(); - if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { - retval = -EBUSY; - rcu_read_unlock(); - goto out_free_new_mask; - } - rcu_read_unlock(); - } -#endif -again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + goto out_put_task; - if (!retval) { - cpuset_cpus_allowed(p, cpus_allowed); - if (!cpumask_subset(new_mask, cpus_allowed)) { - /* - * We must have raced with a concurrent cpuset - * update. Just reset the cpus_allowed to the - * cpuset's cpus_allowed - */ - cpumask_copy(new_mask, cpus_allowed); - goto again; - } - } -out_free_new_mask: - free_cpumask_var(new_mask); -out_free_cpus_allowed: - free_cpumask_var(cpus_allowed); + retval = __sched_setaffinity(p, in_mask); out_put_task: put_task_struct(p); return retval; @@ -7781,6 +8151,17 @@ int __sched __cond_resched(void) preempt_schedule_common(); return 1; } + /* + * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, + * RCU readers leave no in-memory hints, which means that CPU-bound + * processes executing in kernel context might never report an + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. + */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); #endif @@ -8727,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); + + sched_core_cpu_deactivate(cpu); #endif if (!sched_smp_initialized) @@ -8831,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); + sched_core_cpu_dying(cpu); return 0; } #endif @@ -9042,7 +9426,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE - rq->core = NULL; + rq->core = rq; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; @@ -9824,7 +10208,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - get_online_cpus(); + cpus_read_lock(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -9868,7 +10252,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); - put_online_cpus(); + cpus_read_unlock(); return ret; } @@ -10119,6 +10503,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return css_tg(css)->idle; +} + +static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 idle) +{ + return sched_group_set_idle(css_tg(css), idle); +} +#endif + static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { @@ -10126,6 +10524,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, + { + .name = "idle", + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { @@ -10333,6 +10736,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, }, + { + .name = "idle", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, #endif #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 57124614363d..e7af18857371 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -537,9 +537,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -639,12 +647,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -707,7 +713,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -734,7 +740,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaacd6cfd42f..e94314633b39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_rq_lock(rq); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* @@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0c5ec2776ddf..49716228efb4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void) { int cpu, i; + /* + * This can unfortunately be invoked before sched_debug_init() creates + * the debug directory. Don't touch sd_sysctl_cpus until then. + */ + if (!debugfs_sched) + return; + if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; @@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", + cfs_rq->idle_h_nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 44c452072a1b..ff69f245b939 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } +static int tg_is_idle(struct task_group *tg) +{ + return tg->idle > 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return cfs_rq->idle > 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + if (entity_is_task(se)) + return task_has_idle_policy(task_of(se)); + return cfs_rq_is_idle(group_cfs_rq(se)); +} + #else /* !CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } +static inline int tg_is_idle(struct task_group *tg) +{ + return 0; +} + +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static int se_is_idle(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline @@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu) if (cpu == sibling) continue; - if (!idle_cpu(cpu)) + if (!idle_cpu(sibling)) return false; } #endif @@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; @@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) update_load_avg(qcfs_rq, se, 0); se_update_runnable(se); + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; } @@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + if (se->on_rq) break; - cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; } for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(qcfs_rq, se, UPDATE_TG); se_update_runnable(se); - cfs_rq->h_nr_running += task_delta; - cfs_rq->idle_h_nr_running += idle_task_delta; + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; + qcfs_rq->h_nr_running += task_delta; + qcfs_rq->idle_h_nr_running += idle_task_delta; /* end evaluation on encountering a throttled cfs_rq */ - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; /* * One parent has been throttled and cfs_rq removed from the * list. Add it back to not break the leaf list. */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); + if (throttled_hierarchy(qcfs_rq)) + list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -4949,9 +4993,9 @@ unthrottle_throttle: * assertion below. */ for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (list_add_leaf_cfs_rq(qcfs_rq)) break; } @@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; @@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + if (cfs_rq_is_idle(cfs_rq)) + idle_h_nr_running = 1; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto dequeue_throttle; @@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); } - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits) @@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && @@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - - if (want_affine) - current->recent_used_cpu = cpu; } rcu_read_unlock(); @@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) static void set_last_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->last = se; } } static void set_next_buddy(struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { if (SCHED_WARN_ON(!se->on_rq)) return; + if (se_is_idle(se)) + return; cfs_rq_of(se)->next = se; } } @@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; + int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) return; @@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - update_curr(cfs_rq_of(se)); BUG_ON(!pse); + + cse_is_idle = se_is_idle(se); + pse_is_idle = se_is_idle(pse); + + /* + * Preempt an idle group in favor of a non-idle group (and don't preempt + * in the inverse case). + */ + if (cse_is_idle && !pse_is_idle) + goto preempt; + if (cse_is_idle != pse_is_idle) + return; + + update_curr(cfs_rq_of(se)); if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb; + const struct cpumask *hk_mask; + + hk_mask = housekeeping_cpumask(HK_FLAG_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, - housekeeping_cpumask(HK_FLAG_MISC)) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { if (ilb == smp_processor_id()) continue; @@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, static DEFINE_MUTEX(shares_mutex); -int sched_group_set_shares(struct task_group *tg, unsigned long shares) +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + lockdep_assert_held(&shares_mutex); + /* * We can't change the weight of the root cgroup. */ @@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - mutex_lock(&shares_mutex); if (tg->shares == shares) - goto done; + return 0; tg->shares = shares; for_each_possible_cpu(i) { @@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) rq_unlock_irqrestore(rq, &rf); } -done: + return 0; +} + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int ret; + + mutex_lock(&shares_mutex); + if (tg_is_idle(tg)) + ret = -EINVAL; + else + ret = __sched_group_set_shares(tg, shares); + mutex_unlock(&shares_mutex); + + return ret; +} + +int sched_group_set_idle(struct task_group *tg, long idle) +{ + int i; + + if (tg == &root_task_group) + return -EINVAL; + + if (idle < 0 || idle > 1) + return -EINVAL; + + mutex_lock(&shares_mutex); + + if (tg->idle == idle) { + mutex_unlock(&shares_mutex); + return 0; + } + + tg->idle = idle; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se = tg->se[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + bool was_idle = cfs_rq_is_idle(grp_cfs_rq); + long idle_task_delta; + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + + grp_cfs_rq->idle = idle; + if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) + goto next_cpu; + + idle_task_delta = grp_cfs_rq->h_nr_running - + grp_cfs_rq->idle_h_nr_running; + if (!cfs_rq_is_idle(grp_cfs_rq)) + idle_task_delta *= -1; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!se->on_rq) + break; + + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* Already accounted at parent level and above. */ + if (cfs_rq_is_idle(cfs_rq)) + break; + } + +next_cpu: + rq_unlock_irqrestore(rq, &rf); + } + + /* Idle groups have minimum weight. */ + if (tg_is_idle(tg)) + __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO)); + else + __sched_group_set_shares(tg, NICE_0_LOAD); + mutex_unlock(&shares_mutex); return 0; } + #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14a41a243f7b..3d3e5793e117 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL @@ -394,6 +396,9 @@ struct task_group { struct cfs_rq **cfs_rq; unsigned long shares; + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; + #ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put @@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +extern int sched_group_set_idle(struct task_group *tg, long idle); + #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); @@ -599,6 +606,9 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + /* Locally cached copy of our task_group's idle value */ + int idle; + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -1093,7 +1103,7 @@ struct rq { unsigned int core_sched_seq; struct rb_root core_tree; - /* shared state */ + /* shared state -- careful with sched_core_cpu_deactivate() */ unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; @@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 #define SCA_MIGRATE_ENABLE 0x04 +#define SCA_USER 0x08 #ifdef CONFIG_SMP @@ -2255,6 +2266,9 @@ static inline struct task_struct *get_push_task(struct rq *rq) if (p->nr_cpus_allowed == 1) return NULL; + if (p->migration_disabled) + return NULL; + rq->push_busy = true; return get_task_struct(p); } @@ -2385,6 +2399,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; + +extern unsigned int sysctl_sched_tunable_scaling; + +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; +#endif + #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..4e8698e62f07 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1482,6 +1482,8 @@ int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1833,6 +1835,16 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + /* + * Distance information can be unreliable for + * offline nodes, defer building the node + * masks to its bringup. + * This relies on all unique distance values + * still being visible at init time. + */ + if (!node_online(j)) + continue; + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1898,53 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); + if (!sched_numa_onlined_nodes) + return; + + bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); + for_each_online_node(i) + bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ + int i, j; + + /* + * NUMA masks are not built for offline nodes in sched_init_numa(). + * Thus, when a CPU of a never-onlined-before node gets plugged in, + * adding that new CPU to the right NUMA masks is not sufficient: the + * masks of that CPU's node must also be updated. + */ + if (test_bit(node, sched_numa_onlined_nodes)) + return; + + bitmap_set(sched_numa_onlined_nodes, node, 1); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j) || node == j) + continue; + + if (node_distance(j, node) > sched_domains_numa_distance[i]) + continue; + + /* Add remote nodes in our masks */ + cpumask_or(sched_domains_numa_masks[i][node], + sched_domains_numa_masks[i][node], + sched_domains_numa_masks[0][j]); + } + } + + /* + * A new node has been brought up, potentially changing the topology + * classification. + * + * Note that this is racy vs any use of sched_numa_topology_type :/ + */ + init_numa_topology_type(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; + __sched_domains_numa_masks_set(node); + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } |