diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile | 2 | ||||
-rw-r--r-- | kernel/sched/auto_group.c | 11 | ||||
-rw-r--r-- | kernel/sched/clock.c | 15 | ||||
-rw-r--r-- | kernel/sched/completion.c | 36 | ||||
-rw-r--r-- | kernel/sched/core.c | 1147 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 29 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.h | 5 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 3 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 64 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 364 | ||||
-rw-r--r-- | kernel/sched/debug.c | 37 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1530 | ||||
-rw-r--r-- | kernel/sched/features.h | 13 | ||||
-rw-r--r-- | kernel/sched/idle.c | 85 | ||||
-rw-r--r-- | kernel/sched/idle_task.c | 7 | ||||
-rw-r--r-- | kernel/sched/proc.c | 7 | ||||
-rw-r--r-- | kernel/sched/rt.c | 279 | ||||
-rw-r--r-- | kernel/sched/sched.h | 303 | ||||
-rw-r--r-- | kernel/sched/stats.c | 11 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 7 | ||||
-rw-r--r-- | kernel/sched/wait.c | 132 |
22 files changed, 2811 insertions, 1278 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ab32b7b0db5c..46be87024875 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -1,5 +1,5 @@ ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_clock.o = -pg +CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..eae160dd669d 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void) * so we don't have to move tasks around upon policy change, * or flail around trying to allocate bandwidth on the fly. * A bandwidth exception in __sched_setscheduler() allows - * the policy change to proceed. Thereafter, task_group() - * returns &root_task_group, so zero bandwidth is required. + * the policy change to proceed. */ free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; @@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) if (tg != &root_task_group) return false; - if (p->sched_class != &fair_sched_class) - return false; - /* * We can only assume the task group can't go away on us if * autogroup_move_group() can see us on ->thread_group list. @@ -148,11 +144,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); static inline struct sched_clock_data *this_scd(void) { - return &__get_cpu_var(sched_clock_data); + return this_cpu_ptr(&sched_clock_data); } static inline struct sched_clock_data *cpu_sdc(int cpu) @@ -420,3 +420,16 @@ u64 local_clock(void) EXPORT_SYMBOL_GPL(cpu_clock); EXPORT_SYMBOL_GPL(local_clock); + +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +u64 __weak running_clock(void) +{ + return local_clock(); +} diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index a63f4dc27909..8d0f35debf35 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout); * * This waits to be signaled for completion of a specific task. It is NOT * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. + * for IO (which traditionally means blkio only). */ void __sched wait_for_completion_io(struct completion *x) { @@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io); * * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. + * interruptible. The caller is accounted as waiting for IO (which traditionally + * means blkio only). * * Return: 0 if timed out, and positive (at least 1, or number of jiffies left * till timeout) if completed. @@ -267,6 +268,15 @@ bool try_wait_for_completion(struct completion *x) unsigned long flags; int ret = 1; + /* + * Since x->done will need to be locked only + * in the non-blocking case, we check x->done + * first without taking the lock so we can + * return early in the blocking case. + */ + if (!READ_ONCE(x->done)) + return 0; + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; @@ -287,13 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { - unsigned long flags; - int ret = 1; + if (!READ_ONCE(x->done)) + return false; - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; + /* + * If ->done, we need to wait for complete() to release ->wait.lock + * otherwise we can end up freeing the completion before complete() + * is done referencing it. + * + * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders + * the loads of ->done and ->wait.lock such that we cannot observe + * the lock before complete() acquires it while observing the ->done + * after it's acquired the lock. + */ + smp_rmb(); + spin_unlock_wait(&x->wait.lock); + return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b33449..f9123a82cbb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,22 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#ifdef smp_mb__before_atomic -void __smp_mb__before_atomic(void) -{ - smp_mb__before_atomic(); -} -EXPORT_SYMBOL(__smp_mb__before_atomic); -#endif - -#ifdef smp_mb__after_atomic -void __smp_mb__after_atomic(void) -{ - smp_mb__after_atomic(); -} -EXPORT_SYMBOL(__smp_mb__after_atomic); -#endif - void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -135,10 +119,14 @@ void update_rq_clock(struct rq *rq) { s64 delta; - if (rq->skip_clock_update > 0) + lockdep_assert_held(&rq->lock); + + if (rq->clock_skip_update & RQCF_ACT_SKIP) return; delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + if (delta < 0) + return; rq->clock += delta; update_rq_clock_task(rq, delta); } @@ -243,6 +231,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, char buf[64]; char *cmp; int i; + struct inode *inode; if (cnt > 63) cnt = 63; @@ -253,7 +242,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; cmp = strstrip(buf); + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + mutex_lock(&inode->i_mutex); i = sched_feat_set(cmp); + mutex_unlock(&inode->i_mutex); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -313,59 +306,8 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * __task_rq_lock - lock the rq @p resides on. - */ -static inline struct rq *__task_rq_lock(struct task_struct *p) - __acquires(rq->lock) -{ - struct rq *rq; - - lockdep_assert_held(&p->pi_lock); - - for (;;) { - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - } -} - -/* - * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. - */ -static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) - __acquires(rq->lock) -{ - struct rq *rq; - - for (;;) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) - return rq; - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); - } -} - -static void __task_rq_unlock(struct rq *rq) - __releases(rq->lock) -{ - raw_spin_unlock(&rq->lock); -} - -static inline void -task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) - __releases(rq->lock) - __releases(p->pi_lock) -{ - raw_spin_unlock(&rq->lock); - raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -} +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; /* * this_rq_lock - lock this runqueue and disable interrupts. @@ -442,7 +384,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); @@ -485,6 +435,11 @@ static __init void init_hrtick(void) */ void hrtick_start(struct rq *rq, u64 delay) { + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + delay = max_t(u64, delay, 10000LL); __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); } @@ -587,30 +542,31 @@ static bool set_nr_if_polling(struct task_struct *p) #endif /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq) { + struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&task_rq(p)->lock); + lockdep_assert_held(&rq->lock); - if (test_tsk_need_resched(p)) + if (test_tsk_need_resched(curr)) return; - cpu = task_cpu(p); + cpu = cpu_of(rq); if (cpu == smp_processor_id()) { - set_tsk_need_resched(p); + set_tsk_need_resched(curr); set_preempt_need_resched(); return; } - if (set_nr_and_not_polling(p)) + if (set_nr_and_not_polling(curr)) smp_send_reschedule(cpu); else trace_sched_wake_idle_without_ipi(cpu); @@ -623,7 +579,7 @@ void resched_cpu(int cpu) if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; - resched_task(cpu_curr(cpu)); + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -684,10 +640,16 @@ static void wake_up_idle_cpu(int cpu) static bool wake_up_full_nohz_cpu(int cpu) { + /* + * We just need the target to call irq_exit() and re-evaluate + * the next tick. The nohz full kick at least implies that. + * If needed we can still optimize that later with an + * empty IRQ. + */ if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) - smp_send_reschedule(cpu); + tick_nohz_full_kick_cpu(cpu); return true; } @@ -730,18 +692,32 @@ static inline bool got_nohz_idle_kick(void) #ifdef CONFIG_NO_HZ_FULL bool sched_can_stop_tick(void) { - struct rq *rq; + /* + * FIFO realtime policy runs the highest priority task. Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; - rq = this_rq(); + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = ¤t->rt; - /* Make sure rq->nr_running update is visible after the IPI */ - smp_rmb(); + return rt_se->run_list.prev == rt_se->run_list.next; + } - /* More than one running task need preemption */ - if (rq->nr_running > 1) - return false; + /* + * More than one running task need preemption. + * nr_running update is assumed to be visible + * after IPI is sent from wakers. + */ + if (this_rq()->nr_running > 1) + return false; - return true; + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -999,6 +975,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1006,6 +985,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); @@ -1022,7 +1002,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) if (class == rq->curr->sched_class) break; if (class == p->sched_class) { - resched_task(rq->curr); + resched_curr(rq); break; } } @@ -1032,8 +1012,15 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) - rq->skip_clock_update = 1; + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) + rq_clock_skip_update(rq, true); +} + +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); } #ifdef CONFIG_SMP @@ -1045,7 +1032,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* @@ -1066,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); @@ -1077,7 +1072,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1203,7 +1198,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1241,7 +1236,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1273,7 +1268,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1398,7 +1393,8 @@ out: static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + if (p->nr_cpus_allowed > 1) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1467,7 +1463,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1526,7 +1522,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1568,9 +1564,7 @@ void scheduler_ipi(void) */ preempt_fold_need_resched(); - if (llist_empty(&this_rq()->wake_list) - && !tick_nohz_full_cpu(smp_processor_id()) - && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) return; /* @@ -1587,7 +1581,6 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); - tick_nohz_full_check(); sched_ttwu_pending(); /* @@ -1612,6 +1605,30 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +out: + rcu_read_unlock(); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); @@ -1734,7 +1751,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -1768,6 +1785,24 @@ int wake_up_state(struct task_struct *p, unsigned int state) } /* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; + + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; + dl_se->dl_yielded = 0; +} + +/* * Perform scheduler related setup for a newly forked process p. * p is forked by current. * @@ -1783,6 +1818,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SMP + p->se.avg.decay_count = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS @@ -1790,11 +1828,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif RB_CLEAR_NODE(&p->dl.rb_node); - hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + init_dl_task_timer(&p->dl); + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); @@ -1817,12 +1852,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; + p->numa_faults = NULL; p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; - INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1969,6 +2002,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -1977,6 +2012,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -1994,25 +2031,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ -2020,6 +2038,9 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) * allocated bandwidth to reflect the new situation. * * This function is called while holding p's rq->lock. + * + * XXX we should delay bw change until the task's 0-lag point, see + * __setparam_dl(). */ static int dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) @@ -2087,7 +2108,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2180,7 +2201,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2192,10 +2212,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2235,6 +2261,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2269,29 +2296,22 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); - - finish_task_switch(rq, prev); + struct rq *rq; - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = finish_task_switch(prev); post_schedule(rq); - -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); -#endif + if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2325,21 +2345,14 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2358,6 +2371,18 @@ unsigned long nr_running(void) return sum; } +/* + * Check if only the current task is running on the cpu. + */ +bool single_task_running(void) +{ + if (cpu_rq(smp_processor_id())->nr_running == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + unsigned long long nr_context_switches(void) { int i; @@ -2385,6 +2410,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* @@ -2422,39 +2454,6 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* - * Return any ns on the sched_clock that have not yet been accounted in - * @p in case that task is currently running. - * - * Called with task_rq_lock() held on @rq. - */ -static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -{ - u64 ns = 0; - - if (task_current(rq, p)) { - update_rq_clock(rq); - ns = rq_clock_task(rq) - p->se.exec_start; - if ((s64)ns < 0) - ns = 0; - } - - return ns; -} - -unsigned long long task_delta_exec(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - u64 ns = 0; - - rq = task_rq_lock(p, &flags); - ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, p, &flags); - - return ns; -} - -/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -2463,7 +2462,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) { unsigned long flags; struct rq *rq; - u64 ns = 0; + u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* @@ -2474,13 +2473,24 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we race with it leaving cpu, we'll take a lock. So we're correct. * If we race with it entering cpu, unaccounted time is 0. This is * indistinguishable from the read occurring a few cycles earlier. + * If we see ->on_cpu without ->on_rq, the task is leaving, and has + * been accounted, so we're correct here as well. */ - if (!p->on_cpu) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + /* + * Must be ->curr _and_ ->on_rq. If dequeued, we would + * project cycles that may never be accounted to this + * thread, breaking clock_gettime(). + */ + if (task_current(rq, p) && task_on_rq_queued(p)) { + update_rq_clock(rq); + p->sched_class->update_curr(rq); + } + ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &flags); return ns; @@ -2638,6 +2648,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. Otherwise whine @@ -2727,6 +2740,10 @@ again: * - explicit schedule() call * - return from syscall or exception to user-space * - return from interrupt-handler to user-space + * + * WARNING: all callers must re-check need_resched() afterward and reschedule + * accordingly in case an event triggered the need for rescheduling (such as + * an interrupt waking up a task) while preemption was disabled in __schedule(). */ static void __sched __schedule(void) { @@ -2735,11 +2752,10 @@ static void __sched __schedule(void) struct rq *rq; int cpu; -need_resched: preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(cpu); + rcu_note_context_switch(); prev = rq->curr; schedule_debug(prev); @@ -2755,6 +2771,8 @@ need_resched: smp_mb__before_spinlock(); raw_spin_lock_irq(&rq->lock); + rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely(signal_pending_state(prev->state, prev))) { @@ -2779,36 +2797,27 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev)) update_rq_clock(rq); next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->skip_clock_update = 0; + rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. - */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); post_schedule(rq); sched_preempt_enable_no_resched(); - if (need_resched()) - goto need_resched; } static inline void sched_submit_work(struct task_struct *tsk) @@ -2828,7 +2837,9 @@ asmlinkage __visible void __sched schedule(void) struct task_struct *tsk = current; sched_submit_work(tsk); - __schedule(); + do { + __schedule(); + } while (need_resched()); } EXPORT_SYMBOL(schedule); @@ -2840,10 +2851,14 @@ asmlinkage __visible void __sched schedule_user(void) * or we have been woken up remotely but the IPI has not yet arrived, * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. + * + * NB: There are buggy callers of this function. Ideally we + * should warn if prev_state != CONTEXT_USER, but that will trigger + * too frequently to make sense yet. */ - user_exit(); + enum ctx_state prev_state = exception_enter(); schedule(); - user_enter(); + exception_exit(prev_state); } #endif @@ -2859,6 +2874,21 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } +static void __sched notrace preempt_schedule_common(void) +{ + do { + __preempt_count_add(PREEMPT_ACTIVE); + __schedule(); + __preempt_count_sub(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} + #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -2874,20 +2904,51 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; + preempt_schedule_common(); +} +NOKPROBE_SYMBOL(preempt_schedule); +EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + do { __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); - /* - * Check again in case we missed a preemption opportunity - * between schedule and now. + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); barrier(); } while (need_resched()); } -NOKPROBE_SYMBOL(preempt_schedule); -EXPORT_SYMBOL(preempt_schedule); +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* @@ -2944,7 +3005,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2971,15 +3032,14 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); - p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -2991,8 +3051,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || (p->pi_top_task && - dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + if (!dl_prio(p->normal_prio) || + (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; p->dl.dl_throttled = 0; enqueue_flag = ENQUEUE_REPLENISH; @@ -3008,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } @@ -3015,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3026,7 +3089,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3047,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); p->static_prio = NICE_TO_PRIO(nice); @@ -3057,14 +3120,14 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ if (delta < 0 || (delta > 0 && task_running(rq, p))) - resched_task(rq->curr); + resched_curr(rq); } out_unlock: task_rq_unlock(rq, p, &flags); @@ -3192,23 +3255,45 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) { struct sched_dl_entity *dl_se = &p->dl; - init_dl_task_timer(dl_se); dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); - dl_se->dl_throttled = 0; - dl_se->dl_new = 1; - dl_se->dl_yielded = 0; + + /* + * Changing the parameters of a task is 'tricky' and we're not doing + * the correct thing -- also see task_dead_dl() and switched_from_dl(). + * + * What we SHOULD do is delay the bandwidth release until the 0-lag + * point. This would include retaining the task_struct until that time + * and change dl_overflow() to not immediately decrement the current + * amount. + * + * Instead we retain the current runtime/deadline and let the new + * parameters take effect after the current reservation period lapses. + * This is safe (albeit pessimistic) because the 0-lag point is always + * before the current scheduling deadline. + * + * We can still have temporary overloads because we do not delay the + * change in bandwidth until that time; so admission control is + * not on the safe side. It does however guarantee tasks will never + * consume more than promised. + */ } +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY -1 + static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) { int policy = attr->sched_policy; - if (policy == -1) /* setparam */ + if (policy == SETPARAM_POLICY) policy = p->policy; p->policy = policy; @@ -3317,13 +3402,27 @@ static bool check_same_owner(struct task_struct *p) return match; } +static bool dl_param_changed(struct task_struct *p, + const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + if (dl_se->dl_runtime != attr->sched_runtime || + dl_se->dl_deadline != attr->sched_deadline || + dl_se->dl_period != attr->sched_period || + dl_se->flags != attr->sched_flags) + return true; + + return false; +} + static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3445,7 +3544,7 @@ recheck: goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; - if (dl_policy(policy)) + if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; p->sched_reset_on_fork = reset_on_fork; @@ -3520,19 +3619,19 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). @@ -3557,10 +3656,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, .sched_nice = PRIO_TO_NICE(p->static_prio), }; - /* - * Fixup the legacy SCHED_RESET_ON_FORK hack - */ - if (policy & SCHED_RESET_ON_FORK) { + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; @@ -3730,7 +3827,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, */ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) { - return do_sched_setscheduler(pid, -1, param); + return do_sched_setscheduler(pid, SETPARAM_POLICY, param); } /** @@ -3958,14 +4055,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -3978,13 +4075,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. */ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; - goto out_unlock; + rcu_read_unlock(); + goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: @@ -4002,7 +4100,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); @@ -4138,17 +4236,10 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static void __cond_resched(void) -{ - __preempt_count_add(PREEMPT_ACTIVE); - __schedule(); - __preempt_count_sub(PREEMPT_ACTIVE); -} - int __sched _cond_resched(void) { if (should_resched()) { - __cond_resched(); + preempt_schedule_common(); return 1; } return 0; @@ -4173,7 +4264,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) - __cond_resched(); + preempt_schedule_common(); else cpu_relax(); ret = 1; @@ -4189,7 +4280,7 @@ int __sched __cond_resched_softirq(void) if (should_resched()) { local_bh_enable(); - __cond_resched(); + preempt_schedule_common(); local_bh_disable(); return 1; } @@ -4285,7 +4376,7 @@ again: * fairness. */ if (preempt && rq != p_rq) - resched_task(p_rq->curr); + resched_curr(p_rq); } out_unlock: @@ -4304,36 +4395,29 @@ EXPORT_SYMBOL_GPL(yield_to); * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ -void __sched io_schedule(void) -{ - struct rq *rq = raw_rq(); - - delayacct_blkio_start(); - atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; - schedule(); - current->in_iowait = 0; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); -} -EXPORT_SYMBOL(io_schedule); - long __sched io_schedule_timeout(long timeout) { - struct rq *rq = raw_rq(); + int old_iowait = current->in_iowait; + struct rq *rq; long ret; + current->in_iowait = 1; + if (old_iowait) + blk_schedule_flush_plug(current); + else + blk_flush_plug(current); + delayacct_blkio_start(); + rq = raw_rq(); atomic_inc(&rq->nr_iowait); - blk_flush_plug(current); - current->in_iowait = 1; ret = schedule_timeout(timeout); - current->in_iowait = 0; + current->in_iowait = old_iowait; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); + return ret; } +EXPORT_SYMBOL(io_schedule_timeout); /** * sys_sched_get_priority_max - return maximum RT priority. @@ -4444,9 +4528,10 @@ void sched_show_task(struct task_struct *p) { unsigned long free = 0; int ppid; - unsigned state; + unsigned long state = p->state; - state = p->state ? __ffs(p->state) + 1 : 0; + if (state) + state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 @@ -4463,8 +4548,10 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + ppid = 0; rcu_read_lock(); - ppid = task_pid_nr(rcu_dereference(p->real_parent)); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), ppid, @@ -4486,7 +4573,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4494,7 +4581,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -4549,7 +4636,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4569,10 +4656,115 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + if (!cpumask_weight(cur)) + return ret; + + rcu_read_lock_sched(); + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + rcu_read_unlock_sched(); + + return ret; +} + +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b; + bool overflow; + int cpus; + unsigned long flags; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(dest_cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. + */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - if (p->sched_class && p->sched_class->set_cpus_allowed) + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); @@ -4626,14 +4818,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4654,20 +4847,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4676,16 +4869,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->on_rq) { - dequeue_task(rq_src, p, 0); - set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } @@ -4717,22 +4906,22 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -4752,6 +4941,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; @@ -5160,31 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long flags; - long cpu = (long)hcpu; - switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active(cpu, false); - - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); - bool overflow; - int cpus; - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - if (overflow) - return notifier_from_errno(-EBUSY); - } + set_cpu_active((long)hcpu, false); return NOTIFY_OK; + default: + return NOTIFY_DONE; } - - return NOTIFY_DONE; } static int __init migration_init(void) @@ -5232,9 +5409,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; - char str[256]; - cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); cpumask_clear(groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -5247,7 +5422,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + printk(KERN_CONT "span %*pbl level %s\n", + cpumask_pr_args(sched_domain_span(sd)), sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { printk(KERN_ERR "ERROR: domain->span does not contain " @@ -5266,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5292,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); - cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - - printk(KERN_CONT " %s", str); + printk(KERN_CONT " %*pbl", + cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { printk(KERN_CONT " (cpu_capacity = %d)", group->sgc->capacity); @@ -5650,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { @@ -5720,7 +5881,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5731,10 +5892,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). */ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5744,10 +5905,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); @@ -5762,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5985,7 +6144,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6071,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ @@ -6157,7 +6319,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6172,6 +6334,56 @@ static bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. + */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6225,6 +6437,10 @@ static void sched_init_numa(void) if (!sched_debug()) break; } + + if (!level) + return; + /* * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). @@ -6304,6 +6520,9 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) @@ -6465,6 +6684,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; sd->child = child; + + if (!cpumask_subset(sched_domain_span(child), + sched_domain_span(sd))) { + pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" the %s domain not a subset of the %s domain\n", + child->name, sd->name); +#endif + /* Fixup, ensure @sd has at least @child cpus. */ + cpumask_or(sched_domain_span(sd), + sched_domain_span(sd), + sched_domain_span(child)); + } + } set_domain_attribute(sd, attr); @@ -6765,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, */ case CPU_ONLINE: - case CPU_DOWN_FAILED: cpuset_update_active_cpus(true); break; default: @@ -6777,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + unsigned long flags; + long cpu = (long)hcpu; + struct dl_bw *dl_b; + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (overflow) + return notifier_from_errno(-EBUSY); + } cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: @@ -6865,9 +7119,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif -#ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size += num_possible_cpus() * cpumask_size(); -#endif if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -6887,13 +7138,13 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ + } #ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (void *)ptr; - ptr += cpumask_size(); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ + for_each_possible_cpu(i) { + per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( + cpumask_size(), GFP_KERNEL, cpu_to_node(i)); } +#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6926,8 +7177,8 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); - init_dl_rq(&rq->dl, rq); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6967,7 +7218,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -7005,6 +7256,11 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current); /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, * but because we are the idle thread, we just pick up running again @@ -7014,11 +7270,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; - /* - * During early bootup we pretend to be a normal task: - */ - current->sched_class = &fair_sched_class; - #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); /* May be allocated at isolcpus cmdline parse time */ @@ -7042,6 +7293,24 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change); + + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ @@ -7061,6 +7330,9 @@ void __might_sleep(const char *file, int line, int preempt_offset) in_atomic(), irqs_disabled(), current->pid, current->comm); + if (task_stack_end_corrupted(current)) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); + debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -7073,7 +7345,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -7084,15 +7356,15 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); - resched_task(rq->curr); + resched_curr(rq); } check_class_changed(rq, p, prev_class, old_prio); @@ -7104,12 +7376,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7124,21 +7396,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + task_rq_unlock(rq, p, &flags); + } + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7278,36 +7545,40 @@ void sched_offline_group(struct task_group *tg) void sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgrp_id, - lockdep_is_held(&tsk->sighand->siglock)), + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); @@ -7325,10 +7596,16 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + /* + * Autogroups do not have RT tasks; see autogroup_create(). + */ + if (task_group_is_autogroup(tg)) + return 0; + + for_each_process_thread(g, p) { + if (rt_task(p) && task_group(p) == tg) return 1; - } while_each_thread(g, p); + } return 0; } @@ -7417,6 +7694,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg, { int i, err = 0; + /* + * Disallowing the root group RT runtime is BAD, it would disallow the + * kernel creating (and or operating) RT threads. + */ + if (tg == &root_task_group && rt_runtime == 0) + return -EINVAL; + + /* No period doesn't make any sense. */ + if (rt_period == 0) + return -EINVAL; + mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); err = __rt_schedulable(tg, rt_period, rt_runtime); @@ -7473,9 +7761,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) rt_period = (u64)rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; - if (rt_period == 0) - return -EINVAL; - return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -7532,11 +7817,12 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_constraints(void) +static int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; @@ -7550,13 +7836,16 @@ static int sched_dl_global_constraints(void) * solutions is welcome! */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } @@ -7567,6 +7856,7 @@ static int sched_dl_global_constraints(void) static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7580,11 +7870,14 @@ static void sched_dl_do_global(void) * FIXME: As above... */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } } @@ -7625,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, if (ret) goto undo; - ret = sched_rt_global_constraints(); + ret = sched_dl_global_validate(); if (ret) goto undo; - ret = sched_dl_global_constraints(); + ret = sched_rt_global_constraints(); if (ret) goto undo; @@ -7714,6 +8007,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -7803,6 +8101,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (period > max_cfs_quota_period) return -EINVAL; + /* + * Prevent race between setting of cfs_rq->runtime_enabled and + * unthrottle_offline_cfs_rqs(). + */ + get_online_cpus(); mutex_lock(&cfs_constraints_mutex); ret = __cfs_schedulable(tg, period, quota); if (ret) @@ -7828,7 +8131,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) } raw_spin_unlock_irq(&cfs_b->lock); - for_each_possible_cpu(i) { + for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; @@ -7844,6 +8147,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); + put_online_cpus(); return ret; } @@ -7959,7 +8263,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -7970,7 +8274,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = quota; return 0; } @@ -8080,10 +8384,11 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .base_cftypes = cpu_files, + .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .base_cftypes = files, + .legacy_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..c6acb07466bb 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && + cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && @@ -188,6 +187,26 @@ out: } /* + * cpudl_set_freecpu - Set the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_set_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_set_cpu(cpu, cp->free_cpus); +} + +/* + * cpudl_clear_freecpu - Clear the cpudl.free_cpus + * @cp: the cpudl max-heap context + * @cpu: rd attached cpu + */ +void cpudl_clear_freecpu(struct cpudl *cp, int cpu) +{ + cpumask_clear_cpu(cpu, cp->free_cpus); +} + +/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ @@ -205,7 +224,7 @@ int cpudl_init(struct cpudl *cp) if (!cp->elements) return -ENOMEM; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { kfree(cp->elements); return -ENOMEM; } @@ -213,8 +232,6 @@ int cpudl_init(struct cpudl *cp) for_each_possible_cpu(i) cp->elements[i].idx = IDX_INVALID; - cpumask_setall(cp->free_cpus); - return 0; } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 538c9796ad4a..1a0a6ef2fbe1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -24,10 +24,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); int cpudl_init(struct cpudl *cp); +void cpudl_set_freecpu(struct cpudl *cp, int cpu); +void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#else -#define cpudl_set(cp, cpu, dl) do { } while (0) -#define cpudl_init() do { } while (0) #endif /* CONFIG_SMP */ #endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 6b033347fdfd..63cbb9ca0496 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#else -#define cpupri_set(cp, cpu, pri) do { } while (0) -#define cpupri_init() do { } while (0) #endif #endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; + unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; + /* Attempt a lockless read on the first round. */ + nextseq = 0; do { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + seq = nextseq; + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } @@ -550,6 +555,23 @@ drop_precision: } /* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + +/* * Adjust tick based cputime random precision against scheduler * runtime accounting. */ @@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; @@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. - */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1258f..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) dl_b->total_bw = 0; } -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->rb_root = RB_ROOT; @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. + */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -306,7 +352,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, * the overrunning entity can't interfere with other entity in the system and * can't make them miss their deadlines. Reasons why this kind of overruns * could happen are, typically, a entity voluntarily trying to overcome its - * runtime, or it just underestimated it during sched_setscheduler_ex(). + * runtime, or it just underestimated it during sched_setattr(). */ static void replenish_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -350,6 +396,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } + + if (dl_se->dl_yielded) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; } /* @@ -506,47 +557,76 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct sched_dl_entity, dl_timer); struct task_struct *p = dl_task_of(dl_se); + unsigned long flags; struct rq *rq; -again: - rq = task_rq(p); - raw_spin_lock(&rq->lock); - if (rq != task_rq(p)) { - /* Task was moved, retrying. */ - raw_spin_unlock(&rq->lock); - goto again; - } + rq = task_rq_lock(p, &flags); /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); update_rq_clock(rq); - dl_se->dl_throttled = 0; - dl_se->dl_yielded = 0; - if (p->on_rq) { - enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); - else - resched_task(rq->curr); + #ifdef CONFIG_SMP - /* - * Queueing this task back might have overloaded rq, - * check if we need to kick someone away. - */ - if (has_pushable_dl_tasks(rq)) - push_dl_task(rq); + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } #endif + + /* + * If the throttle happened during sched-out; like: + * + * schedule() + * deactivate_task() + * dequeue_task_dl() + * update_curr_dl() + * start_dl_timer() + * __dequeue_task_dl() + * prev->on_rq = 0; + * + * We can be both throttled and !queued. Replenish the counter + * but do not enqueue -- wait for our wakeup to do that. + */ + if (!task_on_rq_queued(p)) { + replenish_dl_entity(dl_se, dl_se); + goto unlock; } + + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif unlock: - raw_spin_unlock(&rq->lock); + task_rq_unlock(rq, p, &flags); return HRTIMER_NORESTART; } @@ -555,11 +635,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -567,24 +642,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) static int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) { - int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); - int rorun = dl_se->runtime <= 0; - - if (!rorun && !dmiss) - return 0; - - /* - * If we are beyond our current deadline and we are still - * executing, then we have already used some of the runtime of - * the next instance. Thus, if we do not account that, we are - * stealing bandwidth from the system at each deadline miss! - */ - if (dmiss) { - dl_se->runtime = rorun ? dl_se->runtime : 0; - dl_se->runtime -= rq_clock(rq) - dl_se->deadline; - } - - return 1; + return (dl_se->runtime <= 0); } extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -625,16 +683,15 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= delta_exec; + dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(rq, dl_se)) { + dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); - if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) - dl_se->dl_throttled = 1; - else + if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) - resched_task(curr); + resched_curr(rq); } /* @@ -823,10 +880,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) - replenish_dl_entity(dl_se, pi_se); - else + if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); + else if (flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); __enqueue_dl_entity(dl_se); } @@ -847,8 +904,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted @@ -856,7 +924,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * its rq, the bandwidth timer callback (which clearly has not * run yet) will take care of this. */ - if (p->dl.dl_throttled) + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) return; enqueue_dl_entity(&p->dl, pi_se, flags); @@ -901,7 +969,14 @@ static void yield_task_dl(struct rq *rq) rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } + update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP @@ -914,7 +989,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); @@ -964,7 +1039,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) cpudl_find(&rq->rd->cpudl, p, NULL) != -1) return; - resched_task(rq->curr); + resched_curr(rq); } static int pull_dl_task(struct rq *this_rq); @@ -979,7 +1054,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -997,10 +1072,11 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); +} +#else /* !CONFIG_SCHED_HRTICK */ +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ } #endif @@ -1030,7 +1106,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. */ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1055,10 +1131,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) /* Running task will never be pushed. */ dequeue_pushable_dl_task(rq, p); -#ifdef CONFIG_SCHED_HRTICK if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); -#endif set_post_schedule(rq); @@ -1077,10 +1151,14 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); -#ifdef CONFIG_SCHED_HRTICK - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + /* + * Even when we have runtime, update_curr_dl() might have resulted in us + * not being the leftmost task anymore. In that case NEED_RESCHED will + * be set and schedule() will start a new hrtick for the next task. + */ + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); -#endif } static void task_fork_dl(struct task_struct *p) @@ -1100,6 +1178,7 @@ static void task_dead_dl(struct task_struct *p) * Since we are TASK_DEAD we won't slip out of the domain! */ raw_spin_lock_irq(&dl_b->lock); + /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); @@ -1124,10 +1203,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } @@ -1158,7 +1235,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); int best_cpu, cpu = task_cpu(task); @@ -1169,6 +1246,10 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. + */ best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) @@ -1257,7 +1338,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1378,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1311,6 +1393,7 @@ static int push_dl_task(struct rq *rq) { struct task_struct *next_task; struct rq *later_rq; + int ret = 0; if (!rq->dl.overloaded) return 0; @@ -1333,7 +1416,7 @@ retry: if (dl_task(rq->curr) && dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && rq->curr->nr_cpus_allowed > 1) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1356,7 +1439,6 @@ retry: * The task is still there. We don't try * again, some other cpu will pull it when ready. */ - dequeue_pushable_dl_task(rq, next_task); goto out; } @@ -1372,15 +1454,16 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); activate_task(later_rq, next_task, 0); + ret = 1; - resched_task(later_rq->curr); + resched_curr(later_rq); double_unlock_balance(rq, later_rq); out: put_task_struct(next_task); - return 1; + return ret; } static void push_dl_tasks(struct rq *rq) @@ -1443,7 +1526,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1486,7 +1569,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) p->nr_cpus_allowed > 1 && dl_task(rq->curr) && (rq->curr->nr_cpus_allowed < 2 || - dl_entity_preempt(&rq->curr->dl, &p->dl))) { + !dl_entity_preempt(&p->dl, &rq->curr->dl))) { push_dl_tasks(rq); } } @@ -1495,10 +1578,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). @@ -1515,8 +1621,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ @@ -1540,6 +1644,7 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); + cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); } @@ -1551,6 +1656,7 @@ static void rq_offline_dl(struct rq *rq) dl_clear_overload(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); } void init_sched_dl_class(void) @@ -1564,20 +1670,48 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. + */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + /* XXX we should retain the bw until 0-lag */ + cancel_dl_timer(rq, p); + __dl_clear_params(p); -#ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one * from an overloaded cpu, if any. */ - if (!rq->dl.dl_nr_running) - pull_dl_task(rq); -#endif + if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) + return; + + if (pull_dl_task(rq)) + resched_curr(rq); } /* @@ -1588,22 +1722,19 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) { int check_resched = 1; - /* - * If p is throttled, don't consider the possibility - * of preempting rq->curr, the check will be done right - * after its runtime will get replenished. - */ - if (unlikely(p->dl.dl_throttled)) - return; - - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } @@ -1614,7 +1745,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately @@ -1632,14 +1763,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* * Again, we don't know if p has a earlier * or later deadline, so let's blindly set a * (maybe not needed) rescheduling point. */ - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else switched_to_dl(rq, p); @@ -1673,4 +1804,15 @@ const struct sched_class dl_sched_class = { .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, + + .update_curr = update_curr_dl, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); + +void print_dl_stats(struct seq_file *m, int cpu) +{ + print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.running_avg_sum); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN @@ -150,7 +152,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,16 +160,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + rcu_read_lock(); + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - - read_unlock_irqrestore(&tasklist_lock, flags); + } + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -217,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -264,6 +265,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) +{ + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +} + extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) @@ -302,6 +309,7 @@ do { \ PN(next_balance); SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); + PN(clock_task); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); @@ -332,10 +340,9 @@ do { \ spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + print_dl_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -533,8 +540,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults_memory) - nr_faults = p->numa_faults_memory[2*node + i]; + if (p->numa_faults) + nr_faults = p->numa_faults[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); @@ -633,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d3335e1f..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/cpuidle.h> #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> @@ -665,20 +666,22 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) { u32 slice; - p->se.avg.decay_count = 0; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -724,6 +727,11 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); } +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -826,11 +834,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); @@ -865,7 +874,6 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; - struct list_head task_list; struct rcu_head rcu; nodemask_t active_nodes; @@ -893,18 +901,24 @@ pid_t task_numa_group_id(struct task_struct *p) return p->numa_group ? p->numa_group->gid : 0; } -static inline int task_faults_idx(int nid, int priv) +/* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ +static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { - return NR_NUMA_HINT_FAULT_TYPES * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; - return p->numa_faults_memory[task_faults_idx(nid, 0)] + - p->numa_faults_memory[task_faults_idx(nid, 1)]; + return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -912,14 +926,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[task_faults_idx(nid, 0)] + - p->numa_group->faults[task_faults_idx(nid, 1)]; + return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] + + p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)]; } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(nid, 0)] + - group->faults_cpu[task_faults_idx(nid, 1)]; + return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + + group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; +} + +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. + */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; } /* @@ -928,11 +1007,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; - if (!p->numa_faults_memory) + if (!p->numa_faults) return 0; total_faults = p->total_numa_faults; @@ -940,15 +1020,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) + return 0; + + total_faults = p->numa_group->total_faults; + + if (!total_faults) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1038,7 +1132,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,9 +1157,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } @@ -1077,6 +1175,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1096,32 +1195,59 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, - long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; + long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; + + /* + * The load is corrected for the CPU capacity available on each node. + * + * src_load dst_load + * ------------ vs --------- + * src_capacity dst_capacity + */ + src_capacity = env->src_stats.compute_capacity; + dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * 100 - src_load * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. + * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. */ - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); + orig_src_load = env->src_stats.load; - old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. + * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* @@ -1136,15 +1262,33 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long orig_src_load, src_load; - long orig_dst_load, dst_load; + long src_load, dst_load; long load; - long imp = (groupimp > 0) ? groupimp : taskimp; + long imp = env->p->numa_group ? groupimp : taskimp; + long moveimp = imp; + int dist = env->dist; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); + + /* + * Because we have preemption enabled we can get migrated around and + * end try selecting ourselves (current == env->p) as a swap candidate. + */ + if (cur == env->p) + goto unlock; /* * "imp" is the fault differential for the source task between the @@ -1163,8 +1307,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1177,26 +1321,21 @@ static void task_numa_compare(struct task_numa_env *env, * itself (not part of a group), use the task weight * instead. */ - if (env->p->numa_group) - imp = groupimp; - else - imp = taskimp; - if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } - if (imp < env->best_imp) + if (imp <= env->best_imp && moveimp <= env->best_imp) goto unlock; if (!cur) { /* Is there capacity at our destination? */ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; @@ -1204,20 +1343,34 @@ static void task_numa_compare(struct task_numa_env *env, } /* Balance doesn't matter much if we're running a task per cpu */ - if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) + if (imp > env->best_imp && src_rq->nr_running == 1 && + dst_rq->nr_running == 1) goto assign; /* * In the overloaded case, try and keep the load balanced. */ balance: - orig_dst_load = env->dst_stats.load; - orig_src_load = env->src_stats.load; - - /* XXX missing capacity terms */ load = task_h_load(env->p); - dst_load = orig_dst_load + load; - src_load = orig_src_load - load; + dst_load = env->dst_stats.load + load; + src_load = env->src_stats.load - load; + + if (moveimp > imp && moveimp > env->best_imp) { + /* + * If the improvement from just moving env->p direction is + * better than swapping tasks around, check if a move is + * possible. Store a slightly smaller score than moveimp, + * so an actually idle CPU will win. + */ + if (!load_too_imbalanced(src_load, dst_load, env)) { + imp = moveimp - 1; + cur = NULL; + goto assign; + } + } + + if (imp <= env->best_imp) + goto unlock; if (cur) { load = task_h_load(cur); @@ -1225,10 +1378,16 @@ balance: src_load += load; } - if (load_too_imbalanced(orig_src_load, orig_dst_load, - src_load, dst_load, env)) + if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. + */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: @@ -1266,7 +1425,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1294,40 +1453,51 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); - /* If the preferred nid has free capacity, try to use it. */ - if (env.dst_stats.has_free_capacity) - task_numa_find_cpu(&env, taskimp, groupimp); + /* Try to find a spot on the preferred nid. */ + task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); } } - /* No better CPU than the current one was found. */ - if (env.best_cpu == -1) - return -EAGAIN; - /* * If the task is part of a workload that spans multiple NUMA nodes, * and is migrating into one of the workload's active nodes, remember @@ -1336,8 +1506,19 @@ static int task_numa_migrate(struct task_struct *p) * A task that migrated to a second choice node will be better off * trying for a better one later. Do not set the preferred node here. */ - if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) - sched_setnuma(p, env.dst_nid); + if (p->numa_group) { + if (env.best_cpu == -1) + nid = env.src_nid; + else + nid = env.dst_nid; + + if (node_isset(nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); + } + + /* No better CPU than the current one was found. */ + if (env.best_cpu == -1) + return -EAGAIN; /* * Reset the scan period if the task is being rescheduled on an @@ -1365,7 +1546,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1415,12 +1596,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7 /* * Increase the scan period (slow down scanning) if the majority of @@ -1441,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -1477,7 +1660,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1505,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + *period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -1514,6 +1697,94 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group = NODE_MASK_NONE; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + if (!max_faults) + break; + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1541,18 +1812,23 @@ static void task_numa_placement(struct task_struct *p) /* Find the node with the highest number of faults */ for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; unsigned long faults = 0, group_faults = 0; - int priv, i; + int priv; for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; - i = task_faults_idx(nid, priv); + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); /* Decay existing window, copy faults since last scan */ - diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; - fault_types[priv] += p->numa_faults_buffer_memory[i]; - p->numa_faults_buffer_memory[i] = 0; + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; /* * Normalize the faults_from, so all tasks in a group @@ -1562,21 +1838,27 @@ static void task_numa_placement(struct task_struct *p) * faults are less important. */ f_weight = div64_u64(runtime << 16, period + 1); - f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / (total_faults + 1); - f_diff = f_weight - p->numa_faults_cpu[i] / 2; - p->numa_faults_buffer_cpu[i] = 0; + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; - p->numa_faults_memory[i] += diff; - p->numa_faults_cpu[i] += f_diff; - faults += p->numa_faults_memory[i]; + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; p->total_numa_faults += diff; if (p->numa_group) { - /* safe because we can only change our own group */ - p->numa_group->faults[i] += diff; - p->numa_group->faults_cpu[i] += f_diff; + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + p->numa_group->faults[mem_idx] += diff; + p->numa_group->faults_cpu[mem_idx] += f_diff; p->numa_group->total_faults += diff; - group_faults += p->numa_group->faults[i]; + group_faults += p->numa_group->faults[mem_idx]; } } @@ -1595,30 +1877,17 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); - /* - * If the preferred task and group nids are different, - * iterate over the nodes again to find the best place. - */ - if (max_nid != max_group_nid) { - unsigned long weight, max_weight = 0; - - for_each_online_node(nid) { - weight = task_weight(p, nid) + group_weight(p, nid); - if (weight > max_weight) { - max_weight = weight; - max_nid = nid; - } - } - } - spin_unlock_irq(group_lock); + max_nid = preferred_group_nid(p, max_group_nid); } - /* Preferred node as the node with the most faults */ - if (max_faults && max_nid != p->numa_preferred_nid) { - /* Update the preferred nid and migrate task if possible */ - sched_setnuma(p, max_nid); - numa_migrate_preferred(p); + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + + if (task_node(p) != p->numa_preferred_nid) + numa_migrate_preferred(p); } } @@ -1652,7 +1921,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, atomic_set(&grp->refcount, 1); spin_lock_init(&grp->lock); - INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * @@ -1661,11 +1929,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, node_set(task_node(current), grp->active_nodes); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] = p->numa_faults_memory[i]; + grp->faults[i] = p->numa_faults[i]; grp->total_faults = p->total_numa_faults; - list_add(&p->numa_entry, &grp->task_list); grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); } @@ -1720,13 +1987,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults_memory[i]; - grp->faults[i] += p->numa_faults_memory[i]; + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; - list_move(&p->numa_entry, &grp->task_list); my_grp->nr_tasks--; grp->nr_tasks++; @@ -1746,27 +2012,23 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults_memory; + void *numa_faults = p->numa_faults; unsigned long flags; int i; if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) - grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] -= p->numa_faults[i]; grp->total_faults -= p->total_numa_faults; - list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } - p->numa_faults_memory = NULL; - p->numa_faults_buffer_memory = NULL; - p->numa_faults_cpu= NULL; - p->numa_faults_buffer_cpu = NULL; + p->numa_faults = NULL; kfree(numa_faults); } @@ -1788,29 +2050,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults_memory) + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) return; - BUG_ON(p->numa_faults_buffer_memory); - /* - * The averaged statistics, shared & private, memory & cpu, - * occupy the first half of the array. The second half of the - * array is for current counters, which are averaged into the - * first set by task_numa_placement. - */ - p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); - p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); - p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1849,9 +2097,11 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; - p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults_locality[local] += pages; } @@ -1930,8 +2180,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma)) { continue; + } /* * Shared library pages mapped by multiple processes are not @@ -2195,8 +2447,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (n<PERIOD) + * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD) + * With a look-up table which covers y^n (n<PERIOD) * * To achieve constant time decay_load. */ @@ -2266,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2294,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2307,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta_w; delta -= delta_w; @@ -2317,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta * scale_freq + >> SCHED_CAPACITY_SHIFT; + sa->avg_period += delta; return decayed; } @@ -2342,11 +2607,13 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) u64 decays = atomic64_read(&cfs_rq->decay_counter); decays -= se->avg.decay_count; + se->avg.decay_count = 0; if (!decays) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); - se->avg.decay_count = 0; + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2361,6 +2628,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; @@ -2379,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2432,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2450,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2469,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2485,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2497,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2583,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2601,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -2899,7 +3201,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -2923,7 +3225,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static void @@ -2938,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); @@ -3063,7 +3366,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); return; } /* @@ -3254,7 +3557,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_task(rq_of(cfs_rq)->curr); + resched_curr(rq_of(cfs_rq)); } static __always_inline @@ -3360,7 +3663,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + /* + * Add to the _head_ of the list, so that an already-started + * distribute_cfs_runtime will not see us + */ + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); @@ -3410,14 +3717,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) - resched_task(rq->curr); + resched_curr(rq); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining, u64 expires) { struct cfs_rq *cfs_rq; - u64 runtime = remaining; + u64 runtime; + u64 starting_runtime = remaining; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3448,7 +3756,7 @@ next: } rcu_read_unlock(); - return remaining; + return starting_runtime - remaining; } /* @@ -3494,22 +3802,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - /* - * There are throttled entities so we must first use the new bandwidth - * to unthrottle them before making it generally available. This - * ensures that all existing debts will be paid before a new cfs_rq is - * allowed to run. - */ - runtime = cfs_b->runtime; runtime_expires = cfs_b->runtime_expires; - cfs_b->runtime = 0; /* - * This check is repeated as we are holding onto the new bandwidth - * while we unthrottle. This can potentially race with an unthrottled - * group trying to acquire new bandwidth from the global pool. + * This check is repeated as we are holding onto the new bandwidth while + * we unthrottle. This can potentially race with an unthrottled group + * trying to acquire new bandwidth from the global pool. This can result + * in us over-using our runtime if it is all used during this loop, but + * only by limited amounts in that extreme case. */ - while (throttled && runtime > 0) { + while (throttled && cfs_b->runtime > 0) { + runtime = cfs_b->runtime; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3517,10 +3820,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) raw_spin_lock(&cfs_b->lock); throttled = !list_empty(&cfs_b->throttled_cfs_rq); + + cfs_b->runtime -= min(runtime, cfs_b->runtime); } - /* return (any) remaining runtime */ - cfs_b->runtime = runtime; /* * While we are ensured activity in the period following an * unthrottle, this also covers the case in which the new bandwidth is @@ -3631,10 +3934,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; } - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - cfs_b->runtime = 0; - } + expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -3645,7 +3947,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) - cfs_b->runtime = runtime; + cfs_b->runtime -= min(runtime, cfs_b->runtime); raw_spin_unlock(&cfs_b->lock); } @@ -3771,10 +4073,27 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + /* init_cfs_bandwidth() was not called */ + if (!cfs_b->throttled_cfs_rq.next) + return; + hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + raw_spin_lock(&cfs_b->lock); + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; + raw_spin_unlock(&cfs_b->lock); + } +} + static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -3788,6 +4107,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) * there's some valid quota amount */ cfs_rq->runtime_remaining = 1; + /* + * Offline rq is schedulable till cpu is completely disabled + * in take_cpu_down(), so we prevent new cfs throttling here. + */ + cfs_rq->runtime_enabled = 0; + if (cfs_rq_throttled(cfs_rq)) unthrottle_cfs_rq(cfs_rq); } @@ -3831,6 +4156,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -3854,17 +4180,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_task(p); + resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } @@ -4049,10 +4367,15 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -4178,7 +4501,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) * wl = S * s'_i; see (2) */ if (W > 0 && w < W) - wl = (w * tg->shares) / W; + wl = (w * (long)tg->shares) / W; else wl = tg->shares; @@ -4241,8 +4564,8 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4285,47 +4608,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; + balanced = this_eff_load <= prev_eff_load; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* @@ -4393,20 +4699,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. + */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else if (shallowest_idle_cpu == -1) { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } /* @@ -4453,6 +4785,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. + * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -4475,14 +4834,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (p->nr_cpus_allowed == 1) - return prev_cpu; - - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -4669,7 +5022,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -4723,7 +5076,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_task(curr); + resched_curr(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -4904,7 +5257,7 @@ static void yield_task_fair(struct rq *rq) * so we don't do microscopic update in schedule() * and double the fastpath cost. */ - rq->skip_clock_update = 1; + rq_clock_skip_update(rq, true); } set_skip_buddy(se); @@ -5077,28 +5430,18 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; /* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. - */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - -/* * Is this task likely cache-hot: */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5108,7 +5451,7 @@ task_hot(struct task_struct *p, u64 now) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -5118,7 +5461,7 @@ task_hot(struct task_struct *p, u64 now) if (sysctl_sched_migration_cost == 0) return 0; - delta = now - p->se.exec_start; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; } @@ -5130,7 +5473,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || !(env->sd->flags & SD_NUMA)) { return false; } @@ -5169,7 +5512,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -5218,6 +5561,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5272,28 +5618,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); + tsk_cache_hot = task_hot(p, env); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } - return 1; } @@ -5302,47 +5636,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + +/* + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. */ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + detach_task(p, env); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). + * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is detach_tasks) + * so we can safely collect stats here rather than + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; } static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. */ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5373,14 +5723,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5400,13 +5752,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). */ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); - return pulled; + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); + + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. + */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5525,6 +5922,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5534,12 +5938,12 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? */ - int group_has_free_capacity; + enum group_type group_type; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -5576,6 +5980,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5608,35 +6014,23 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return SCHED_CAPACITY_SCALE; -} + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) -{ - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; - - smt_gain /= weight; - - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -5645,52 +6039,35 @@ static unsigned long scale_rt_capacity(int cpu) */ age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); + delta = __rq_clock_broken(rq) - age_stamp; - delta = rq_clock(rq) - age_stamp; if (unlikely(delta < 0)) delta = 0; total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } - - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + used = div_u64(avg, total); - total >>= SCHED_CAPACITY_SHIFT; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - } - - sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); + capacity *= arch_scale_cpu_capacity(sd, cpu); else - capacity *= default_scale_capacity(sd, cpu); + capacity *= default_scale_cpu_capacity(sd, cpu); capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -5705,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -5717,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -5737,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -5760,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -5830,31 +6188,62 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. + */ +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running < sgs->group_weight) + return true; + + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; + + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. */ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running <= sgs->group_weight) + return false; + + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + return false; +} - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) +{ + if (sgs->group_no_capacity) + return group_overloaded; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if (sg_imbalanced(group)) + return group_imbalanced; - return capacity_factor; + return group_other; } /** @@ -5864,10 +6253,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs, + bool *overload) { unsigned long load; int i; @@ -5884,7 +6275,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->group_usage += get_cpu_usage(i); + sgs->sum_nr_running += rq->cfs.h_nr_running; + + if (rq->nr_running > 1) + *overload = true; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5903,11 +6299,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - sgs->group_imb = sg_imbalanced(group); - sgs->group_capacity_factor = sg_capacity_factor(env, group); - - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -5928,13 +6321,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. */ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5942,8 +6341,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -5995,6 +6393,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; + bool overload = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6015,24 +6414,28 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs, + &overload); if (local_group) goto next_group; /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). */ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; + } if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -6049,6 +6452,13 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + + if (!env->sd->parent) { + /* update overload indicator if we are at root domain */ + if (env->dst_rq->rd->overload != overload) + env->dst_rq->rd->overload = overload; + } + } /** @@ -6179,7 +6589,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6199,17 +6609,17 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + /* + * If there aren't any idle cpus, avoid creating some. + */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6272,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6288,16 +6699,16 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* - * If the local group is more busy than the selected busiest group + * If the local group is busier than the selected busiest group * don't try and pull any tasks. */ if (local->avg_load >= busiest->avg_load) @@ -6312,13 +6723,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* @@ -6351,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6380,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6390,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* @@ -6438,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. + */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6490,7 +6914,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, @@ -6501,6 +6925,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6536,6 +6961,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6545,28 +6973,33 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); - local_irq_restore(flags); + cur_ld_moved = detach_tasks(&env); /* - * some other cpu did the load balance for us. + * We've detached some tasks from busiest_rq. Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. */ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + + local_irq_restore(flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -6616,10 +7049,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6630,7 +7061,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6695,7 +7126,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -6704,6 +7135,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. + */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; @@ -6767,7 +7215,8 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) { + if (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -6864,6 +7313,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6883,9 +7333,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6906,16 +7353,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } @@ -7219,22 +7672,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7248,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. */ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } @@ -7295,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* @@ -7325,6 +7791,8 @@ void trigger_load_balance(struct rq *rq) static void rq_online_fair(struct rq *rq) { update_sysctl(); + + update_runtime_enabled(rq); } static void rq_offline_fair(struct rq *rq) @@ -7398,7 +7866,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_task(rq->curr); + resched_curr(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -7413,7 +7881,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7423,7 +7891,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_task(rq->curr); + resched_curr(rq); } else check_preempt_curr(rq, p, 0); } @@ -7438,11 +7906,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7469,15 +7937,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7486,7 +7954,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * if we can still preempt the current task. */ if (rq->curr == p) - resched_task(rq->curr); + resched_curr(rq); else check_preempt_curr(rq, p, 0); } @@ -7523,7 +7991,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7542,7 +8010,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7553,14 +8021,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP @@ -7783,6 +8251,8 @@ const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .update_curr = update_curr_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -7,6 +7,7 @@ #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> +#include <linux/suspend.h> #include <asm/tlb.h> @@ -47,7 +48,8 @@ static inline int cpu_idle_poll(void) rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); local_irq_enable(); - while (!tif_need_resched()) + while (!tif_need_resched() && + (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); rcu_idle_exit(); @@ -79,7 +81,8 @@ static void cpuidle_idle_call(void) struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; - bool broadcast; + unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -103,25 +106,37 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. + * Suspend-to-idle ("freeze") is a system state in which all user space + * has been frozen, all I/O devices have been suspended and the only + * activity happens here and in iterrupts (if any). In that case bypass + * the cpuidle governor and go stratight for the deepest idle state + * available. Possibly also suspend the local tick and the entire + * timekeeping to prevent timer interrupts from kicking us out of idle + * until a proper wakeup interrupt happens. */ - next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { local_irq_enable(); - else - arch_cpu_idle(); + goto exit_idle; + } - goto exit_idle; + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); } - + /* Fall back to the default arch idle method on errors. */ + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -135,7 +150,7 @@ use_default: goto exit_idle; } - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP; /* * Tell the time framework to switch to a broadcast timer @@ -143,11 +158,11 @@ use_default: * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + if (broadcast && tick_broadcast_enter()) goto use_default; - trace_cpu_idle_rcuidle(next_state, dev->cpu); + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); /* * Enter the idle state previously returned by the governor decision. @@ -156,15 +171,17 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + /* The cpu is no longer idle or about to enter idle. */ + idle_set_state(this_rq(), NULL); if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + tick_broadcast_exit(); /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); @@ -177,8 +194,23 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. + */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } +DEFINE_PER_CPU(bool, cpu_dead_idle); + /* * Generic idle loop implementation * @@ -203,8 +235,13 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(smp_processor_id())) { + rcu_cpu_notify(NULL, CPU_DYING_IDLE, + (void *)(long)smp_processor_id()); + smp_mb(); /* all activity before dead. */ + this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); + } local_irq_disable(); arch_cpu_idle_enter(); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..c65dac8c97cd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) */ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->idle); + resched_curr(rq); } static struct task_struct * @@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task return 0; } +static void update_curr_idle(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU idle tasks: */ @@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, + .update_curr = update_curr_idle, }; diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a49083192c64..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include <linux/slab.h> +#include <linux/irq_work.h> int sched_rr_timeslice = RR_TIMESLICE; @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +#ifdef CONFIG_SMP +static void push_irq_work_func(struct irq_work *work); +#endif + +void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; int i; @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); #endif +#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } @@ -463,9 +475,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct rq *rq = rq_of_rt_rq(rt_rq); struct sched_rt_entity *rt_se; - int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + int cpu = cpu_of(rq); rt_se = rt_rq->tg->rt_se[cpu]; @@ -476,7 +489,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) enqueue_rt_entity(rt_se, false); if (rt_rq->highest_prio.curr < curr->prio) - resched_task(curr); + resched_curr(rq); } } @@ -566,7 +579,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) return; enqueue_top_rt_rq(rt_rq); - resched_task(rq->curr); + resched_curr(rq); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -740,6 +753,9 @@ balanced: rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); + + /* Make rt_rq available for pick_next_task() */ + sched_rt_rq_enqueue(rt_rq); } } @@ -827,11 +843,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) enqueue = 1; /* - * Force a clock update if the CPU was idle, - * lest wakeup -> unthrottle time accumulate. + * When we're idle and a woken (rt) task is + * throttled check_preempt_curr() will set + * skip_update and the time between the wakeup + * and this unthrottle will get accounted as + * 'runtime'. */ if (rt_rq->rt_nr_running && rq->curr == rq->idle) - rq->skip_clock_update = -1; + rq_clock_skip_update(rq, false); } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; @@ -948,7 +967,7 @@ static void update_curr_rt(struct rq *rq) raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -1297,9 +1316,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (p->nr_cpus_allowed == 1) - goto out; - /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1336,7 +1352,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr->prio <= p->prio)) { int target = find_lowest_rq(p); - if (target != -1) + /* + * Don't bother moving it if the destination CPU is + * not running a lower priority task. + */ + if (target != -1 && + p->prio < cpu_rq(target)->rt.highest_prio.curr) cpu = target; } rcu_read_unlock(); @@ -1347,23 +1368,29 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->nr_cpus_allowed == 1) + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) return; + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) - return; - /* * There appears to be other cpus that can accept * current and none to run 'p', so lets reschedule * to try and push current away: */ requeue_task_rt(rq, p, 1); - resched_task(rq->curr); + resched_curr(rq); } #endif /* CONFIG_SMP */ @@ -1374,7 +1401,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { - resched_task(rq->curr); + resched_curr(rq); return; } @@ -1444,7 +1471,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. */ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1464,8 +1491,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); @@ -1522,7 +1548,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); @@ -1608,6 +1634,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) lowest_rq = cpu_rq(cpu); + if (lowest_rq->rt.highest_prio.curr <= task->prio) { + /* + * Target rq has tasks of equal or higher priority, + * retrying does not release any lock and is unlikely + * to yield a different result. + */ + lowest_rq = NULL; + break; + } + /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { /* @@ -1620,7 +1656,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1654,7 +1690,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1690,7 +1726,7 @@ retry: * just reschedule current. */ if (unlikely(next_task->prio < rq->curr->prio)) { - resched_task(rq->curr); + resched_curr(rq); return 0; } @@ -1737,7 +1773,7 @@ retry: activate_task(lowest_rq, next_task, 0); ret = 1; - resched_task(lowest_rq->curr); + resched_curr(lowest_rq); double_unlock_balance(rq, lowest_rq); @@ -1754,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) ; } +#ifdef HAVE_RT_PUSH_IPI +/* + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. + * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. + */ +static int rto_next_cpu(struct rq *rq) +{ + int prev_cpu = rq->rt.push_cpu; + int cpu; + + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + + /* + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. + */ + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + + } else if (cpu >= nr_cpu_ids) { + /* + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. + */ + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; + + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; +} + +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu; + + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. + */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } + + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; + + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + + irq_work_queue_on(&rq->rt.push_work, cpu); +} + +/* Called from hardirq context */ +static void try_to_push_tasks(void *arg) +{ + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; + int cpu; + + this_cpu = rt_rq->push_cpu; + + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); + push_rt_task(rq); + raw_spin_unlock(&rq->lock); + } + + /* Pass the IPI to the next rt overloaded queue */ + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. + */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } + + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. + */ + if (unlikely(cpu == rq->cpu)) + goto again; + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); +} +#endif /* HAVE_RT_PUSH_IPI */ + static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; @@ -1769,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) */ smp_rmb(); +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return 0; + } +#endif + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; @@ -1805,7 +2006,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1866,7 +2067,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1932,11 +2133,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) - resched_task(rq->curr); + resched_curr(rq); } void __init init_sched_rt_class(void) @@ -1966,7 +2167,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1974,7 +2175,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) check_resched = 0; #endif /* CONFIG_SMP */ if (check_resched && p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -1985,7 +2186,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { @@ -2003,11 +2204,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * Only reschedule if p is still on the same runqueue. */ if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) - resched_task(p); + resched_curr(rq); #else /* For UP simply resched on drop of prio */ if (oldprio < p->prio) - resched_task(p); + resched_curr(rq); #endif /* CONFIG_SMP */ } else { /* @@ -2016,7 +2217,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) * then reschedule. */ if (p->prio < rq->curr->prio) - resched_task(rq->curr); + resched_curr(rq); } } @@ -2069,7 +2270,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } @@ -2125,6 +2326,8 @@ const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + + .update_curr = update_curr_rt, }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02ebc54e..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> +#include <linux/irq_work.h> #include <linux/tick.h> #include <linux/slab.h> @@ -14,6 +15,11 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -126,6 +132,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: @@ -168,6 +177,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED @@ -184,7 +212,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 runtime_expires; int idle, timer_active; @@ -335,8 +363,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; @@ -391,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* RT IPI pull logic requires IRQ_WORK */ +#ifdef CONFIG_IRQ_WORK +# define HAVE_RT_PUSH_IPI +#endif + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -408,7 +447,13 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; #endif +#endif /* CONFIG_SMP */ int rt_queued; int rt_throttled; @@ -477,6 +522,9 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; + /* Indicate more than one runnable task for any CPU */ + bool overload; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -528,8 +576,6 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif - int skip_clock_update; - /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -558,6 +604,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; + unsigned int clock_skip_update; u64 clock; u64 clock_task; @@ -568,6 +615,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ @@ -633,6 +681,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -644,25 +697,62 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) +#define this_rq() this_cpu_ptr(&runqueues) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(&runqueues) + +static inline u64 __rq_clock_broken(struct rq *rq) +{ + return ACCESS_ONCE(rq->clock); +} static inline u64 rq_clock(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { + lockdep_assert_held(&rq->lock); return rq->clock_task; } +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 + +static inline void rq_clock_skip_update(struct rq *rq, bool skip) +{ + lockdep_assert_held(&rq->lock); + if (skip) + rq->clock_skip_update |= RQCF_REQ_SKIP; + else + rq->clock_skip_update &= ~RQCF_REQ_SKIP; +} + +#ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING +/* The regions in numa_faults array from task_struct */ +enum numa_faults_stats { + NUMA_MEM = 0, + NUMA_CPU, + NUMA_MEMBUF, + NUMA_CPUBUF +}; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); @@ -736,7 +826,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -884,20 +974,10 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct static_key *key) -{ - return static_key_true(key); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct static_key *key) -{ - return static_key_false(key); /* Out of line branch. */ -} - #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ - return static_branch__##enabled(key); \ + return static_key_##enabled(key); \ } #include "features.h" @@ -949,6 +1029,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} + +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) @@ -960,7 +1049,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -998,35 +1086,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ @@ -1142,6 +1201,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, @@ -1150,6 +1214,8 @@ struct sched_class { unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task); + void (*update_curr) (struct rq *rq); + #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif @@ -1187,6 +1253,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); @@ -1196,7 +1286,7 @@ extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; @@ -1218,15 +1308,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count) rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP + if (!rq->rd->overload) + rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(rq->cpu)) { - /* Order rq->nr_running write against the IPI */ - smp_wmb(); - smp_send_reschedule(rq->cpu); + /* + * Tick is needed if more than one task runs on a CPU. + * Send the target an IPI to kick it out of nohz mode. + * + * We assume that IPI implies full memory barrier and the + * new value of rq->nr_running is visible on reception + * from the target. + */ + tick_nohz_full_kick_cpu(rq->cpu); } - } #endif + } } static inline void sub_nr_running(struct rq *rq, unsigned count) @@ -1286,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else @@ -1298,6 +1408,82 @@ static inline void sched_avg_update(struct rq *rq) { } extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + /* + * move_queued_task() task_rq_lock() + * + * ACQUIRE (rq->lock) + * [S] ->on_rq = MIGRATING [L] rq = task_rq() + * WMB (__set_task_cpu()) ACQUIRE (rq->lock); + * [S] ->cpu = new_cpu [L] task_rq() + * [L] ->on_rq + * RELEASE (rq->lock) + * + * If we observe the old cpu in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * + * If we observe the new cpu in task_rq_lock, the acquire will + * pair with the WMB to ensure we must then also see migrating. + */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); + } +} + +static inline void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1482,10 +1668,11 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); +extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index a476bea17fbc..87e2c9f0c33e 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -15,11 +15,6 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; if (v == (void *)1) { seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); @@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); + seq_printf(seq, "domain%d %*pb", dcount++, + cpumask_pr_args(sched_domain_span(sd))); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { seq_printf(seq, " %u %u %u %u %u %u %u %u", @@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v) rcu_read_unlock(); #endif } - kfree(mask_str); return 0; } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..79ffec45a6ac 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); @@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) return 0; } +static void update_curr_stop(struct rq *rq) +{ +} + /* * Simple, special scheduling class for the per-CPU stop tasks: */ @@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, + .update_curr = update_curr_stop, }; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20ae657b..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/wait.h> #include <linux/hash.h> +#include <linux/kthread.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. + */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; @@ -319,14 +385,14 @@ EXPORT_SYMBOL(wake_bit_function); */ int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { int ret = 0; do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); + ret = (*action)(&q->key); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -334,7 +400,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit); int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -343,9 +409,21 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + wait.key.timeout = jiffies + timeout; + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { do { int ret; @@ -353,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(q->key.flags); + ret = action(&q->key); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -365,7 +443,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, EXPORT_SYMBOL(__wait_on_bit_lock); int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) + wait_bit_action_f *action, unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); DEFINE_WAIT_BIT(wait, word, bit); @@ -502,3 +580,45 @@ void wake_up_atomic_t(atomic_t *p) __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); } EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word) +{ + if (signal_pending_state(current->state, current)) + return 1; + io_schedule(); + return 0; +} +EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |