diff options
Diffstat (limited to 'kernel/sched/deadline.c')
-rw-r--r-- | kernel/sched/deadline.c | 866 |
1 files changed, 567 insertions, 299 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43323f875cb9..9ae8f41e3372 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -15,10 +15,42 @@ * Michael Trimarchi <michael@amarulasolutions.com>, * Fabio Checconi <fchecconi@gmail.com> */ -#include "sched.h" -#include "pelt.h" -struct dl_bandwidth def_dl_bandwidth; +/* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting ridiculously long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ +#ifdef CONFIG_SYSCTL +static struct ctl_table sched_dl_sysctls[] = { + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, + }, + {} +}; + +static int __init sched_dl_sysctl_init(void) +{ + register_sysctl_init("kernel", sched_dl_sysctls); + return 0; +} +late_initcall(sched_dl_sysctl_init); +#endif static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) { @@ -43,6 +75,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -54,15 +108,75 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) +{ + unsigned long cap = 0; + int i; + + for_each_cpu_and(i, mask, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!sched_asym_cpucap_active() && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); + } +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); + int i; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + for_each_cpu_and(i, rd->span, cpu_active_mask) { + struct rq *rq = cpu_rq(i); + + rq->dl.extra_bw += bw; + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,14 +187,53 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} + +static inline +void __dl_update(struct dl_bw *dl_b, s64 bw) +{ + struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); + + dl->extra_bw += bw; +} #endif static inline +void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw -= tsk_bw; + __dl_update(dl_b, (s32)tsk_bw / cpus); +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) +{ + dl_b->total_bw += tsk_bw; + __dl_update(dl_b, -((s32)tsk_bw / cpus)); +} + +static inline bool +__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +static inline void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); @@ -93,7 +246,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) @@ -107,7 +260,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ } @@ -117,7 +270,7 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; - lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); + lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) @@ -153,11 +306,11 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); if (task_on_rq_queued(p)) return; @@ -168,7 +321,7 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -190,7 +343,7 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) * fires. * * If the task wakes up again before the inactive timer fires, - * the timer is cancelled, whereas if the task wakes up after the + * the timer is canceled, whereas if the task wakes up after the * inactive timer fired (and running_bw has been decreased) the * task's utilization has to be added to running_bw again. * A flag in the deadline scheduling entity (dl_non_contending) @@ -271,15 +424,15 @@ static void task_non_contending(struct task_struct *p) if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD) + if (READ_ONCE(p->__state) == TASK_DEAD) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); } return; @@ -308,7 +461,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) dl_se->dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -331,9 +484,11 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->root.rb_leftmost == &dl_se->rb_node; + return rb_first_cached(&dl_rq->root) == &dl_se->rb_node; } +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); + void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) { raw_spin_lock_init(&dl_b->dl_runtime_lock); @@ -344,12 +499,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) void init_dl_bw(struct dl_bw *dl_b) { raw_spin_lock_init(&dl_b->lock); - raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); if (global_rt_runtime() == RUNTIME_INF) dl_b->bw = -1; else dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); - raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); dl_b->total_bw = 0; } @@ -438,58 +591,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) update_dl_migration(dl_rq); } +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) + +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; - - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } + struct rb_node *leftmost; - if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); + if (leftmost) + rq->dl.earliest_dl.next = p->dl.deadline; } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } @@ -502,11 +641,11 @@ static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } -static DEFINE_PER_CPU(struct callback_head, dl_push_head); -static DEFINE_PER_CPU(struct callback_head, dl_pull_head); +static DEFINE_PER_CPU(struct balance_callback, dl_push_head); +static DEFINE_PER_CPU(struct balance_callback, dl_pull_head); static void push_dl_tasks(struct rq *); static void pull_dl_task(struct rq *); @@ -545,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -618,15 +757,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } -static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) -{ - return false; -} - -static inline void pull_dl_task(struct rq *rq) -{ -} - static inline void deadline_queue_push_tasks(struct rq *rq) { } @@ -640,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -657,7 +795,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -673,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } /* @@ -695,22 +832,19 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; - } + if (dl_se->dl_deadline == 0) + replenish_dl_new_period(dl_se, rq); if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -722,8 +856,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. */ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -737,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) @@ -771,8 +904,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -794,9 +926,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -881,24 +1013,22 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } } @@ -925,7 +1055,7 @@ static int start_dl_timer(struct task_struct *p) ktime_t now, act; s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * We want the timer to fire at the deadline, but considering @@ -997,7 +1127,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1025,7 +1155,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. */ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1035,9 +1165,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * If the runqueue is no longer available, migrate the * task elsewhere. This necessarily changes rq. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + lockdep_unpin_lock(__rq_lockp(rq), rf.cookie); rq = dl_task_offline_migration(rq, p); - rf.cookie = lockdep_pin_lock(&rq->lock); + rf.cookie = lockdep_pin_lock(__rq_lockp(rq)); update_rq_clock(rq); /* @@ -1096,7 +1226,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1115,7 +1245,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1129,8 +1259,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is @@ -1144,7 +1272,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations * multiplied by 2^BW_SHIFT, the result has to be shifted right by * BW_SHIFT. - * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, + * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. * Since delta is a 64 bit variable, to have an overflow its value * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. @@ -1203,14 +1331,12 @@ static void update_curr_dl(struct rq *rq) return; } - schedstat_set(curr->se.statistics.exec_max, - max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->stats.exec_max, + max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); + trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (dl_entity_is_special(dl_se)) return; @@ -1246,7 +1372,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1293,10 +1419,10 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - if (!dl_task(p) || p->state == TASK_DEAD) { + if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - if (p->state == TASK_DEAD && dl_se->dl_non_contending) { + if (READ_ONCE(p->__state) == TASK_DEAD && dl_se->dl_non_contending) { sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; @@ -1329,6 +1455,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) timer->function = inactive_task_timer; } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + #ifdef CONFIG_SMP static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) @@ -1337,6 +1466,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1354,11 +1485,11 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { - struct rb_node *leftmost = dl_rq->root.rb_leftmost; - struct sched_dl_entity *entry; + struct rb_node *leftmost = rb_first_cached(&dl_rq->root); + struct sched_dl_entity *entry = __node_2_dle(leftmost); - entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline); } @@ -1399,29 +1530,94 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + +static inline struct sched_statistics * +__schedstats_from_dl_se(struct sched_dl_entity *dl_se) +{ + return &dl_task_of(dl_se)->stats; +} + +static inline void +update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) +{ + struct sched_statistics *stats; + + if (!schedstat_enabled()) + return; + + stats = __schedstats_from_dl_se(dl_se); + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); +} + +static inline void +update_stats_enqueue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + if (!schedstat_enabled()) + return; + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper_dl(dl_rq, dl_se); +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (!schedstat_enabled()) + return; + + if ((flags & DEQUEUE_SLEEP)) { + unsigned int state; + + state = READ_ONCE(p->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(p->stats.sleep_start, + rq_clock(rq_of_dl_rq(dl_rq))); + + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(p->stats.block_start, + rq_clock(rq_of_dl_rq(dl_rq))); + } +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; - - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); + + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1434,16 +1630,18 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); + + update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); /* * If this is a wakeup or a new instance, the scheduling @@ -1452,9 +1650,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1471,28 +1669,43 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + p->dl.dl_throttled = 0; + if (!(flags & ENQUEUE_REPLENISH)) + printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", + task_pid_nr(p)); + return; } @@ -1529,7 +1742,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + check_schedstat_required(); + update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -1537,6 +1753,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { + update_stats_dequeue_dl(&rq->dl, &p->dl, flags); dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } @@ -1596,15 +1813,24 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); @@ -1621,16 +1847,23 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. */ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (sched_asym_cpucap_active()) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -1641,9 +1874,10 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; - if (p->state != TASK_WAKING) + if (READ_ONCE(p->__state) != TASK_WAKING) return; rq = task_rq(p); @@ -1652,13 +1886,14 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the - * timer cannot be cancelled, inactive_task_timer() + * timer cannot be canceled, inactive_task_timer() * will see that dl_not_contending is not set, and * will not touch the rq's active utilization, * so we are still safe. @@ -1667,7 +1902,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) @@ -1745,7 +1980,12 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + p->se.exec_start = rq_clock_task(rq); + if (on_dl_rq(&p->dl)) + update_stats_wait_end_dl(dl_rq, dl_se); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); @@ -1753,7 +1993,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1762,18 +2002,17 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) deadline_queue_push_tasks(rq); } -static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, - struct dl_rq *dl_rq) +static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) { struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; - return rb_entry(left, struct sched_dl_entity, rb_node); + return __node_2_dle(left); } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -1782,15 +2021,32 @@ static struct task_struct *pick_next_task_dl(struct rq *rq) if (!sched_dl_runnable(rq)) return NULL; - dl_se = pick_next_dl_entity(rq, dl_rq); - BUG_ON(!dl_se); + dl_se = pick_next_dl_entity(dl_rq); + WARN_ON_ONCE(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p, true); + + return p; +} + +static struct task_struct *pick_next_task_dl(struct rq *rq) +{ + struct task_struct *p; + + p = pick_task_dl(rq); + if (p) + set_next_task_dl(rq, p, true); + return p; } static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (on_dl_rq(&p->dl)) + update_stats_wait_start_dl(dl_rq, dl_se); + update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); @@ -1816,7 +2072,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } @@ -1836,8 +2092,8 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!task_on_cpu(rq, p) && + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; } @@ -1848,15 +2104,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + next_node: if (next_node) { - p = rb_entry(next_node, struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(next_node); if (pick_dl_task(rq, p, cpu)) return p; @@ -1927,8 +2185,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -1950,7 +2208,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1972,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -1987,8 +2243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || - task_running(rq, task) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || + task_on_cpu(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -2002,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2022,15 +2276,14 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, - struct task_struct, pushable_dl_tasks); + p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2054,9 +2307,6 @@ static int push_dl_task(struct rq *rq) return 0; retry: - if (WARN_ON(next_task == rq->curr)) - return 0; - /* * If next_task preempts rq->curr, and rq->curr * can move away, it makes sense to just reschedule @@ -2069,6 +2319,12 @@ retry: return 0; } + if (is_migration_disabled(next_task)) + return 0; + + if (WARN_ON(next_task == rq->curr)) + return 0; + /* We might release rq lock */ get_task_struct(next_task); @@ -2102,13 +2358,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); @@ -2131,7 +2381,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2161,6 +2411,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2178,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). */ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2192,17 +2441,27 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_rq_unlock(this_rq); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_rq_lock(this_rq); + } } if (resched) @@ -2215,7 +2474,7 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && @@ -2226,12 +2485,13 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + const struct cpumask *new_mask, + u32 flags) { struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -2255,7 +2515,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, new_mask, flags); } /* Assumes rq->lock is held */ @@ -2294,9 +2554,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2305,7 +2569,6 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } @@ -2389,6 +2652,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2399,7 +2664,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (task_on_rq_queued(p) || task_current(rq, p)) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately @@ -2428,8 +2693,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class = { - .next = &rt_sched_class, +DEFINE_SCHED_CLASS(dl) = { + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2442,12 +2707,14 @@ const struct sched_class dl_sched_class = { #ifdef CONFIG_SMP .balance = balance_dl, + .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, @@ -2460,33 +2727,39 @@ const struct sched_class dl_sched_class = { .update_curr = update_curr_dl, }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2496,7 +2769,7 @@ int sched_dl_global_validate(void) return ret; } -void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) +static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) { if (global_rt_runtime() == RUNTIME_INF) { dl_rq->bw_ratio = 1 << RATIO_SHIFT; @@ -2512,21 +2785,22 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; - def_dl_bandwidth.dl_period = global_rt_period(); - def_dl_bandwidth.dl_runtime = global_rt_runtime(); - if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2549,11 +2823,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2568,15 +2843,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2616,7 +2893,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2629,7 +2906,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2644,6 +2922,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2667,12 +2947,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } @@ -2694,6 +2983,10 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) @@ -2703,60 +2996,25 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; } #ifdef CONFIG_SMP -int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) -{ - unsigned int dest_cpu; - struct dl_bw *dl_b; - bool overflow; - int cpus, ret; - unsigned long flags; - - dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); - - rcu_read_lock_sched(); - dl_b = dl_bw_of(dest_cpu); - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); - if (overflow) { - ret = -EBUSY; - } else { - /* - * We reserve space for this task in the destination - * root_domain, as we can't fail after this point. - * We will free resources in the source root_domain - * later on (see set_cpus_allowed_dl()). - */ - __dl_add(dl_b, p->dl.dl_bw, cpus); - ret = 0; - } - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - rcu_read_unlock_sched(); - - return ret; -} - int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); @@ -2764,22 +3022,32 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, return ret; } -bool dl_cpu_busy(unsigned int cpu) +int dl_cpu_busy(int cpu, struct task_struct *p) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0); + + if (!overflow && p) { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu)); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - return overflow; + return overflow ? -EBUSY : 0; } #endif |