aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- kernel/sched/fair.c (renamed from kernel/sched_fair.c) 1155
1 files changed, 890 insertions, 265 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 5c9e67923b7c..aca16b843b7e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
+#include <linux/slab.h>
+#include <linux/profile.h>
+#include <linux/interrupt.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
-static const struct sched_class fair_sched_class;
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas.
+ *
+ * Returns the multiplier selected by sysctl_sched_tunable_scaling:
+ *   SCHED_TUNABLESCALING_NONE   - 1 (no scaling)
+ *   SCHED_TUNABLESCALING_LINEAR - the online CPU count, capped at 8
+ *   SCHED_TUNABLESCALING_LOG    - 1 + ilog2() of that capped count; this
+ *                                 is also used for any unrecognized value
+ */
+static int get_update_sysctl_factor(void)
+{
+ unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }
+
+ return factor;
+}
+
+static void update_sysctl(void) /* rescale the three tunables below from their normalized values */
+{
+ unsigned int factor = get_update_sysctl_factor(); /* multiplier chosen by the tunable-scaling mode */
+
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+#undef SET_SYSCTL /* the helper macro is only meant for this function */
+}
+
+void sched_init_granularity(void) /* non-static entry point; only forwards to update_sysctl() */
+{
+ update_sysctl(); /* sets sysctl_sched_{min_granularity,latency,wakeup_granularity} */
+}
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST (~0UL)
+#else
+# define WMULT_CONST (1UL << 32)
+#endif
+
+#define WMULT_SHIFT 32
+
+/*
+ * Shift right and round:
+ */
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+/*
+ * delta *= weight / lw
+ *
+ * Scale @delta_exec by @weight / @lw->weight without a 64-bit division:
+ * the quotient is formed by multiplying with the cached fixed-point
+ * inverse @lw->inv_weight (WMULT_CONST / scaled-down lw->weight) and then
+ * doing a rounding right shift (SRR) by WMULT_SHIFT bits in total.
+ *
+ * @lw->inv_weight is computed and stored here whenever it is found to be
+ * zero, so this function may write to @lw.
+ *
+ * The result is clamped to LONG_MAX before being returned.
+ */
+static unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ struct load_weight *lw)
+{
+ u64 tmp;
+
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
+
+ if (!lw->inv_weight) { /* inverse not cached yet: derive it from lw->weight */
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1; /* w is at least WMULT_CONST: use the smallest non-zero inverse */
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST; /* zero weight: avoid dividing by zero below */
+ else
+ lw->inv_weight = WMULT_CONST / w;
+ }
+
+ /*
+ * Check whether we'd overflow the 64-bit multiplication:
+ * if tmp is larger than WMULT_CONST, shift in two rounded steps of
+ * WMULT_SHIFT/2 bits each, one before and one after the multiply.
+ */
+ if (unlikely(tmp > WMULT_CONST))
+ tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+ WMULT_SHIFT/2);
+ else
+ tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+}
+
+
+const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
}
#ifdef CONFIG_SCHED_DEBUG
-static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
- inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
- dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
list_del_init(&se->group_node);
@@ -772,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+ long tg_weight;
+
+ /*
+ * Use this CPU's actual weight instead of the last load_contribution
+ * to gain a more accurate current total weight. See
+ * update_cfs_rq_load_contribution().
+ *
+ * The value returned is tg->load_weight with @cfs_rq's recorded
+ * load_contribution replaced by its current load.weight.
+ */
+ tg_weight = atomic_read(&tg->load_weight); /* group-wide total */
+ tg_weight -= cfs_rq->load_contribution; /* drop this cfs_rq's recorded share */
+ tg_weight += cfs_rq->load.weight; /* add its current weight instead */
+
+ return tg_weight;
+}
+
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long load_weight, load, shares;
+ long tg_weight, load, shares;
+ tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
- load_weight = atomic_read(&tg->load_weight);
- load_weight += load;
- load_weight -= cfs_rq->load_contribution;
-
shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
+ if (tg_weight)
+ shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
@@ -907,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
trace_sched_stat_iowait(tsk, delta);
}
+ trace_sched_stat_blocked(tsk, delta);
+
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
@@ -1274,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct jump_label_key __cfs_bandwidth_used; /* adjusted only by account_cfs_bandwidth_used() below */
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_branch(&__cfs_bandwidth_used); /* NOTE(review): presumably true while the key count is non-zero - confirm against the jump label API */
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled)
+{
+ /*
+ * only need to count groups transitioning between enabled/!enabled:
+ * the key is incremented on a !enabled -> enabled transition and
+ * decremented on the reverse; any other combination leaves it alone
+ */
+ if (enabled && !was_enabled)
+ jump_label_inc(&__cfs_bandwidth_used);
+ else if (!enabled && was_enabled)
+ jump_label_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+ return true; /* no jump labels: always report bandwidth control as in use */
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled) {} /* nothing to count in this configuration */
+#endif /* HAVE_JUMP_LABEL */
+
/*
* default period for cfs group bandwidth.
* default: 0.1s, units: nanoseconds
@@ -1295,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*
* requires cfs_b->lock
*/
-static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
u64 now;
@@ -1307,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth; /* address of the cfs_bandwidth member embedded in @tg */
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1408,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttled;
+ return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttle_count;
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
@@ -1517,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
}
-static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1743,7 +1899,10 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1788,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -1805,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return;
@@ -1817,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
-#else
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) /* callback installed on cfs_b->slack_timer by init_cfs_bandwidth() */
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer); /* recover the owning cfs_bandwidth from its slack_timer member */
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART; /* this callback never asks for a restart */
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* callback installed on cfs_b->period_timer by init_cfs_bandwidth() */
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer); /* recover the owning cfs_bandwidth from its period_timer member */
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period); /* NOTE(review): presumably the number of periods the expiry was advanced by - confirm against the hrtimer API */
+
+ if (!overrun)
+ break; /* nothing (more) to account for */
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun); /* result of the last call decides the return value below */
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; /* non-zero idle: do not restart the timer */
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) /* set up @cfs_b's lock, defaults, list head and timers; no timer is started here */
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
+ cfs_b->quota = RUNTIME_INF; /* NOTE(review): presumably means no quota limit - confirm RUNTIME_INF semantics */
+ cfs_b->period = ns_to_ktime(default_cfs_period()); /* default period, converted from nanoseconds */
+
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) /* reset the bandwidth-related fields of @cfs_rq */
+{
+ cfs_rq->runtime_enabled = 0; /* the runtime accounting/throttle helpers bail out while this is zero */
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/*
+ * Arm the period timer of @cfs_b with cfs_b->period and set timer_active.
+ *
+ * requires cfs_b->lock, may release to reprogram timer; the lock is held
+ * again on every return path.
+ */
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /*
+ * The timer may be active because we're trying to set a new bandwidth
+ * period or because we're racing with the tear-down path
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed.
+ */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ raw_spin_unlock(&cfs_b->lock);
+ /* ensure cfs_b->lock is available while we wait */
+ hrtimer_cancel(&cfs_b->period_timer);
+
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (cfs_b->timer_active)
+ return;
+ }
+
+ cfs_b->timer_active = 1;
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) /* cancel both timers owned by @cfs_b */
+{
+ hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
+}
+
+void unthrottle_offline_cfs_rqs(struct rq *rq) /* walk @rq's leaf cfs_rqs, refill their runtime and unthrottle them */
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ if (!cfs_rq->runtime_enabled)
+ continue; /* runtime accounting is off for this cfs_rq */
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount; the group's whole quota is
+ * handed to this cfs_rq
+ */
+ cfs_rq->runtime_remaining = cfs_b->quota;
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+ }
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1839,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
{
return 0;
}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
/**************************************************
* CFS operations on tasks:
*/
@@ -1853,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
WARN_ON(task_rq(p) != rq);
- if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+ if (cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -1884,7 +2168,7 @@ static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- if (curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2007,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * Used instead of source_load when we know the type == 0.
+ *
+ * Returns the current load.weight of @cpu's runqueue.
+ */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ *
+ * With @type == 0, or with the LB_BIAS feature off, this is simply the
+ * runqueue's current load.weight. Otherwise it is the smaller of that
+ * value and rq->cpu_load[@type - 1].
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * With @type == 0, or with the LB_BIAS feature off, this is simply the
+ * runqueue's current load.weight. Otherwise it is the larger of that
+ * value and rq->cpu_load[@type - 1].
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long power_of(int cpu) /* accessor for the cpu_power field of @cpu's runqueue */
+{
+ return cpu_rq(cpu)->cpu_power;
+}
+
+static unsigned long cpu_avg_load_per_task(int cpu) /* rq load.weight divided by rq nr_running, or 0 when nothing is running */
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running); /* snapshot, so the zero check and the division see the same value */
+
+ if (nr_running)
+ return rq->load.weight / nr_running;
+
+ return 0; /* avoid dividing by zero */
+}
+
static void task_waking_fair(struct task_struct *p)
{
@@ -2036,36 +2375,100 @@ static void task_waking_fair(struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ * s_i = rw_i / \Sum rw_j (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ * rw_i = { 2, 4, 1, 0 }
+ * s_i = { 2/7, 4/7, 1/7, 0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wg we can compute the new shares distribution (s'_i) using:
+ *
+ * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ * rw'_i = { 3, 4, 1, 0 }
+ * s'_i = { 3/8, 4/8, 1/8, 0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ * dw_i = S * (s'_i - s_i) (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent)
+ if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
- long lw, w;
+ long w, W;
tg = se->my_q->tg;
- w = se->my_q->load.weight;
- /* use this cpu's instantaneous contribution */
- lw = atomic_read(&tg->load_weight);
- lw -= se->my_q->load_contribution;
- lw += w + wg;
+ /*
+ * W = @wg + \Sum rw_j
+ */
+ W = wg + calc_tg_weight(tg, se->my_q);
- wl += w;
+ /*
+ * w = rw_i + @wl
+ */
+ w = se->my_q->load.weight + wl;
- if (lw > 0 && wl < lw)
- wl = (wl * tg->shares) / lw;
+ /*
+ * wl = S * s'_i; see (2)
+ */
+ if (W > 0 && w < W)
+ wl = (w * tg->shares) / W;
else
wl = tg->shares;
- /* zero point is MIN_SHARES */
+ /*
+ * Per the above, wl is the new se->load.weight value; since
+ * those are clipped to [MIN_SHARES, ...) do so now. See
+ * calc_cfs_shares().
+ */
if (wl < MIN_SHARES)
wl = MIN_SHARES;
+
+ /*
+ * wl = dw_i = S * (s'_i - s_i); see (3)
+ */
wl -= se->load.weight;
+
+ /*
+ * Recursively apply this logic to all parent groups to compute
+ * the final effective load change on the root group. Since
+ * only the @tg group gets extra weight, all parent groups can
+ * only redistribute existing shares. @wl is the shift in shares
+ * resulting from this level per the above.
+ */
wg = 0;
}
@@ -2249,6 +2652,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
+ struct sched_group *sg;
int i;
/*
@@ -2269,25 +2673,28 @@ static int select_idle_sibling(struct task_struct *p, int target)
* Otherwise, iterate the domains and find an eligible idle cpu.
*/
rcu_read_lock();
- for_each_domain(target, sd) {
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
- break;
- for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- target = i;
- break;
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ for_each_lower_domain(sd) {
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (!idle_cpu(i))
+ goto next;
}
- }
- /*
- * Lets stop looking for an idle sibling when we reached
- * the domain that spans the current cpu and prev_cpu.
- */
- if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
- cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
- break;
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
}
+done:
rcu_read_unlock();
return target;
@@ -2315,6 +2722,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
+ if (p->rt.nr_cpus_allowed == 1)
+ return prev_cpu;
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -2599,7 +3009,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
} while (cfs_rq);
p = task_of(se);
- hrtick_start_fair(rq, p);
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
return p;
}
@@ -2643,6 +3054,12 @@ static void yield_task_fair(struct rq *rq)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq->skip_clock_update = 1;
}
set_skip_buddy(se);
@@ -2683,12 +3100,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
}
/*
+ * Is this task likely cache-hot:
+ *
+ * Returns 0 for tasks that are not in the fair class or that use the
+ * SCHED_IDLE policy. Returns 1 for buddy candidates (see below).
+ * Otherwise the answer depends on sysctl_sched_migration_cost:
+ *   -1 - every task is treated as hot
+ *    0 - no task is treated as hot
+ *   else, the task is hot if less than that much time has passed between
+ *   p->se.exec_start and @now.
+ *
+ * @sd is not referenced by the body.
+ */
+static int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+ s64 delta;
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
+ /*
+ * Buddy candidates are cache hot: with CACHE_HOT_BUDDY set and
+ * this_rq() non-empty, a task that is the ->next or ->last entity
+ * of its cfs_rq counts as hot.
+ */
+ if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
+ return 1;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = now - p->se.exec_start; /* signed, so a start time later than @now gives a negative delta */
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
+#define LBF_HAD_BREAK 0x04
+#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT 0x10
+
+/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
int tsk_cache_hot = 0;
/*
@@ -2701,7 +3156,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
- *all_pinned = 0;
+ *lb_flags &= ~LBF_ALL_PINNED;
if (task_running(rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2775,7 +3230,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned,
+ enum cpu_idle_type idle, int *lb_flags,
struct cfs_rq *busiest_cfs_rq)
{
int loops = 0, pulled = 0;
@@ -2786,12 +3241,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
goto out;
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
- if (loops++ > sysctl_sched_nr_migrate)
+ if (loops++ > sysctl_sched_nr_migrate) {
+ *lb_flags |= LBF_NEED_BREAK;
break;
+ }
if ((p->se.load.weight >> 1) > rem_load_move ||
!can_migrate_task(p, busiest, this_cpu, sd, idle,
- all_pinned))
+ lb_flags))
continue;
pull_task(busiest, p, this_rq, this_cpu);
@@ -2804,8 +3261,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE)
+ if (idle == CPU_NEWLY_IDLE) {
+ *lb_flags |= LBF_ABORT;
break;
+ }
#endif
/*
@@ -2910,7 +3369,7 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
long rem_load_move = max_load_move;
struct cfs_rq *busiest_cfs_rq;
@@ -2923,6 +3382,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
u64 rem_load, moved_load;
+ if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+ break;
+
/*
* empty group or part of a throttled hierarchy
*/
@@ -2934,7 +3396,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, all_pinned,
+ rem_load, sd, idle, lb_flags,
busiest_cfs_rq);
if (!moved_load)
@@ -2960,10 +3422,10 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
+ max_load_move, sd, idle, lb_flags,
&busiest->cfs);
}
#endif
@@ -2978,29 +3440,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
unsigned long total_load_moved = 0, load_moved;
do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved,
- sd, idle, all_pinned);
+ sd, idle, lb_flags);
total_load_moved += load_moved;
+ if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+ break;
+
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
- break;
-
- if (raw_spin_is_contended(&this_rq->lock) ||
- raw_spin_is_contended(&busiest->lock))
+ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
+ *lb_flags |= LBF_ABORT;
break;
+ }
#endif
} while (load_moved && max_load_move > total_load_moved);
@@ -3062,15 +3525,6 @@ struct sg_lb_stats {
};
/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_cpus(group));
-}
-
-/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -3319,7 +3773,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
sdg->sgp->power = power;
}
-static void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
@@ -3511,7 +3965,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
}
/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
@@ -3585,11 +4039,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
} while (sg != sd->groups);
}
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
-}
-
/**
* check_asym_packing - Check to see if the group is packed into the
* sched domain.
@@ -3953,7 +4402,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
static int need_active_balance(struct sched_domain *sd, int idle,
int busiest_cpu, int this_cpu)
@@ -4004,7 +4453,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, all_pinned = 0, active_balance = 0;
+ int ld_moved, lb_flags = 0, active_balance = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -4045,11 +4494,11 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- all_pinned = 1;
+ lb_flags |= LBF_ALL_PINNED;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle, &all_pinned);
+ imbalance, sd, idle, &lb_flags);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -4059,8 +4508,18 @@ redo:
if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
+
+ if (lb_flags & LBF_NEED_BREAK) {
+ lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
+ goto redo;
+ }
+
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned)) {
+ if (unlikely(lb_flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus))
goto redo;
@@ -4090,7 +4549,7 @@ redo:
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- all_pinned = 1;
+ lb_flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
@@ -4143,7 +4602,8 @@ out_balanced:
out_one_pinned:
/* tune up the balancing interval */
- if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ if (((lb_flags & LBF_ALL_PINNED) &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
@@ -4156,7 +4616,7 @@ out:
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static void idle_balance(int this_cpu, struct rq *this_rq)
+void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
int pulled_task = 0;
@@ -4271,28 +4731,16 @@ out_unlock:
#ifdef CONFIG_NO_HZ
/*
* idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- * entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- * it is idle, just like all other idle CPUs
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, it will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
static struct {
- atomic_t load_balancer;
- atomic_t first_pick_cpu;
- atomic_t second_pick_cpu;
cpumask_var_t idle_cpus_mask;
- cpumask_var_t grp_idle_mask;
+ atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-int get_nohz_load_balancer(void)
-{
- return atomic_read(&nohz.load_balancer);
-}
-
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
* lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4329,33 +4777,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
(sd && (sd->flags & flag)); sd = sd->parent)
/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group: group to be checked for semi-idleness
- *
- * Returns: 1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi idle if it has atleast one idle-CPU
- * and atleast one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
- cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
- sched_group_cpus(ilb_group));
-
- /*
- * A sched_group is semi-idle when it has atleast one busy cpu
- * and atleast one idle cpu.
- */
- if (cpumask_empty(nohz.grp_idle_mask))
- return 0;
-
- if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
- return 0;
-
- return 1;
-}
-/**
* find_new_ilb - Finds the optimum idle load balancer for nomination.
* @cpu: The cpu which is nominating a new idle_load_balancer.
*
@@ -4369,9 +4790,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
*/
static int find_new_ilb(int cpu)
{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
+ struct sched_group *ilbg;
struct sched_domain *sd;
- struct sched_group *ilb_group;
- int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -4389,23 +4810,28 @@ static int find_new_ilb(int cpu)
rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilb_group = sd->groups;
+ ilbg = sd->groups;
do {
- if (is_semi_idle_group(ilb_group)) {
- ilb = cpumask_first(nohz.grp_idle_mask);
+ if (ilbg->group_weight !=
+ atomic_read(&ilbg->sgp->nr_busy_cpus)) {
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ sched_group_cpus(ilbg));
goto unlock;
}
- ilb_group = ilb_group->next;
+ ilbg = ilbg->next;
- } while (ilb_group != sd->groups);
+ } while (ilbg != sd->groups);
}
unlock:
rcu_read_unlock();
out_done:
- return ilb;
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
+
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -4425,102 +4851,98 @@ static void nohz_balancer_kick(int cpu)
nohz.next_balance++;
- ilb_cpu = get_nohz_load_balancer();
+ ilb_cpu = find_new_ilb(cpu);
- if (ilb_cpu >= nr_cpu_ids) {
- ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
- if (ilb_cpu >= nr_cpu_ids)
- return;
- }
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
- if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
- cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+ return;
+ /*
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target cpu which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
+ */
+ smp_send_reschedule(ilb_cpu);
+ return;
+}
- smp_mb();
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target cpu which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
+static inline void clear_nohz_tick_stopped(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
- return;
}
-/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
- * ilb owner CPU in future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
- */
-void select_nohz_load_balancer(int stop_tick)
+static inline void set_cpu_sd_state_busy(void)
{
+ struct sched_domain *sd;
int cpu = smp_processor_id();
- if (stop_tick) {
- if (!cpu_active(cpu)) {
- if (atomic_read(&nohz.load_balancer) != cpu)
- return;
+ if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+ return;
+ clear_bit(NOHZ_IDLE, nohz_flags(cpu));
- /*
- * If we are going offline and still the leader,
- * give up!
- */
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ rcu_read_unlock();
+}
- return;
- }
+void set_cpu_sd_state_idle(void)
+{
+ struct sched_domain *sd;
+ int cpu = smp_processor_id();
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+ return;
+ set_bit(NOHZ_IDLE, nohz_flags(cpu));
- if (atomic_read(&nohz.first_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.second_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ rcu_read_unlock();
+}
- if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
- int new_ilb;
+/*
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
- cpu) != nr_cpu_ids)
- return;
+ /*
+ * If this cpu is going down, then nothing needs to be done.
+ */
+ if (!cpu_active(cpu))
+ return;
- /*
- * Check to see if there is a more power-efficient
- * ilb.
- */
- new_ilb = find_new_ilb(cpu);
- if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, nr_cpu_ids);
- resched_cpu(new_ilb);
- return;
- }
- return;
- }
- } else {
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ if (stop_tick) {
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
- if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
return;
}
+
+static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ clear_nohz_tick_stopped(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -4531,7 +4953,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
-static void update_max_interval(void)
+void update_max_interval(void)
{
max_load_balance_interval = HZ*num_online_cpus()/10;
}
@@ -4623,11 +5045,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
struct rq *rq;
int balance_cpu;
- if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
- return;
+ if (idle != CPU_IDLE ||
+ !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+ goto end;
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu)
+ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
/*
@@ -4635,10 +5058,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
- this_rq->nohz_balance_kick = 0;
+ if (need_resched())
break;
- }
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
@@ -4652,53 +5073,71 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
this_rq->next_balance = rq->next_balance;
}
nohz.next_balance = this_rq->next_balance;
- this_rq->nohz_balance_kick = 0;
+end:
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
/*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is the one of the busy CPUs. It will kick
- * idle load balancer when it has more than one process active. This
- * eliminates the need for idle load balancing altogether when we have
- * only one running process in the system (common case).
- * - If there are more than one busy CPU, idle load balancer may have
- * to run for active_load_balance to happen (i.e., two busy CPUs are
- * SMT or core siblings and can run better if they move to different
- * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
- * which will kick idle load balancer as soon as it has any load.
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu in the system.
+ * - This rq has more than one task.
+ * - At any scheduler domain level, this cpu's scheduler group has multiple
+ * busy cpus exceeding the group's power.
+ * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
+ * domain span are idle.
*/
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
- int ret;
- int first_pick_cpu, second_pick_cpu;
+ struct sched_domain *sd;
- if (time_before(now, nohz.next_balance))
+ if (unlikely(idle_cpu(cpu)))
return 0;
- if (idle_cpu(cpu))
- return 0;
+ /*
+ * We may have recently been in ticked or tickless idle mode. At the
+ * first busy tick after returning from idle, we update the busy stats.
+ */
+ set_cpu_sd_state_busy();
+ clear_nohz_tick_stopped(cpu);
- first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
- second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
- if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
- second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ if (time_before(now, nohz.next_balance))
return 0;
- ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (rq->nr_running > 1)
- return 1;
- } else {
- ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- if (rq->nr_running)
- return 1;
- }
+ if (rq->nr_running >= 2)
+ goto need_kick;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group_power *sgp = sg->sgp;
+ int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+
+ if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
+ goto need_kick_unlock;
+
+ if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+ && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
+ if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+ break;
}
+ rcu_read_unlock();
return 0;
+
+need_kick_unlock:
+ rcu_read_unlock();
+need_kick:
+ return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4733,14 +5172,14 @@ static inline int on_null_domain(int cpu)
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-static inline void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq, int cpu)
{
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
- else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
}
@@ -4755,15 +5194,6 @@ static void rq_offline_fair(struct rq *rq)
update_sysctl();
}
-#else /* CONFIG_SMP */
-
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
-
#endif /* CONFIG_SMP */
/*
@@ -4787,8 +5217,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(current);
- struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se, *curr;
int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
unsigned long flags;
@@ -4797,6 +5227,9 @@ static void task_fork_fair(struct task_struct *p)
update_rq_clock(rq);
+ cfs_rq = task_cfs_rq(current);
+ curr = cfs_rq->curr;
+
if (unlikely(task_cpu(p) != this_cpu)) {
rcu_read_lock();
__set_task_cpu(p, this_cpu);
@@ -4906,6 +5339,16 @@ static void set_curr_task_fair(struct rq *rq)
}
}
+void init_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->tasks_timeline = RB_ROOT;
+ INIT_LIST_HEAD(&cfs_rq->tasks);
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
@@ -4922,13 +5365,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* to another cgroup's rq. This does somewhat interfere with the
* fair sleeper stuff for the first placement, but who cares.
*/
+ /*
+ * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
+ *
+ * - Moving a forked child which is waiting to be woken up by
+ * wake_up_new_task().
+ * - Moving a task which has been woken up by try_to_wake_up() and
+ * is waiting to actually be woken up by sched_ttwu_pending().
+ *
+ * To prevent boost or penalty in the new cfs_rq caused by delta
+ * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
+ */
+ if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ on_rq = 1;
+
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
if (!on_rq)
p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
+
+void free_fair_sched_group(struct task_group *tg)
+{
+ int i;
+
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+}
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ tg->shares = NICE_0_LOAD;
+
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!se)
+ goto err_free_rq;
+
+ init_cfs_rq(cfs_rq);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(cfs_rq);
+err:
+ return 0;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
#endif
+ init_cfs_rq_runtime(cfs_rq);
+
+ tg->cfs_rq[cpu] = cfs_rq;
+ tg->se[cpu] = se;
+
+ /* se could be NULL for root_task_group */
+ if (!se)
+ return;
+
+ if (!parent)
+ se->cfs_rq = &rq->cfs;
+ else
+ se->cfs_rq = parent->my_q;
+
+ se->my_q = cfs_rq;
+ update_load_set(&se->load, 0);
+ se->parent = parent;
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+ unsigned long flags;
+
+ /*
+ * We can't change the weight of the root cgroup.
+ */
+ if (!tg->se[0])
+ return -EINVAL;
+
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+ goto done;
+
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+done:
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+void free_fair_sched_group(struct task_group *tg) { }
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
@@ -4948,7 +5560,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-static const struct sched_class fair_sched_class = {
+const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
@@ -4985,7 +5597,7 @@ static const struct sched_class fair_sched_class = {
};
#ifdef CONFIG_SCHED_DEBUG
-static void print_cfs_stats(struct seq_file *m, int cpu)
+void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq;
@@ -4995,3 +5607,16 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
rcu_read_unlock();
}
#endif
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ cpu_notifier(sched_ilb_notifier, 0);
+#endif
+#endif /* SMP */
+
+}