about | summary | refs | log | tree | commit | diff | stats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- kernel/sched/fair.c 543
1 files changed, 142 insertions, 401 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f0..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se))
- list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
cfs_rq->nr_running++;
}
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
- if (p->rt.nr_cpus_allowed == 1)
+ if (p->nr_cpus_allowed == 1)
return prev_cpu;
if (sd_flag & SD_BALANCE_WAKE) {
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.
*/
- if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+ if (tmp->flags & (SD_PREFER_LOCAL)) {
unsigned long power = 0;
unsigned long nr_running = 0;
unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
- if (tmp->flags & SD_POWERSAVINGS_BALANCE)
- nr_running /= 2;
-
if (nr_running < capacity)
want_sd = 0;
}
@@ -3082,7 +3079,7 @@ struct lb_env {
struct rq *dst_rq;
enum cpu_idle_type idle;
- long load_move;
+ long imbalance;
unsigned int flags;
unsigned int loop;
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env)
static unsigned long task_h_load(struct task_struct *p);
+static const unsigned int sched_nr_migrate_break = 32;
+
/*
- * move_tasks tries to move up to load_move weighted load from busiest to
+ * move_tasks tries to move up to imbalance weighted load from busiest to
* this_rq, as part of a balancing operation within domain "sd".
* Returns 1 if successful and 0 otherwise.
*
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
unsigned long load;
int pulled = 0;
- if (env->load_move <= 0)
+ if (env->imbalance <= 0)
return 0;
while (!list_empty(tasks)) {
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env)
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
- env->loop_break += sysctl_sched_nr_migrate;
+ env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env)
load = task_h_load(p);
- if (load < 16 && !env->sd->nr_balance_failed)
+ if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
- if ((load / 2) > env->load_move)
+ if ((load / 2) > env->imbalance)
goto next;
if (!can_migrate_task(p, env))
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
move_task(p, env);
pulled++;
- env->load_move -= load;
+ env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
* We only want to steal up to the prescribed amount of
* weighted load.
*/
- if (env->load_move <= 0)
+ if (env->imbalance <= 0)
break;
continue;
@@ -3433,14 +3432,6 @@ struct sd_lb_stats {
unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance; /* Is powersave balance needed for this sd */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-#endif
};
/*
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * init_sd_power_savings_stats - Initialize power savings statistics for
- * the given sched_domain, during load balancing.
- *
- * @sd: Sched domain whose power-savings statistics are to be initialized.
- * @sds: Variable containing the statistics for sd.
- * @idle: Idle status of the CPU at which we're performing load-balancing.
- */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- /*
- * Busy processors will not participate in power savings
- * balance.
- */
- if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
- sds->power_savings_balance = 0;
- else {
- sds->power_savings_balance = 1;
- sds->min_nr_running = ULONG_MAX;
- sds->leader_nr_running = 0;
- }
-}
-
-/**
- * update_sd_power_savings_stats - Update the power saving stats for a
- * sched_domain while performing load balancing.
- *
- * @group: sched_group belonging to the sched_domain under consideration.
- * @sds: Variable containing the statistics of the sched_domain
- * @local_group: Does group contain the CPU for which we're performing
- * load balancing ?
- * @sgs: Variable containing the statistics of the group.
- */
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
-
- if (!sds->power_savings_balance)
- return;
-
- /*
- * If the local group is idle or completely loaded
- * no need to do power savings balance at this domain
- */
- if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
- !sds->this_nr_running))
- sds->power_savings_balance = 0;
-
- /*
- * If a group is already running at full capacity or idle,
- * don't include that group in power savings calculations
- */
- if (!sds->power_savings_balance ||
- sgs->sum_nr_running >= sgs->group_capacity ||
- !sgs->sum_nr_running)
- return;
-
- /*
- * Calculate the group which has the least non-idle load.
- * This is the group from where we need to pick up the load
- * for saving power
- */
- if ((sgs->sum_nr_running < sds->min_nr_running) ||
- (sgs->sum_nr_running == sds->min_nr_running &&
- group_first_cpu(group) > group_first_cpu(sds->group_min))) {
- sds->group_min = group;
- sds->min_nr_running = sgs->sum_nr_running;
- sds->min_load_per_task = sgs->sum_weighted_load /
- sgs->sum_nr_running;
- }
-
- /*
- * Calculate the group which is almost near its
- * capacity but still has some space to pick up some load
- * from other group and save more power
- */
- if (sgs->sum_nr_running + 1 > sgs->group_capacity)
- return;
-
- if (sgs->sum_nr_running > sds->leader_nr_running ||
- (sgs->sum_nr_running == sds->leader_nr_running &&
- group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
- sds->group_leader = group;
- sds->leader_nr_running = sgs->sum_nr_running;
- }
-}
-
-/**
- * check_power_save_busiest_group - see if there is potential for some power-savings balance
- * @sds: Variable containing the statistics of the sched_domain
- * under consideration.
- * @this_cpu: Cpu at which we're currently performing load-balancing.
- * @imbalance: Variable to store the imbalance.
- *
- * Description:
- * Check if we have potential to perform some power-savings balance.
- * If yes, set the busiest group to be the least loaded group in the
- * sched_domain, so that it's CPUs can be put to idle.
- *
- * Returns 1 if there is potential to perform power-savings balance.
- * Else returns 0.
- */
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- if (!sds->power_savings_balance)
- return 0;
-
- if (sds->this != sds->group_leader ||
- sds->group_leader == sds->group_min)
- return 0;
-
- *imbalance = sds->min_load_per_task;
- sds->busiest = sds->group_min;
-
- return 1;
-
-}
-#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-static inline void init_sd_power_savings_stats(struct sched_domain *sd,
- struct sd_lb_stats *sds, enum cpu_idle_type idle)
-{
- return;
-}
-
-static inline void update_sd_power_savings_stats(struct sched_group *group,
- struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-{
- return;
-}
-
-static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
-{
- return 0;
-}
-#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-
-
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_POWER_SCALE;
@@ -3654,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, available;
+ u64 total, available, age_stamp, avg;
+
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = ACCESS_ONCE(rq->age_stamp);
+ avg = ACCESS_ONCE(rq->rt_avg);
- total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ total = sched_avg_period() + (rq->clock - age_stamp);
- if (unlikely(total < rq->rt_avg)) {
+ if (unlikely(total < avg)) {
/* Ensures that power won't end up being negative */
available = 0;
} else {
- available = total - rq->rt_avg;
+ available = total - avg;
}
if (unlikely((s64)total < SCHED_POWER_SCALE))
@@ -3725,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
power = 0;
- group = child->groups;
- do {
- power += group->sgp->power;
- group = group->next;
- } while (group != child->groups);
+ if (child->flags & SD_OVERLAP) {
+ /*
+ * SD_OVERLAP domains cannot assume that child groups
+ * span the current group.
+ */
- sdg->sgp->power = power;
+ for_each_cpu(cpu, sched_group_cpus(sdg))
+ power += power_of(cpu);
+ } else {
+ /*
+ * !SD_OVERLAP domains can assume that child groups
+ * span the current group.
+ */
+
+ group = child->groups;
+ do {
+ power += group->sgp->power;
+ group = group->next;
+ } while (group != child->groups);
+ }
+
+ sdg->sgp->power_orig = sdg->sgp->power = power;
}
/*
@@ -3761,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
- * @sd: The sched_domain whose statistics are to be updated.
+ * @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
-static inline void update_sg_lb_stats(struct sched_domain *sd,
- struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx,
+static inline void update_sg_lb_stats(struct lb_env *env,
+ struct sched_group *group, int load_idx,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
- int i;
+ unsigned long nr_running, max_nr_running, min_nr_running;
+ unsigned long load, max_cpu_load, min_cpu_load;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
+ int i;
if (local_group)
- balance_cpu = group_first_cpu(group);
+ balance_cpu = group_balance_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
max_nr_running = 0;
+ min_nr_running = ~0UL;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
+ nr_running = rq->nr_running;
+
/* Bias balancing toward cpus of our domain */
if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
+ if (idle_cpu(i) && !first_idle_cpu &&
+ cpumask_test_cpu(i, sched_group_mask(group))) {
first_idle_cpu = 1;
balance_cpu = i;
}
@@ -3803,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load) {
+ if (load > max_cpu_load)
max_cpu_load = load;
- max_nr_running = rq->nr_running;
- }
if (min_cpu_load > load)
min_cpu_load = load;
+
+ if (nr_running > max_nr_running)
+ max_nr_running = nr_running;
+ if (min_nr_running > nr_running)
+ min_nr_running = nr_running;
}
sgs->group_load += load;
- sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_nr_running += nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
@@ -3825,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
* to do the newly idle load balance.
*/
if (local_group) {
- if (idle != CPU_NEWLY_IDLE) {
- if (balance_cpu != this_cpu) {
+ if (env->idle != CPU_NEWLY_IDLE) {
+ if (balance_cpu != env->dst_cpu) {
*balance = 0;
return;
}
- update_group_power(sd, this_cpu);
+ update_group_power(env->sd, env->dst_cpu);
} else if (time_after_eq(jiffies, group->sgp->next_update))
- update_group_power(sd, this_cpu);
+ update_group_power(env->sd, env->dst_cpu);
}
/* Adjust by relative CPU power of the group */
@@ -3850,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
+ if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
+ (max_nr_running - min_nr_running) > 1)
sgs->group_imb = 1;
sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
SCHED_POWER_SCALE);
if (!sgs->group_capacity)
- sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_capacity = fix_small_capacity(env->sd, group);
sgs->group_weight = group->group_weight;
if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3865,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/**
* update_sd_pick_busiest - return 1 on busiest group
- * @sd: sched_domain whose statistics are to be checked
+ * @env: The load balancing environment.
* @sds: sched_domain statistics
* @sg: sched_group candidate to be checked for being the busiest
* @sgs: sched_group statistics
- * @this_cpu: the current cpu
*
* Determine if @sg is a busier group than the previously selected
* busiest group.
*/
-static bool update_sd_pick_busiest(struct sched_domain *sd,
+static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
- struct sg_lb_stats *sgs,
- int this_cpu)
+ struct sg_lb_stats *sgs)
{
if (sgs->avg_load <= sds->max_load)
return false;
@@ -3894,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
- if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
- this_cpu < group_first_cpu(sg)) {
+ if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
+ env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
@@ -3908,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
- * @sd: sched_domain whose statistics are to be updated.
- * @this_cpu: Cpu for which load balance is currently performed.
- * @idle: Idle status of this_cpu
+ * @env: The load balancing environment.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env,
+ const struct cpumask *cpus,
+ int *balance, struct sd_lb_stats *sds)
{
- struct sched_domain *child = sd->child;
- struct sched_group *sg = sd->groups;
+ struct sched_domain *child = env->sd->child;
+ struct sched_group *sg = env->sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
- init_sd_power_savings_stats(sd, sds, idle);
- load_idx = get_sd_load_idx(sd, idle);
+ load_idx = get_sd_load_idx(env->sd, env->idle);
do {
int local_group;
- local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
- local_group, cpus, balance, &sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group,
+ cpus, balance, &sgs);
if (local_group && !(*balance))
return;
@@ -3964,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
sds->this_idle_cpus = sgs.idle_cpus;
- } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
+ } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
sds->max_load = sgs.avg_load;
sds->busiest = sg;
sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3976,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->group_imb = sgs.group_imb;
}
- update_sd_power_savings_stats(sg, sds, local_group, &sgs);
sg = sg->next;
- } while (sg != sd->groups);
+ } while (sg != env->sd->groups);
}
/**
@@ -4001,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
* Returns 1 when packing is required and a task should be moved to
* this CPU. The amount of the imbalance is returned in *imbalance.
*
- * @sd: The sched_domain whose packing is to be checked.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: returns amount of imbalanced due to packing.
*/
-static int check_asym_packing(struct sched_domain *sd,
- struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
{
int busiest_cpu;
- if (!(sd->flags & SD_ASYM_PACKING))
+ if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
if (!sds->busiest)
return 0;
busiest_cpu = group_first_cpu(sds->busiest);
- if (this_cpu > busiest_cpu)
+ if (env->dst_cpu > busiest_cpu)
return 0;
- *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power,
- SCHED_POWER_SCALE);
+ env->imbalance = DIV_ROUND_CLOSEST(
+ sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+
return 1;
}
@@ -4031,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd,
* fix_small_imbalance - Calculate the minor imbalance that exists
* amongst the groups of a sched_domain, during
* load balancing.
+ * @env: The load balancing environment.
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
- * @imbalance: Variable to store the imbalance.
*/
-static inline void fix_small_imbalance(struct sd_lb_stats *sds,
- int this_cpu, unsigned long *imbalance)
+static inline
+void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long tmp, pwr_now = 0, pwr_move = 0;
unsigned int imbn = 2;
@@ -4047,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
if (sds->busiest_load_per_task >
sds->this_load_per_task)
imbn = 1;
- } else
+ } else {
sds->this_load_per_task =
- cpu_avg_load_per_task(this_cpu);
+ cpu_avg_load_per_task(env->dst_cpu);
+ }
scaled_busy_load_per_task = sds->busiest_load_per_task
* SCHED_POWER_SCALE;
@@ -4057,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
(scaled_busy_load_per_task * imbn)) {
- *imbalance = sds->busiest_load_per_task;
+ env->imbalance = sds->busiest_load_per_task;
return;
}
@@ -4094,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
/* Move if we gain throughput */
if (pwr_move > pwr_now)
- *imbalance = sds->busiest_load_per_task;
+ env->imbalance = sds->busiest_load_per_task;
}
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
+ * @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
- * @this_cpu: Cpu for which currently load balance is being performed.
- * @imbalance: The variable to store the imbalance.
*/
-static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
- unsigned long *imbalance)
+static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
@@ -4121,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* its cpu_power, while calculating max_load..)
*/
if (sds->max_load < sds->avg_load) {
- *imbalance = 0;
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ env->imbalance = 0;
+ return fix_small_imbalance(env, sds);
}
if (!sds->group_imb) {
@@ -4150,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->sgp->power,
+ env->imbalance = min(max_pull * sds->busiest->sgp->power,
(sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
@@ -4160,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < sds->busiest_load_per_task)
- return fix_small_imbalance(sds, this_cpu, imbalance);
+ if (env->imbalance < sds->busiest_load_per_task)
+ return fix_small_imbalance(env, sds);
}
@@ -4177,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* Also calculates the amount of weighted load which should be moved
* to restore balance.
*
- * @sd: The sched_domain whose busiest group is to be returned.
- * @this_cpu: The cpu for which load balancing is currently being performed.
- * @imbalance: Variable which stores amount of weighted load which should
- * be moved to restore balance/put a group to idle.
- * @idle: The idle status of this_cpu.
+ * @env: The load balancing environment.
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
@@ -4192,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
@@ -4204,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+ update_sd_lb_stats(env, cpus, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
@@ -4213,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!(*balance))
goto ret;
- if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(sd, &sds, this_cpu, imbalance))
+ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
+ check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
@@ -4232,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
@@ -4250,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (idle == CPU_IDLE) {
+ if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
@@ -4265,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
- calculate_imbalance(&sds, this_cpu, imbalance);
+ calculate_imbalance(env, &sds);
return sds.busiest;
out_balanced:
- /*
- * There is no obvious imbalance. But check if we can do some balancing
- * to save power.
- */
- if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
- return sds.busiest;
ret:
- *imbalance = 0;
+ env->imbalance = 0;
return NULL;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static struct rq *
-find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
- enum cpu_idle_type idle, unsigned long imbalance,
- const struct cpumask *cpus)
+static struct rq *find_busiest_queue(struct lb_env *env,
+ struct sched_group *group,
+ const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -4305,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
unsigned long wl;
if (!capacity)
- capacity = fix_small_capacity(sd, group);
+ capacity = fix_small_capacity(env->sd, group);
if (!cpumask_test_cpu(i, cpus))
continue;
@@ -4317,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu power.
*/
- if (capacity && rq->nr_running == 1 && wl > imbalance)
+ if (capacity && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
@@ -4346,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
/* Working cpumask for load_balance and load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int idle,
- int busiest_cpu, int this_cpu)
+static int need_active_balance(struct lb_env *env)
{
- if (idle == CPU_NEWLY_IDLE) {
+ struct sched_domain *sd = env->sd;
+
+ if (env->idle == CPU_NEWLY_IDLE) {
/*
* ASYM_PACKING needs to force migrate tasks from busy but
* higher numbered CPUs in order to pack all tasks in the
* lowest numbered CPUs.
*/
- if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
+ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
return 1;
-
- /*
- * The only task running in a non-idle cpu can be moved to this
- * cpu in an attempt to completely freeup the other CPU
- * package.
- *
- * The package power saving logic comes from
- * find_busiest_group(). If there are no imbalance, then
- * f_b_g() will return NULL. However when sched_mc={1,2} then
- * f_b_g() will select a group from which a running task may be
- * pulled to this cpu in order to make the other package idle.
- * If there is no opportunity to make a package idle and if
- * there are no imbalance, then f_b_g() will return NULL and no
- * action will be taken in load_balance_newidle().
- *
- * Under normal task pull operation due to imbalance, there
- * will be more than one task in the source run queue and
- * move_tasks() will succeed. ld_moved will be true and this
- * active balance code will not be triggered.
- */
- if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
- return 0;
}
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4397,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
{
int ld_moved, active_balance = 0;
struct sched_group *group;
- unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4407,7 +4238,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.idle = idle,
- .loop_break = sysctl_sched_nr_migrate,
+ .loop_break = sched_nr_migrate_break,
};
cpumask_copy(cpus, cpu_active_mask);
@@ -4415,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle,
- cpus, balance);
+ group = find_busiest_group(&env, cpus, balance);
if (*balance == 0)
goto out_balanced;
@@ -4426,7 +4256,7 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
+ busiest = find_busiest_queue(&env, group, cpus);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
@@ -4434,7 +4264,7 @@ redo:
BUG_ON(busiest == this_rq);
- schedstat_add(sd, lb_imbalance[idle], imbalance);
+ schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
@@ -4445,10 +4275,9 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.load_move = imbalance;
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
- env.loop_max = busiest->nr_running;
+ env.src_cpu = busiest->cpu;
+ env.src_rq = busiest;
+ env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
local_irq_save(flags);
@@ -4490,7 +4319,7 @@ more_balance:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
+ if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -4517,10 +4346,11 @@ more_balance:
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
- if (active_balance)
+ if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
+ }
/*
* We've kicked active balancing, reset the failure
@@ -4701,104 +4531,15 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-/**
- * lowest_flag_domain - Return lowest sched_domain containing flag.
- * @cpu: The cpu whose lowest level of sched domain is to
- * be returned.
- * @flag: The flag to check for the lowest sched_domain
- * for the given cpu.
- *
- * Returns the lowest sched_domain of a cpu which contains the given flag.
- */
-static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
-{
- struct sched_domain *sd;
-
- for_each_domain(cpu, sd)
- if (sd->flags & flag)
- break;
-
- return sd;
-}
-
-/**
- * for_each_flag_domain - Iterates over sched_domains containing the flag.
- * @cpu: The cpu whose domains we're iterating over.
- * @sd: variable holding the value of the power_savings_sd
- * for cpu.
- * @flag: The flag to filter the sched_domains to be iterated.
- *
- * Iterates over all the scheduler domains for a given cpu that has the 'flag'
- * set, starting from the lowest sched_domain to the highest.
- */
-#define for_each_flag_domain(cpu, sd, flag) \
- for (sd = lowest_flag_domain(cpu, flag); \
- (sd && (sd->flags & flag)); sd = sd->parent)
-
-/**
- * find_new_ilb - Finds the optimum idle load balancer for nomination.
- * @cpu: The cpu which is nominating a new idle_load_balancer.
- *
- * Returns: Returns the id of the idle load balancer if it exists,
- * Else, returns >= nr_cpu_ids.
- *
- * This algorithm picks the idle load balancer such that it belongs to a
- * semi-idle powersavings sched_domain. The idea is to try and avoid
- * completely idle packages/cores just for the purpose of idle load balancing
- * when there are other idle cpu's which are better suited for that job.
- */
-static int find_new_ilb(int cpu)
+static inline int find_new_ilb(int call_cpu)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
- struct sched_group *ilbg;
- struct sched_domain *sd;
-
- /*
- * Have idle load balancer selection from semi-idle packages only
- * when power-aware load balancing is enabled
- */
- if (!(sched_smt_power_savings || sched_mc_power_savings))
- goto out_done;
-
- /*
- * Optimize for the case when we have no idle CPUs or only one
- * idle CPU. Don't walk the sched_domain hierarchy in such cases
- */
- if (cpumask_weight(nohz.idle_cpus_mask) < 2)
- goto out_done;
- rcu_read_lock();
- for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilbg = sd->groups;
-
- do {
- if (ilbg->group_weight !=
- atomic_read(&ilbg->sgp->nr_busy_cpus)) {
- ilb = cpumask_first_and(nohz.idle_cpus_mask,
- sched_group_cpus(ilbg));
- goto unlock;
- }
-
- ilbg = ilbg->next;
-
- } while (ilbg != sd->groups);
- }
-unlock:
- rcu_read_unlock();
-
-out_done:
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
return nr_cpu_ids;
}
-#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
-static inline int find_new_ilb(int call_cpu)
-{
- return nr_cpu_ids;
-}
-#endif
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5021,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
- update_cpu_load(this_rq);
+ update_idle_cpu_load(this_rq);
raw_spin_unlock_irq(&this_rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);