diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 315 |
1 files changed, 251 insertions, 64 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f808ddf2a868..ee271bb661cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1392,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int last_cpupid, this_cpupid; this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + + /* + * Allow first faults or private faults to migrate immediately early in + * the lifetime of a task. The magic number 4 is based on waiting for + * two full passes of the "multi-stage node selection" test that is + * executed below. + */ + if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) + return true; /* * Multi-stage node selection is used in conjunction with a periodic @@ -1410,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * This quadric squishes small probabilities, making it less likely we * act on an unlikely task<->page relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != dst_nid) return false; @@ -1446,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1454,8 +1464,6 @@ struct numa_stats { /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; - - unsigned int nr_running; }; /* @@ -1463,36 +1471,16 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int smt, cpu, cpus = 0; - unsigned long capacity; + int cpu; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(rq); ns->compute_capacity += capacity_of(cpu); - - cpus++; } - /* - * If we raced with hotplug and there are no CPUs left in our mask - * the @ns structure is NULL'ed and task_numa_compare() will - * not find this node attractive. - * - * We'll detect a huge imbalance and bail there. - */ - if (!cpus) - return; - - /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); - capacity = cpus / smt; /* cores */ - - capacity = min_t(unsigned, capacity, - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); } struct task_numa_env { @@ -1514,6 +1502,21 @@ struct task_numa_env { static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { + struct rq *rq = cpu_rq(env->dst_cpu); + + /* Bail out if run-queue part of active NUMA balance. */ + if (xchg(&rq->numa_migrate_on, 1)) + return; + + /* + * Clear previous best_cpu/rq numa-migrate flag, since task now + * found a better CPU to move/swap. + */ + if (env->best_cpu != -1) { + rq = cpu_rq(env->best_cpu); + WRITE_ONCE(rq->numa_migrate_on, 0); + } + if (env->best_task) put_task_struct(env->best_task); if (p) @@ -1553,6 +1556,13 @@ static bool load_too_imbalanced(long src_load, long dst_load, } /* + * Maximum NUMA importance can be 1998 (2*999); + * SMALLIMP @ 30 would be close to 1998/64. + * Used to deter task migration. + */ +#define SMALLIMP 30 + +/* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should @@ -1569,6 +1579,9 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; int dist = env->dist; + if (READ_ONCE(dst_rq->numa_migrate_on)) + return; + rcu_read_lock(); cur = task_rcu_dereference(&dst_rq->curr); if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) @@ -1582,7 +1595,7 @@ static void task_numa_compare(struct task_numa_env *env, goto unlock; if (!cur) { - if (maymove || imp > env->best_imp) + if (maymove && moveimp >= env->best_imp) goto assign; else goto unlock; @@ -1625,16 +1638,22 @@ static void task_numa_compare(struct task_numa_env *env, task_weight(cur, env->dst_nid, dist); } - if (imp <= env->best_imp) - goto unlock; - if (maymove && moveimp > imp && moveimp > env->best_imp) { - imp = moveimp - 1; + imp = moveimp; cur = NULL; goto assign; } /* + * If the NUMA importance is less than SMALLIMP, + * task migration might only result in ping pong + * of tasks and also hurt performance due to cache + * misses. + */ + if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2) + goto unlock; + + /* * In the overloaded case, try and keep the load balanced. */ load = task_h_load(env->p) - task_h_load(cur); @@ -1710,6 +1729,7 @@ static int task_numa_migrate(struct task_struct *p) .best_cpu = -1, }; struct sched_domain *sd; + struct rq *best_rq; unsigned long taskweight, groupweight; int nid, ret, dist; long taskimp, groupimp; @@ -1805,20 +1825,17 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - /* - * Reset the scan period if the task is being rescheduled on an - * alternative node to recheck if the tasks is now properly placed. - */ - p->numa_scan_period = task_scan_start(p); - + best_rq = cpu_rq(env.best_cpu); if (env.best_task == NULL) { ret = migrate_task_to(p, env.best_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu); + WRITE_ONCE(best_rq->numa_migrate_on, 0); if (ret != 0) trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); @@ -2596,6 +2613,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) } } +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. + */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { @@ -2609,6 +2659,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + #endif /* CONFIG_NUMA_BALANCING */ static void @@ -3647,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3676,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -3925,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * put back on, and if we advance min_vruntime, we'll be placed back * further than we started -- ie. we'll be penalized. */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); } @@ -4400,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) /* * Add to the _head_ of the list, so that an already-started - * distribute_cfs_runtime will not see us + * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is + * not running add to the tail so that later runqueues don't get starved. */ - list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + if (cfs_b->distribute_running) + list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + else + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); /* * If we're the first throttled task, make sure the bandwidth @@ -4546,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) * in us over-using our runtime if it is all used during this loop, but * only by limited amounts in that extreme case. */ - while (throttled && cfs_b->runtime > 0) { + while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; + cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); raw_spin_lock(&cfs_b->lock); + cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); cfs_b->runtime -= min(runtime, cfs_b->runtime); @@ -4664,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); + if (cfs_b->distribute_running) { + raw_spin_unlock(&cfs_b->lock); + return; + } + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { raw_spin_unlock(&cfs_b->lock); return; @@ -4673,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) runtime = cfs_b->runtime; expires = cfs_b->runtime_expires; + if (runtime) + cfs_b->distribute_running = 1; + raw_spin_unlock(&cfs_b->lock); if (!runtime) @@ -4683,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) raw_spin_lock(&cfs_b->lock); if (expires == cfs_b->runtime_expires) cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4791,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; + cfs_b->distribute_running = 0; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -6188,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; @@ -6198,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6275,7 +6372,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ -static void migrate_task_rq_fair(struct task_struct *p) +static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { /* * As blocked tasks retain absolute vruntime the migration needs to @@ -6328,6 +6425,8 @@ static void migrate_task_rq_fair(struct task_struct *p) /* We have migrated, no longer consider this task hot */ p->se.exec_start = 0; + + update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -6615,9 +6714,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6823,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6853,6 +6962,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7233,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq) if (READ_ONCE(rq->avg_dl.util_avg)) return true; -#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ if (READ_ONCE(rq->avg_irq.util_avg)) return true; #endif @@ -7396,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7417,6 +7521,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7525,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7545,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7575,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7588,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7689,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7709,6 +7830,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7741,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7783,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ @@ -7818,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7837,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) + return false; + + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: @@ -7908,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) @@ -7986,8 +8132,8 @@ next_group: if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -8137,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8172,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8238,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8275,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8322,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. + * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* @@ -8391,6 +8573,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9033,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } @@ -9402,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); @@ -9564,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* |