diff options
Diffstat (limited to '')
-rw-r--r-- | kernel/sched/fair.c | 402 |
1 files changed, 283 insertions, 119 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 914096c5b1ae..e4a0b8bd941c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,7 @@ #include <linux/cpuidle.h> #include <linux/interrupt.h> +#include <linux/memory-tiers.h> #include <linux/mempolicy.h> #include <linux/mutex_api.h> #include <linux/profile.h> @@ -799,8 +800,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -835,20 +834,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -864,7 +849,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -1094,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1436,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + wmark_pages(zone, WMARK_PROMO) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct page *page) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1443,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(page); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; + /* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for @@ -1592,11 +1744,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -2600,7 +2752,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2685,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2765,6 +2926,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2821,13 +2983,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3838,8 +4003,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {} * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * @@ -4003,6 +4167,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4032,6 +4197,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); @@ -4064,8 +4236,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -4262,7 +4434,7 @@ static inline int task_fits_capacity(struct task_struct *p, static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return; if (!p || p->nr_cpus_allowed == 1) { @@ -4292,6 +4464,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4434,7 +4607,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -4511,6 +4685,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int action = UPDATE_TG; + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + /* * Update run-time statistics of the 'current'. */ @@ -4519,12 +4698,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); update_stats_dequeue_fair(cfs_rq, se, flags); @@ -5893,8 +6073,8 @@ dequeue_throttle: #ifdef CONFIG_SMP /* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -6260,7 +6440,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -6268,7 +6448,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6284,7 +6464,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6310,9 +6490,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu bool idle = true; int cpu; - if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core, p); - for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { idle = false; @@ -6339,13 +6516,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6360,9 +6536,9 @@ static inline void set_idle_cores(int cpu, int val) { } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { - return def; + return false; } static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) @@ -6370,7 +6546,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6389,19 +6565,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); - struct sched_domain *this_sd; + struct sched_domain *this_sd = NULL; u64 time = 0; - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + /* * If we're busy, the assumption that the last idle period * predicts the future is flawed; age away the remaining @@ -6455,7 +6631,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && !has_idle_core) { + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { time = cpu_clock(this) - time; /* @@ -6506,7 +6682,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) static inline bool asym_fits_capacity(unsigned long task_util, int cpu) { - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) return fits_capacity(task_util, capacity_of(cpu)); return true; @@ -6526,7 +6702,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * On asymmetric system, update task utilization because we will check * that the task fits with cpu's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); task_util = uclamp_task_util(p); } @@ -6580,7 +6756,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -6601,10 +6777,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; if (sched_smt_active()) { - has_idle_core = test_idle_cores(target, false); + has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); + i = select_idle_smt(p, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } @@ -7076,8 +7252,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -7099,15 +7273,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(se); - - } else { + if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); /* @@ -7279,7 +7445,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - BUG_ON(!pse); + WARN_ON_ONCE(!pse); cse_is_idle = se_is_idle(se); pse_is_idle = se_is_idle(pse); @@ -7938,7 +8104,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -8012,8 +8178,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -8049,20 +8213,24 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) + /* + * We've more or less seen every task there is, call it quits + * unless we haven't found any movable task yet. + */ + if (env->loop > env->loop_max && + !(env->flags & LBF_ALL_PINNED)) break; /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -8159,7 +8327,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); check_preempt_curr(rq, p, 0); } @@ -10099,14 +10267,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_span(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -10134,7 +10301,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); schedstat_add(sd->lb_imbalance[idle], env.imbalance); @@ -10182,7 +10349,9 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - goto more_balance; + /* Stop if we tried all running tasks */ + if (env.loop < busiest->nr_running) + goto more_balance; } /* @@ -10213,7 +10382,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -10245,7 +10414,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -10430,7 +10599,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -10916,8 +11085,7 @@ static bool update_nohz_stats(struct rq *rq) * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ -static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -11032,7 +11200,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } @@ -11052,7 +11220,7 @@ void nohz_run_idle_balance(int cpu) * (ie NOHZ_STATS_KICK set) and will do the same. */ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) - _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); } static void nohz_newidle_balance(struct rq *this_rq) @@ -11552,6 +11720,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); @@ -11563,14 +11742,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -11666,39 +11837,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; @@ -12075,6 +12232,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP + int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + } + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON |