diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 81 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 16 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 4 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 15 | ||||
-rw-r--r-- | kernel/sched/debug.c | 52 | ||||
-rw-r--r-- | kernel/sched/fair.c | 88 | ||||
-rw-r--r-- | kernel/sched/idle.c | 1 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 25 | ||||
-rw-r--r-- | kernel/sched/pelt.h | 4 | ||||
-rw-r--r-- | kernel/sched/psi.c | 66 | ||||
-rw-r--r-- | kernel/sched/rt.c | 40 | ||||
-rw-r--r-- | kernel/sched/sched.h | 62 | ||||
-rw-r--r-- | kernel/sched/topology.c | 99 | ||||
-rw-r--r-- | kernel/sched/wait.c | 16 |
14 files changed, 339 insertions, 230 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79ce22de4409..5befdecefe94 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,7 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -254,8 +254,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -280,7 +281,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -288,9 +288,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) { __hrtick_restart(rq); @@ -822,7 +820,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) @@ -896,9 +894,10 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -909,9 +908,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -1109,9 +1110,27 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) uclamp_rq_dec_id(rq, p, clamp_id); } +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p, + enum uclamp_id clamp_id) +{ + if (!p->uclamp[clamp_id].active) + return; + + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + + /* + * Make sure to clear the idle flag if we've transiently reached 0 + * active tasks on rq. + */ + if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +uclamp_update_active(struct task_struct *p) { + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1131,30 +1150,22 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); - } + for_each_clamp_id(clamp_id) + uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); } #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } @@ -1326,7 +1337,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) @@ -2471,6 +2482,9 @@ out: bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } #endif /* CONFIG_SMP */ @@ -5679,12 +5693,8 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ preempt_disable(); - rq_unlock(rq, &rf); + rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -7192,7 +7202,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + mutex_lock(&uclamp_mutex); + rcu_read_lock(); cpu_util_update_eff(css); + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); #endif return 0; @@ -7282,6 +7296,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; @@ -7314,7 +7331,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4cb80e6042c4..831fee509404 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -624,9 +624,17 @@ static struct attribute *sugov_attrs[] = { }; ATTRIBUTE_GROUPS(sugov); +static void sugov_tunables_free(struct kobject *kobj) +{ + struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); + + kfree(to_sugov_tunables(attr_set)); +} + static struct kobj_type sugov_tunables_ktype = { .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, + .release = &sugov_tunables_free, }; /********************** cpufreq governor interface *********************/ @@ -726,12 +734,10 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_free(struct sugov_tunables *tunables) +static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; - - kfree(tunables); } static int sugov_init(struct cpufreq_policy *policy) @@ -794,7 +800,7 @@ out: fail: kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); stop_kthread: sugov_kthread_stop(sg_policy); @@ -821,7 +827,7 @@ static void sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; if (!count) - sugov_tunables_free(tunables); + sugov_clear_global_tunables(); mutex_unlock(&global_tunables_lock); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 46ed4e1383e2..66188567778d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -147,10 +147,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4cb00538a207..2bda9fdba31c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1654,6 +1654,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { + update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* @@ -2392,6 +2393,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); + } else { + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); } } @@ -2469,7 +2472,7 @@ int sched_dl_global_validate(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* @@ -2484,9 +2487,10 @@ int sched_dl_global_validate(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -2619,7 +2623,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_runtime = attr->sched_runtime; dl_se->dl_deadline = attr->sched_deadline; dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; - dl_se->flags = attr->sched_flags; + dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } @@ -2632,7 +2636,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) attr->sched_runtime = dl_se->dl_runtime; attr->sched_deadline = dl_se->dl_deadline; attr->sched_period = dl_se->dl_period; - attr->sched_flags = dl_se->flags; + attr->sched_flags &= ~SCHED_DL_FLAGS; + attr->sched_flags |= dl_se->flags; } /* @@ -2707,7 +2712,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) if (dl_se->dl_runtime != attr->sched_runtime || dl_se->dl_deadline != attr->sched_deadline || dl_se->dl_period != attr->sched_period || - dl_se->flags != attr->sched_flags) + dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS)) return true; return false; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c4b702fe1d73..d5f7fc7099bc 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,8 +8,6 @@ */ #include "sched.h" -static DEFINE_SPINLOCK(sched_debug_lock); - /* * This allows printing both to /proc/sched_debug and * to the console @@ -417,16 +415,37 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group #endif #ifdef CONFIG_CGROUP_SCHED +static DEFINE_SPINLOCK(sched_debug_lock); static char group_path[PATH_MAX]; -static char *task_group_path(struct task_group *tg) +static void task_group_path(struct task_group *tg, char *path, int plen) { - if (autogroup_path(tg, group_path, PATH_MAX)) - return group_path; + if (autogroup_path(tg, path, plen)) + return; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, path, plen); +} - return group_path; +/* + * Only 1 SEQ_printf_task_group_path() caller can use the full length + * group_path[] for cgroup path. Other simultaneous callers will have + * to use a shorter stack buffer. A "..." suffix is appended at the end + * of the stack buffer so that it will show up in case the output length + * matches the given buffer size to indicate possible path name truncation. + */ +#define SEQ_printf_task_group_path(m, tg, fmt...) \ +{ \ + if (spin_trylock(&sched_debug_lock)) { \ + task_group_path(tg, group_path, sizeof(group_path)); \ + SEQ_printf(m, fmt, group_path); \ + spin_unlock(&sched_debug_lock); \ + } else { \ + char buf[128]; \ + char *bufend = buf + sizeof(buf) - 3; \ + task_group_path(tg, buf, bufend - buf); \ + strcpy(bufend - 1, "..."); \ + SEQ_printf(m, fmt, buf); \ + } \ } #endif @@ -453,7 +472,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf(m, " %s", task_group_path(task_group(p))); + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -490,7 +509,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); @@ -562,7 +581,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED SEQ_printf(m, "\n"); - SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu); #else SEQ_printf(m, "\n"); SEQ_printf(m, "rt_rq[%d]:\n", cpu); @@ -614,7 +633,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; #ifdef CONFIG_X86 { @@ -666,13 +684,11 @@ do { \ } #undef P - spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); print_dl_stats(m, cpu); print_rq(m, rq, cpu); - spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } @@ -831,25 +847,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, static void sched_show_numa(struct task_struct *p, struct seq_file *m) { #ifdef CONFIG_NUMA_BALANCING - struct mempolicy *pol; - if (p->mm) P(mm->numa_scan_seq); - task_lock(p); - pol = p->mempolicy; - if (pol && !(pol->flags & MPOL_F_MORON)) - pol = NULL; - mpol_get(pol); - task_unlock(p); - P(numa_pages_migrated); P(numa_preferred_nid); P(total_numa_faults); SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); - mpol_put(pol); #endif } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b02a83ff4068..d2a68ae7596e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2927,7 +2927,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) - * \Sum grq->load.weight + * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average @@ -2941,7 +2941,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) - * tg->load_avg + * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * @@ -2957,7 +2957,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) - * grp->load.weight + * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * @@ -2976,7 +2976,7 @@ void reweight_task(struct task_struct *p, int prio) * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) - * tg_load_avg' + * tg_load_avg' * * Where: * @@ -3814,7 +3814,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } @@ -4485,8 +4485,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with already running entity in the list */ if (cfs_rq->nr_running >= 1) @@ -4503,7 +4503,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -4580,7 +4580,6 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - int enqueue = 1; long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4604,21 +4603,41 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto unthrottle_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } - if (!se) - add_nr_running(rq, task_delta); + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); +unthrottle_throttle: /* * The cfs_rq_throttled() breaks in the above iteration can result in * incomplete leaf list maintenance, resulting in triggering the @@ -4627,7 +4646,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - list_add_leaf_cfs_rq(cfs_rq); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } assert_list_leaf_cfs_rq(rq); @@ -4766,7 +4786,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; + s64 remaining; /* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) @@ -4774,7 +4794,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) /* is a quota refresh about to occur? */ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) + if (remaining < (s64)min_expire) return 1; return 0; @@ -4912,7 +4932,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ @@ -5228,6 +5248,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5299,7 +5320,7 @@ enqueue_throttle: * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. */ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); } @@ -5936,7 +5957,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu, si_cpu = -1; @@ -5944,7 +5965,8 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu)) return cpu; @@ -5962,7 +5984,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6072,7 +6094,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, target); + i = select_idle_smt(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -7278,6 +7300,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -7634,7 +7660,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. Don't let fully @@ -10120,16 +10146,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq; + list_add_leaf_cfs_rq(cfs_rq_of(se)); + /* Start to propagate at parent */ se = se->parent; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - break; + if (!cfs_rq_throttled(cfs_rq)){ + update_load_avg(cfs_rq, se, UPDATE_TG); + list_add_leaf_cfs_rq(cfs_rq); + continue; + } - update_load_avg(cfs_rq, se, UPDATE_TG); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } } #else diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 131e7c86cf06..3f8c7867c14c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -249,6 +249,7 @@ static void do_idle(void) } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* * In poll mode we reenable interrupts and spin. Also if we diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..46c142b69598 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -30,6 +30,23 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -134,6 +151,7 @@ static int membarrier_private_expedited(int flags) int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; if (flags & MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) @@ -141,6 +159,7 @@ static int membarrier_private_expedited(int flags) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else { if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) @@ -181,7 +200,7 @@ static int membarrier_private_expedited(int flags) rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); free_cpumask_var(tmpmask); @@ -246,9 +265,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm) } rcu_read_unlock(); - preempt_disable(); - smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); - preempt_enable(); + on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true); free_cpumask_var(tmpmask); cpus_read_unlock(); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..43e2a47489fa 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -127,9 +127,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9154e745f097..9dd83eb74a9d 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1046,7 +1046,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1079,15 +1078,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct kthread_worker *kworker_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1122,9 +1125,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1146,18 +1149,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1167,24 +1158,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1208,14 +1190,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1250,7 +1242,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5b04bba4500d..28c82dee13ea 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -983,13 +988,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2221,13 +2230,20 @@ void __init init_sched_rt_class(void) static void switched_to_rt(struct rq *rq, struct task_struct *p) { /* - * If we are already running, then there's nothing - * that needs to be done. But if we are not running - * we may need to preempt the current running task. - * If that current running task is also an RT task + * If we are running, update the avg_rt tracking, as the running time + * will now on be accounted into the latter. + */ + if (task_current(rq, p)) { + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + return; + } + + /* + * If we are not running we may need to preempt the current + * running task. If that current running task is also an RT task * then see if we can move to another run queue. */ - if (task_on_rq_queued(p) && rq->curr != p) { + if (task_on_rq_queued(p)) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); @@ -2652,8 +2668,12 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } int sched_rt_handler(struct ctl_table *table, int write, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f2a9e34a78d..b8a3db59e326 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -209,6 +209,8 @@ static inline int task_has_dl_policy(struct task_struct *p) */ #define SCHED_FLAG_SUGOV 0x10000000 +#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) + static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL @@ -247,30 +249,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -/* - * To keep the bandwidth of -deadline tasks and groups under control - * we need some place where: - * - store the maximum -deadline bandwidth of the system (the group); - * - cache the fraction of that bandwidth that is currently allocated. - * - * This is all done in the data structure below. It is similar to the - * one used for RT-throttling (rt_bandwidth), with the main difference - * that, since here we are only interested in admission control, we - * do not decrease any runtime while the group "executes", neither we - * need a timer to replenish it. - * - * With respect to SMP, the bandwidth is given on a per-CPU basis, - * meaning that: - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; - * - dl_total_bw array contains, in the i-eth element, the currently - * allocated bandwidth on the i-eth CPU. - * Moreover, groups consume bandwidth on each CPU, while tasks only - * consume bandwidth on the CPU they're running on. - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw - * that will be shown the next time the proc or cgroup controls will - * be red. It on its turn can be changed by writing on its own - * control. - */ struct dl_bandwidth { raw_spinlock_t dl_runtime_lock; u64 dl_runtime; @@ -282,6 +260,24 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ struct dl_bw { raw_spinlock_t lock; u64 bw; @@ -574,8 +570,8 @@ struct cfs_rq { s64 runtime_remaining; u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -979,6 +975,7 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS @@ -1568,7 +1565,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) +#ifdef CONFIG_SCHED_DEBUG /* * To support run-time toggling of sched features, all the translation units @@ -1576,6 +1573,7 @@ enum { */ extern const_debug unsigned int sysctl_sched_features; +#ifdef CONFIG_JUMP_LABEL #define SCHED_FEAT(name, enabled) \ static __always_inline bool static_branch_##name(struct static_key *key) \ { \ @@ -1588,7 +1586,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ +#else /* !CONFIG_JUMP_LABEL */ + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +#endif /* CONFIG_JUMP_LABEL */ + +#else /* !SCHED_DEBUG */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1604,7 +1608,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ +#endif /* SCHED_DEBUG */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ffaa97a8d405..e5ebaffc4fef 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1552,66 +1552,58 @@ static void init_numa_topology_type(void) } } + +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1620,15 +1612,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1636,7 +1628,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1644,12 +1636,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1661,7 +1658,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1684,7 +1681,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1696,8 +1693,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..7d668b31dbc6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -206,6 +206,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_e } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any @@ -232,17 +239,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent } EXPORT_SYMBOL(prepare_to_wait); -void +/* Returns true if we are the first waiter in the queue, false otherwise. */ +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; + bool was_empty = false; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->entry)) + if (list_empty(&wq_entry->entry)) { + was_empty = list_empty(&wq_head->head); __add_wait_queue_entry_tail(wq_head, wq_entry); + } set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); + return was_empty; } EXPORT_SYMBOL(prepare_to_wait_exclusive); |