aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/core.c1215
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c51
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/deadline.c160
-rw-r--r--kernel/sched/debug.c43
-rw-r--r--kernel/sched/fair.c1114
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c44
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/pelt.c13
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c14
-rw-r--r--kernel/sched/rt.c95
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h195
-rw-r--r--kernel/sched/stats.h7
-rw-r--r--kernel/sched/stop_task.c22
-rw-r--r--kernel/sched/topology.c71
-rw-r--r--kernel/sched/wait.c8
21 files changed, 2001 insertions, 1084 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
}
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
-#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..f9a1346a5fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -23,6 +23,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -244,7 +255,7 @@ static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -303,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay)
*/
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
@@ -317,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq)
rq->hrtick_csd.info = rq;
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
@@ -761,6 +772,520 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
+/* Max allowed minimum utilization */
+unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less or equal than these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
+{
+ return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
+}
+
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se,
+ unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+static inline unsigned int
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ /*
+ * Avoid blocked utilization pushing up the frequency when we go
+ * idle (which drops the max-clamp) by retaining the last known
+ * max-clamp.
+ */
+ if (clamp_id == UCLAMP_MAX) {
+ rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
+ return clamp_value;
+ }
+
+ return uclamp_none(UCLAMP_MIN);
+}
+
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ /* Reset max-clamp retention only on idle exit */
+ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ return;
+
+ WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+}
+
+static inline
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
+{
+ struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+ int bucket_id = UCLAMP_BUCKETS - 1;
+
+ /*
+ * Since both min and max clamps are max aggregated, find the
+ * top most bucket with tasks in.
+ */
+ for ( ; bucket_id >= 0; bucket_id--) {
+ if (!bucket[bucket_id].tasks)
+ continue;
+ return bucket[bucket_id].value;
+ }
+
+ /* No tasks -- default clamp values */
+ return uclamp_idle_value(rq, clamp_id, clamp_value);
+}
+
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ struct uclamp_se uc_max;
+
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ return uc_max;
+#endif
+
+ return uc_req;
+}
+
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
+ struct uclamp_se uc_max = uclamp_default[clamp_id];
+
+ /* System default restrictions always apply */
+ if (unlikely(uc_req.value > uc_max.value))
+ return uc_max;
+
+ return uc_req;
+}
+
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_eff;
+
+ /* Task currently refcounted: use back-annotated (effective) value */
+ if (p->uclamp[clamp_id].active)
+ return p->uclamp[clamp_id].value;
+
+ uc_eff = uclamp_eff_get(p, clamp_id);
+
+ return uc_eff.value;
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ *
+ * Tasks can have a task-specific value requested from user-space, track
+ * within each bucket the maximum value for tasks refcounted in it.
+ * This "local max aggregation" allows to track the exact "requested" value
+ * for each bucket when all its RUNNABLE tasks require the same clamp.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+
+ lockdep_assert_held(&rq->lock);
+
+ /* Update task effective clamp */
+ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ bucket->tasks++;
+ uc_se->active = true;
+
+ uclamp_idle_reset(rq, clamp_id, uc_se->value);
+
+ /*
+ * Local max aggregation: rq buckets always track the max
+ * "requested" clamp value of its RUNNABLE tasks.
+ */
+ if (bucket->tasks == 1 || uc_se->value > bucket->value)
+ bucket->value = uc_se->value;
+
+ if (uc_se->value > READ_ONCE(uc_rq->value))
+ WRITE_ONCE(uc_rq->value, uc_se->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+ unsigned int bkt_clamp;
+ unsigned int rq_clamp;
+
+ lockdep_assert_held(&rq->lock);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ SCHED_WARN_ON(!bucket->tasks);
+ if (likely(bucket->tasks))
+ bucket->tasks--;
+ uc_se->active = false;
+
+ /*
+ * Keep "local max aggregation" simple and accept to (possibly)
+ * overboost some RUNNABLE tasks in the same bucket.
+ * The rq clamp bucket value is reset to its base value whenever
+ * there are no more RUNNABLE tasks refcounting it.
+ */
+ if (likely(bucket->tasks))
+ return;
+
+ rq_clamp = READ_ONCE(uc_rq->value);
+ /*
+ * Defensive programming: this should never happen. If it happens,
+ * e.g. due to future modification, warn and fixup the expected value.
+ */
+ SCHED_WARN_ON(bucket->value > rq_clamp);
+ if (bucket->value >= rq_clamp) {
+ bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
+ WRITE_ONCE(uc_rq->value, bkt_clamp);
+ }
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+ enum uclamp_id clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /* Reset clamp idle holding when there is one RUNNABLE task */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+ enum uclamp_id clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
+ */
+ if (!p->uclamp[clamp_id].active) {
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ unsigned int clamps)
+{
+ enum uclamp_id clamp_id;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ for_each_clamp_id(clamp_id) {
+ if ((0x1 << clamp_id) & clamps)
+ uclamp_update_active(p, clamp_id);
+ }
+ }
+ css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ rcu_read_lock();
+ cpu_util_update_eff(&root_task_group.css);
+ rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ bool update_root_tg = false;
+ int old_min, old_max;
+ int result;
+
+ mutex_lock(&uclamp_mutex);
+ old_min = sysctl_sched_uclamp_util_min;
+ old_max = sysctl_sched_uclamp_util_max;
+
+ result = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (result)
+ goto undo;
+ if (!write)
+ goto done;
+
+ if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ result = -EINVAL;
+ goto undo;
+ }
+
+ if (old_min != sysctl_sched_uclamp_util_min) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
+ }
+ if (old_max != sysctl_sched_uclamp_util_max) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
+ }
+
+ if (update_root_tg)
+ uclamp_update_root_tg();
+
+ /*
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
+ */
+
+ goto done;
+
+undo:
+ sysctl_sched_uclamp_util_min = old_min;
+ sysctl_sched_uclamp_util_max = old_max;
+done:
+ mutex_unlock(&uclamp_mutex);
+
+ return result;
+}
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
+ unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
+ lower_bound = attr->sched_util_min;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
+ upper_bound = attr->sched_util_max;
+
+ if (lower_bound > upper_bound)
+ return -EINVAL;
+ if (upper_bound > SCHED_CAPACITY_SCALE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ enum uclamp_id clamp_id;
+
+ /*
+ * On scheduling class change, reset to default clamps for tasks
+ * without a task-specific value.
+ */
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* Keep using defined clamps across class changes */
+ if (uc_se->user_defined)
+ continue;
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(uc_se, clamp_value, false);
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+static void uclamp_fork(struct task_struct *p)
+{
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id)
+ p->uclamp[clamp_id].active = false;
+
+ if (likely(!p->sched_reset_on_fork))
+ return;
+
+ for_each_clamp_id(clamp_id) {
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ }
+}
+
+static void __init init_uclamp(void)
+{
+ struct uclamp_se uc_max = {};
+ enum uclamp_id clamp_id;
+ int cpu;
+
+ mutex_init(&uclamp_mutex);
+
+ for_each_possible_cpu(cpu) {
+ memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ cpu_rq(cpu)->uclamp_flags = 0;
+ }
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&init_task.uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ }
+
+ /* System defaults allow max clamp values for both indexes */
+ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
+ for_each_clamp_id(clamp_id) {
+ uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+ }
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+static inline void uclamp_fork(struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -771,6 +1296,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
+ uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -784,6 +1310,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -930,7 +1457,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
@@ -1025,7 +1552,7 @@ static int migration_cpu_stop(void *data)
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1056,7 +1583,7 @@ static int migration_cpu_stop(void *data)
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- cpumask_copy(&p->cpus_allowed, new_mask);
+ cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
@@ -1086,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
@@ -1126,7 +1653,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(&p->cpus_allowed, new_mask))
+ if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1286,10 +1813,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1858,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1479,7 +2006,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1519,14 +2046,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1570,7 +2097,7 @@ out:
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +2107,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(&p->cpus_allowed);
+ cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -1991,6 +2518,30 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ preempt_disable();
+ if (p == current) {
+ /*
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
+ * == smp_processor_id()'. Together this means we can special
+ * case the whole 'p->on_rq && ttwu_remote()' case below
+ * without taking any locks.
+ *
+ * In particular:
+ * - we rely on Program-Order guarantees for all the ordering,
+ * - we're serialized against set_special_state() by virtue of
+ * it disabling IRQs (this allows not taking ->pi_lock).
+ */
+ if (!(p->state & state))
+ goto out;
+
+ success = 1;
+ cpu = task_cpu(p);
+ trace_sched_waking(p);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+ goto out;
+ }
+
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -2000,7 +2551,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
- goto out;
+ goto unlock;
trace_sched_waking(p);
@@ -2030,7 +2581,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
- goto stat;
+ goto unlock;
#ifdef CONFIG_SMP
/*
@@ -2090,10 +2641,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
-stat:
- ttwu_stat(p, cpu, wake_flags);
-out:
+unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+out:
+ if (success)
+ ttwu_stat(p, cpu, wake_flags);
+ preempt_enable();
return success;
}
@@ -2300,6 +2853,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
p->prio = current->normal_prio;
+ uclamp_fork(p);
+
/*
* Revert to default priority/policy on fork if requested.
*/
@@ -2395,7 +2950,7 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
+ * - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -2778,12 +3333,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -2792,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
arch_start_context_switch(prev);
/*
- * If mm is non-NULL, we pass through switch_mm(). If mm is
- * NULL, we will pass through mmdrop() in finish_task_switch().
- * Both of these contain the full memory barrier required by
- * membarrier after storing to rq->curr, before returning to
- * user-space.
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
*/
- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
+
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
+
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3033,7 +3599,6 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
- cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -3051,8 +3616,36 @@ void scheduler_tick(void)
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
@@ -3065,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3078,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
@@ -3098,13 +3692,18 @@ out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3113,15 +3712,20 @@ static void sched_tick_start(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
@@ -3129,7 +3733,10 @@ static void sched_tick_stop(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -3137,7 +3744,6 @@ int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
@@ -3146,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
@@ -3265,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
@@ -3304,7 +3919,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
@@ -3313,14 +3928,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-again:
+restart:
+ /*
+ * Ensure that we put DL/RT tasks before the pick loop, such that they
+ * can PULL higher prio tasks when we lower the RQ 'priority'.
+ */
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq, NULL, NULL);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
@@ -3347,7 +3967,7 @@ again:
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3356,7 +3976,7 @@ again:
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
@@ -3378,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -3469,7 +4089,7 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
@@ -3485,6 +4105,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -3598,7 +4221,7 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
@@ -3670,7 +4293,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* this is the entry point to schedule() from kernel preemption
@@ -3838,7 +4461,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3905,7 +4528,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4071,6 +4694,13 @@ static void __setscheduler_params(struct task_struct *p,
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
{
+ /*
+ * If params can't change scheduling class changes aren't allowed
+ * either.
+ */
+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ return;
+
__setscheduler_params(p, attr);
/*
@@ -4208,6 +4838,16 @@ recheck:
return retval;
}
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
+ if (pi)
+ cpuset_read_lock();
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4222,8 +4862,8 @@ recheck:
* Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
+ retval = -EINVAL;
+ goto unlock;
}
/*
@@ -4237,10 +4877,12 @@ recheck:
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
+ retval = 0;
+ goto unlock;
}
change:
@@ -4253,8 +4895,8 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
#endif
#ifdef CONFIG_SMP
@@ -4267,10 +4909,10 @@ change:
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
- if (!cpumask_subset(span, &p->cpus_allowed) ||
+ if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
}
#endif
@@ -4280,6 +4922,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
goto recheck;
}
@@ -4289,8 +4933,8 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
+ retval = -EBUSY;
+ goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
@@ -4317,7 +4961,9 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
+
__setscheduler(rq, p, attr, pi);
+ __setscheduler_uclamp(p, attr);
if (queued) {
/*
@@ -4330,7 +4976,7 @@ change:
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
@@ -4338,14 +4984,22 @@ change:
preempt_disable();
task_rq_unlock(rq, p, &rf);
- if (pi)
+ if (pi) {
+ cpuset_read_unlock();
rt_mutex_adjust_pi(p);
+ }
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4429,10 +5083,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -4493,6 +5152,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return -EFAULT;
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -4556,14 +5219,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if ((int)attr.sched_policy < 0)
return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setattr(p, &attr);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setattr(p, &attr);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -4638,37 +5308,40 @@ out_unlock:
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
+ kattr->size = min(usize, ksize);
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -4678,20 +5351,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -4704,20 +5375,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
+
+#ifdef CONFIG_UCLAMP_TASK
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
@@ -4866,7 +5541,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -4947,7 +5622,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -4964,7 +5639,7 @@ EXPORT_SYMBOL(_cond_resched);
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
@@ -5123,7 +5798,7 @@ long __sched io_schedule_timeout(long timeout)
}
EXPORT_SYMBOL(io_schedule_timeout);
-void io_schedule(void)
+void __sched io_schedule(void)
{
int token;
@@ -5443,7 +6118,7 @@ int task_can_attach(struct task_struct *p,
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5470,7 +6145,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5503,7 +6178,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -5543,21 +6218,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
{
-}
+ const struct sched_class *class;
+ struct task_struct *next;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ for_each_class(class) {
+ next = class->pick_next_task(rq, NULL, NULL);
+ if (next) {
+ next->sched_class->put_prev_task(rq, next, NULL);
+ return next;
+ }
+ }
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ /* The idle class should always have a runnable task */
+ BUG();
+}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -5600,15 +6276,10 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
if (rq->nr_running == 1)
break;
- /*
- * pick_next_task() assumes pinned rq->lock:
- */
- next = pick_next_task(rq, &fake_task, rf);
- BUG_ON(!next);
- put_prev_task(rq, next);
+ next = __pick_migrate_task(rq);
/*
- * Rules for changing task_struct::cpus_allowed are holding
+ * Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
@@ -5902,19 +6573,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- int i, j;
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
+ int i;
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
@@ -6005,10 +6676,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->cpu_load[j] = 0;
-
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -6063,6 +6730,8 @@ void __init sched_init(void)
psi_init();
+ init_uclamp();
+
scheduler_running = 1;
}
@@ -6103,7 +6772,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check();
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
+ !is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
@@ -6119,8 +6788,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
if (task_stack_end_corrupted(current))
@@ -6235,7 +6904,7 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
@@ -6260,6 +6929,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -6283,6 +6966,8 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
@@ -6386,7 +7071,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
- set_curr_task(rq, tsk);
+ set_next_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
@@ -6469,10 +7154,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
@@ -6503,6 +7184,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
+
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
+
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css, clamps);
+ }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
+
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if (req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
+
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
+
+ return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_request req;
+ struct task_group *tg;
+
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
+
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
+
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
+
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+
+ return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
+{
+ struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
+
+ rcu_read_lock();
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ rcu_read_unlock();
+
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
+
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MIN);
+ return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -6848,6 +7701,20 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* Terminate */
};
@@ -7015,6 +7882,20 @@ static struct cftype cpu_files[] = {
.write = cpu_max_write,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* terminate */
};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ec4e4a9aab5f..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..86800b4d5453 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -40,6 +40,7 @@ struct sugov_policy {
struct task_struct *thread;
bool work_in_progress;
+ bool limits_changed;
bool need_freq_update;
};
@@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->need_freq_update))
+ if (unlikely(sg_policy->limits_changed)) {
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = true;
return true;
+ }
delta_ns = time - sg_policy->last_freq_update_time;
@@ -113,6 +117,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
struct cpufreq_policy *policy = sg_policy->policy;
+ int cpu;
if (!sugov_update_next_freq(sg_policy, time, next_freq))
return;
@@ -122,7 +127,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
return;
policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
+
+ if (trace_cpu_frequency_enabled()) {
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_frequency(next_freq, cpu);
+ }
}
static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
@@ -196,14 +205,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type)
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
- if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
+ if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
+ }
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +231,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
*/
- util = util_cfs;
- util += cpu_util_rt(rq);
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -249,9 +268,9 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
- * 1 - irq
- * U' = irq + ------- * U
- * max
+ * max - irq
+ * U' = irq + --------- * U
+ * max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
@@ -276,12 +295,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
- unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
+ return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**
@@ -427,7 +446,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -447,7 +466,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- busy = sugov_cpu_is_busy(sg_cpu);
+ /* Limits may have changed, don't skip frequency update */
+ busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
@@ -821,6 +841,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -869,7 +890,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
struct cpufreq_governor schedutil_gov = {
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 9c6480e6d62d..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..2dc48720f189 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p)
dl_se->dl_non_contending = 1;
get_task_struct(p);
- hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
+ struct dl_bw *dl_b;
later_rq = find_lock_later_rq(p, rq);
if (!later_rq) {
@@ -538,7 +539,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
+
+ /*
+ * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
set_task_cpu(p, later_rq->cpu);
double_unlock_balance(later_rq, rq);
@@ -726,7 +759,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* refill the runtime and set the deadline a period in the future,
* because keeping the current (absolute) deadline of the task would
* result in breaking guarantees promised to other tasks (refer to
- * Documentation/scheduler/sched-deadline.txt for more information).
+ * Documentation/scheduler/sched-deadline.rst for more information).
*
* This function returns true if:
*
@@ -923,7 +956,7 @@ static int start_dl_timer(struct task_struct *p)
*/
if (!hrtimer_is_queued(timer)) {
get_task_struct(p);
- hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
@@ -1053,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = dl_task_timer;
}
@@ -1195,7 +1228,7 @@ static void update_curr_dl(struct rq *rq)
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
- unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1292,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = inactive_task_timer;
}
@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ deadline_queue_push_tasks(rq);
}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct dl_rq *dl_rq;
- dl_rq = &rq->dl;
+ WARN_ON_ONCE(prev || rf);
- if (need_pull_dl_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_dl_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_dl_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
-
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
+ dl_rq = &rq->dl;
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
- put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task(rq, p);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- deadline_queue_push_tasks(rq);
-
- if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_dl(rq, p);
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+
+ if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
/*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void set_curr_task_dl(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
#ifdef CONFIG_SMP
/* Only try algorithms three times */
@@ -1824,7 +1838,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -1974,7 +1988,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
@@ -2088,17 +2102,13 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(&next_task->dl, &rq->dl);
- sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(&next_task->dl, &later_rq->dl);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
- add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
@@ -2186,11 +2196,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(&p->dl, &src_rq->dl);
- sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(&p->dl, &this_rq->dl);
- add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2283,6 +2289,36 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+void dl_add_task_root_domain(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ struct dl_bw *dl_b;
+
+ rq = task_rq_lock(p, &rf);
+ if (!dl_task(p))
+ goto unlock;
+
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+ raw_spin_unlock(&dl_b->lock);
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ rd->dl_bw.total_bw = 0;
+ raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2403,6 +2439,7 @@ const struct sched_class dl_sched_class = {
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
@@ -2413,7 +2450,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
#endif
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
+ umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
-
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
- }
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL)
return NULL;
- set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[8] is terminator */
return table;
}
@@ -653,8 +639,6 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
- SEQ_printf(m, " .%-30s: %lu\n", "load",
- rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
@@ -662,11 +646,6 @@ do { \
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- P(cpu_load[3]);
- P(cpu_load[4]);
#undef P
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..d4bbf68c3161 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
}
/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
-static unsigned int capacity_margin = 1280;
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+ if (!path)
+ return;
+
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+ autogroup_path(cfs_rq->tg, path, len);
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
+ else
+ strlcpy(path, "(null)", len);
+}
+
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+ if (path)
+ strlcpy(path, "(null)", len);
+}
+
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
@@ -1067,6 +1086,21 @@ struct numa_group {
unsigned long faults[0];
};
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
@@ -1110,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
+ struct numa_group *ng;
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1121,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
period *= shared + 1;
period /= private + shared + 1;
}
+ rcu_read_unlock();
return max(smin, period);
}
@@ -1129,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long smax;
+ struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
@@ -1150,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
- int mm_users = 0;
- struct mm_struct *mm = p->mm;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1) {
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- mm->numa_scan_seq = 0;
- }
- }
- p->node_stamp = 0;
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_group = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- /* New address space, reset the preferred nid */
- if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = NUMA_NO_NODE;
- return;
- }
-
- /*
- * New thread, keep existing numa_preferred_nid which should be copied
- * already by arch_dup_task_struct but stagger when scans start.
- */
- if (mm) {
- unsigned int delay;
-
- delay = min_t(unsigned int, task_scan_max(current),
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
- delay += 2 * TICK_NSEC;
- p->node_stamp = delay;
- }
-}
-
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -1214,7 +1211,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
pid_t task_numa_group_id(struct task_struct *p)
{
- return p->numa_group ? p->numa_group->gid : 0;
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
}
/*
@@ -1239,11 +1245,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
- if (!p->numa_group)
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
return 0;
- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1381,12 +1389,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
+ struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
- if (!p->numa_group)
+ if (!ng)
return 0;
- total_faults = p->numa_group->total_faults;
+ total_faults = ng->total_faults;
if (!total_faults)
return 0;
@@ -1400,7 +1409,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
- struct numa_group *ng = p->numa_group;
+ struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
@@ -1466,9 +1475,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(struct rq *rq);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_runnable_load(struct rq *rq);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1489,7 +1496,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->load += weighted_cpuload(rq);
+ ns->load += cpu_runnable_load(rq);
ns->compute_capacity += capacity_of(cpu);
}
@@ -1583,13 +1590,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
static void task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
- long load;
- long imp = env->p->numa_group ? groupimp : taskimp;
- long moveimp = imp;
int dist = env->dist;
+ long moveimp = imp;
+ long load;
if (READ_ONCE(dst_rq->numa_migrate_on))
return;
@@ -1621,28 +1629,29 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- if (cur->numa_group == env->p->numa_group) {
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
*/
- if (cur->numa_group)
+ if (cur_ng)
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
- if (cur->numa_group && env->p->numa_group)
+ if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
@@ -1718,7 +1727,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
@@ -1740,11 +1749,12 @@ static int task_numa_migrate(struct task_struct *p)
.best_imp = 0,
.best_cpu = -1,
};
+ unsigned long taskweight, groupweight;
struct sched_domain *sd;
+ long taskimp, groupimp;
+ struct numa_group *ng;
struct rq *best_rq;
- unsigned long taskweight, groupweight;
int nid, ret, dist;
- long taskimp, groupimp;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1790,7 +1800,8 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1823,7 +1834,7 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group) {
+ if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
@@ -2118,6 +2129,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
/*
* The p->mm->numa_scan_seq field gets updated without
@@ -2135,8 +2147,9 @@ static void task_numa_placement(struct task_struct *p)
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
@@ -2177,7 +2190,7 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
- if (p->numa_group) {
+ if (ng) {
/*
* safe because we can only change our own group
*
@@ -2185,14 +2198,14 @@ static void task_numa_placement(struct task_struct *p)
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
- p->numa_group->faults[mem_idx] += diff;
- p->numa_group->faults_cpu[mem_idx] += f_diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[mem_idx];
+ ng->faults[mem_idx] += diff;
+ ng->faults_cpu[mem_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
}
}
- if (!p->numa_group) {
+ if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
@@ -2203,8 +2216,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
- if (p->numa_group) {
- numa_group_count_active_nodes(p->numa_group);
+ if (ng) {
+ numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
@@ -2238,7 +2251,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int cpu = cpupid_to_cpu(cpupid);
int i;
- if (unlikely(!p->numa_group)) {
+ if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
4*nr_node_ids*sizeof(unsigned long);
@@ -2274,7 +2287,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
goto no_join;
- my_grp = p->numa_group;
+ my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
@@ -2336,13 +2349,24 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
- struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2355,8 +2379,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -2409,7 +2439,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
@@ -2452,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
@@ -2465,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
+ work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
@@ -2594,6 +2624,50 @@ out:
}
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ /* Protect against double add, see task_tick_numa and task_numa_work */
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ init_task_work(&p->numa_work, task_numa_work);
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -2622,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, true);
- }
}
}
@@ -2686,8 +2758,6 @@ static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2773,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3402,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ trace_pelt_cfs_tp(cfs_rq);
+ trace_pelt_se_tp(se);
+
return 1;
}
@@ -3486,6 +3557,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, flags);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/**
@@ -3505,6 +3578,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/*
@@ -3615,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -3733,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return capacity * 1024 > task_util_est(p) * capacity_margin;
+ return fits_capacity(task_util_est(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4100,7 +4173,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() &&
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4295,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4318,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4336,65 +4407,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4479,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4490,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4499,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4538,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4558,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
enqueue = 0;
@@ -4566,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4581,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4598,13 +4630,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4629,7 +4663,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4657,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4671,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4734,6 +4765,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
+ /* don't push forwards an existing deferred unthrottle */
+ if (cfs_b->slack_started)
+ return;
+ cfs_b->slack_started = true;
+
hrtimer_start(&cfs_b->slack_timer,
ns_to_ktime(cfs_bandwidth_slack_period),
HRTIMER_MODE_REL);
@@ -4749,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4783,10 +4818,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ cfs_b->slack_started = false;
if (cfs_b->distribute_running) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
@@ -4800,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4809,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4950,6 +4983,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->distribute_running = 0;
+ cfs_b->slack_started = false;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -4969,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_active = 1;
overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5148,13 +5180,15 @@ static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+ }
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
@@ -5170,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5202,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
flags = ENQUEUE_WAKEUP;
}
@@ -5209,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5270,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5284,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5303,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5325,71 +5365,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON
-/*
- * per rq 'load' arrray crap; XXX kill this.
- */
-
-/*
- * The exact cpuload calculated at every tick would be:
- *
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
- *
- * If a CPU misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when CPU may be busy, then we have:
- *
- * load_n = (1 - 1/2^i)^n * load_0
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- *
- * load' = (1 - 1/2^i)^n * load
- *
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
- * This allows us to precompute the above in said factors, thereby allowing the
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
- * fixed_power_int())
- *
- * The calculation is approximated on a 128 point scale.
- */
-#define DEGRADE_SHIFT 7
-
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- { 0, 0, 0, 0, 0, 0, 0, 0 },
- { 64, 32, 8, 0, 0, 0, 0, 0 },
- { 96, 72, 40, 12, 1, 0, 0, 0 },
- { 112, 98, 75, 43, 15, 1, 0, 0 },
- { 120, 112, 98, 76, 45, 16, 2, 0 }
-};
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
static struct {
cpumask_var_t idle_cpus_mask;
@@ -5401,232 +5376,18 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/**
- * __cpu_load_update - update the rq->cpu_load[] statistics
- * @this_rq: The rq to update statistics for
- * @this_load: The current load
- * @pending_updates: The number of missed updates
- *
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
- *
- * This function computes a decaying average:
- *
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
- *
- * Because of NOHZ it might not get called on every tick which gives need for
- * the @pending_updates argument.
- *
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
- * = A * (A * load[i]_n-2 + B) + B
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
- *
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
- * any change in load would have resulted in the tick being turned back on.
- *
- * For regular NOHZ, this reduces to:
- *
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
- *
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term.
- */
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
-#ifdef CONFIG_NO_HZ_COMMON
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- if (tickless_load) {
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
- /*
- * old_load can never be a negative value because a
- * decayed tickless_load cannot be greater than the
- * original tickless_load.
- */
- old_load += tickless_load;
- }
-#endif
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-}
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(struct rq *rq)
-{
- return cfs_rq_runnable_load_avg(&rq->cfs);
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we need to avoid the delta approach from the regular tick when
- * possible since that would seriously skew the load calculation. This is why we
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
- * loop exit, nohz_idle_balance, nohz full exit...)
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-static void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load)
-{
- unsigned long pending_updates;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * In the regular NOHZ case, we were idle, this means load 0.
- * In the NOHZ_FULL case, we were non-idle, we should consider
- * its weighted load.
- */
- cpu_load_update(this_rq, load, pending_updates);
- }
-}
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-static void cpu_load_update_idle(struct rq *this_rq)
-{
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (weighted_cpuload(this_rq))
- return;
-
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
-}
-
-/*
- * Record CPU load on nohz entry so we know the tickless load to account
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
- * than other cpu_load[idx] but it should be fine as cpu_load readers
- * shouldn't rely into synchronized cpu_load[*] updates.
- */
-void cpu_load_update_nohz_start(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * This is all lockless but should be fine. If weighted_cpuload changes
- * concurrently we'll exit nohz. And cpu_load write can race with
- * cpu_load_update_idle() but both updater would be writing the same.
- */
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
-}
-
-/*
- * Account the tickless load in the end of a nohz frame.
- */
-void cpu_load_update_nohz_stop(void)
-{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- struct rq *this_rq = this_rq();
- unsigned long load;
- struct rq_flags rf;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- load = weighted_cpuload(this_rq);
- rq_lock(this_rq, &rf);
- update_rq_clock(this_rq);
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
- rq_unlock(this_rq, &rf);
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-static inline void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load) { }
-#endif /* CONFIG_NO_HZ_COMMON */
-
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- /* See the mess around cpu_load_update_nohz(). */
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
-#endif
- cpu_load_update(this_rq, load, 1);
-}
-
-/*
- * Called from scheduler_tick()
- */
-void cpu_load_update_active(struct rq *this_rq)
-{
- unsigned long load = weighted_cpuload(this_rq);
-
- if (tick_nohz_tick_stopped())
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
- else
- cpu_load_update_periodic(this_rq, load);
-}
-
-/*
- * Return a low guess at the load of a migration-source CPU weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
+/* CPU only has SCHED_IDLE tasks enqueued */
+static int sched_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return min(rq->cpu_load[type-1], total);
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
}
-/*
- * Return a high guess at the load of a migration-target CPU weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
+static unsigned long cpu_runnable_load(struct rq *rq)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return max(rq->cpu_load[type-1], total);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
static unsigned long capacity_of(int cpu)
@@ -5638,7 +5399,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(rq);
+ unsigned long load_avg = cpu_runnable_load(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5736,7 +5497,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = target_load(this_cpu, sd->wake_idx);
+ this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5754,7 +5515,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+ prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5815,14 +5576,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
- int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
-
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5588,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
- &p->cpus_allowed))
+ p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5603,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
- /* Bias balancing toward CPUs of our domain */
- if (local_group)
- load = source_load(i, load_idx);
- else
- load = target_load(i, load_idx);
-
+ load = cpu_runnable_load(cpu_rq(i));
runnable_load += load;
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5955,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1;
+ int shallowest_idle_cpu = -1, si_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5963,7 +5715,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5986,8 +5738,13 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(cpu_rq(i));
+ } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+ if (sched_idle_cpu(i)) {
+ si_cpu = i;
+ continue;
+ }
+
+ load = cpu_runnable_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5995,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ if (shallowest_idle_cpu != -1)
+ return shallowest_idle_cpu;
+ if (si_cpu != -1)
+ return si_cpu;
+ return least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -6003,7 +5764,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
{
int new_cpu = cpu;
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
@@ -6120,7 +5881,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
@@ -6148,19 +5909,21 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu;
+ int cpu, si_cpu = -1;
if (!static_branch_likely(&sched_smt_present))
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return cpu;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
- return -1;
+ return si_cpu;
}
#else /* CONFIG_SCHED_SMT */
@@ -6188,7 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, nr = INT_MAX;
+ int this = smp_processor_id();
+ int cpu, nr = INT_MAX, si_cpu = -1;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6212,18 +5976,20 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
nr = 4;
}
- time = local_clock();
+ time = cpu_clock(this);
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
- return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return si_cpu;
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
- time = local_clock() - time;
+ time = cpu_clock(this) - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
@@ -6239,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (available_idle_cpu(target))
+ if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) &&
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
@@ -6253,8 +6020,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- available_idle_cpu(recent_used_cpu) &&
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6489,41 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- long util, max_util, sum_util, energy = 0;
+ struct cpumask *pd_mask = perf_domain_span(pd);
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+ unsigned long max_util = 0, sum_util = 0;
int cpu;
- for (; pd; pd = pd->next) {
- max_util = sum_util = 0;
+ /*
+ * The capacity state of CPUs of the current rd can be driven by CPUs
+ * of another rd if they belong to the same pd. So, account for the
+ * utilization of these CPUs too by masking pd with cpu_online_mask
+ * instead of the rd span.
+ *
+ * If an entire pd is outside of the current rd, it will not appear in
+ * its pd list and will not be accounted by compute_energy().
+ */
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
+
/*
- * The capacity state of CPUs of the current rd can be driven by
- * CPUs of another rd if they belong to the same performance
- * domain. So, account for the utilization of these CPUs too
- * by masking pd with cpu_online_mask instead of the rd span.
- *
- * If an entire performance domain is outside of the current rd,
- * it will not appear in its pd list and will not be accounted
- * by compute_energy().
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
*/
- for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
- util = cpu_util_next(cpu, p, dst_cpu);
- util = schedutil_energy_util(cpu, util);
- max_util = max(util, max_util);
- sum_util += util;
- }
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ /*
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
+ */
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return energy;
+ return em_pd_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -6565,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
-
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ unsigned long cpu_cap, util, base_energy = 0;
int cpu, best_energy_cpu = prev_cpu;
- struct perf_domain *head, *pd;
- unsigned long cpu_cap, util;
struct sched_domain *sd;
+ struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto fail;
- head = pd;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6596,24 +6375,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
goto unlock;
for (; pd; pd = pd->next) {
- unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
/* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- if (cpu_cap * 1024 < util * capacity_margin)
+ if (!fits_capacity(util, cpu_cap))
continue;
/* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_energy = compute_energy(p, prev_cpu, head);
- best_energy = min(best_energy, prev_energy);
- continue;
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
}
/*
@@ -6629,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
/* Evaluate the energy impact of using this CPU. */
if (max_spare_cap_cpu >= 0) {
- cur_energy = compute_energy(p, max_spare_cap_cpu, head);
- if (cur_energy < best_energy) {
- best_energy = cur_energy;
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ cur_delta -= base_energy_pd;
+ if (cur_delta < best_delta) {
+ best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
@@ -6643,10 +6428,10 @@ unlock:
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_energy == ULONG_MAX)
+ if (prev_delta == ULONG_MAX)
return best_energy_cpu;
- if ((prev_energy - best_energy) > (prev_energy >> 4))
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
return best_energy_cpu;
return prev_cpu;
@@ -6689,7 +6474,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed);
+ cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -6980,7 +6765,7 @@ again:
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
@@ -7057,8 +6842,8 @@ again:
goto done;
simple:
#endif
-
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
do {
se = pick_next_entity(cfs_rq, NULL);
@@ -7086,11 +6871,13 @@ done: __maybe_unused;
return p;
idle:
- update_misfit_status(NULL, rq);
- new_tasks = idle_balance(rq, rf);
+ if (!rf)
+ return NULL;
+
+ new_tasks = newidle_balance(rq, rf);
/*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -7112,7 +6899,7 @@ idle:
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -7445,14 +7232,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7472,7 +7259,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7558,7 +7345,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * detach_tasks() -- tries to detach up to imbalance runnable load from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7614,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
detached++;
env->imbalance -= load;
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
@@ -7626,7 +7413,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * weighted load.
+ * runnable load.
*/
if (env->imbalance <= 0)
break;
@@ -7695,6 +7482,7 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
@@ -7722,6 +7510,19 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ rq->last_blocked_load_update_tick = jiffies;
+
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7787,11 +7588,7 @@ static void update_blocked_averages(int cpu)
if (others_have_blocked(rq))
done = false;
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (done)
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, !done);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7857,11 +7654,7 @@ static inline void update_blocked_averages(int cpu)
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
rq_unlock_irqrestore(rq, &rf);
}
@@ -7879,7 +7672,6 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
@@ -7933,38 +7725,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
- *
- * Return: The load index.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
- enum cpu_idle_type idle)
-{
- int load_idx;
-
- switch (idle) {
- case CPU_NOT_IDLE:
- load_idx = sd->busy_idx;
- break;
-
- case CPU_NEWLY_IDLE:
- load_idx = sd->newidle_idx;
- break;
- default:
- load_idx = sd->idle_idx;
- break;
- }
-
- return load_idx;
-}
-
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(sd, cpu);
+ unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -7989,7 +7753,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
@@ -8099,7 +7863,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8184,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->min_capacity * capacity_margin <
- ref->sgc->min_capacity * 1024;
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
/*
@@ -8195,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->max_capacity * capacity_margin <
- ref->sgc->max_capacity * 1024;
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
}
static inline enum
@@ -8249,9 +8011,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
- int load_idx = get_sd_load_idx(env->sd, env->idle);
- unsigned long load;
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -8262,13 +8021,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- /* Bias balancing toward CPUs of our domain: */
- if (local_group)
- load = target_load(i, load_idx);
- else
- load = source_load(i, load_idx);
-
- sgs->group_load += load;
+ sgs->group_load += cpu_runnable_load(rq);
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
@@ -8283,7 +8036,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -8302,7 +8054,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
@@ -8516,8 +8268,12 @@ next_group:
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
+ struct root_domain *rd = env->dst_rq->rd;
+
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
}
@@ -8723,7 +8479,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
- * Also calculates the amount of weighted load which should be moved
+ * Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
@@ -8768,7 +8524,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
@@ -8842,7 +8598,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, wl;
+ unsigned long capacity, load;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8896,30 +8652,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
rq->nr_running == 1)
continue;
- wl = weighted_cpuload(rq);
+ load = cpu_runnable_load(rq);
/*
- * When comparing with imbalance, use weighted_cpuload()
+ * When comparing with imbalance, use cpu_runnable_load()
* which is not scaled with the CPU capacity.
*/
- if (rq->nr_running == 1 && wl > env->imbalance &&
+ if (rq->nr_running == 1 && load > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
continue;
/*
* For the load comparisons with the other CPU's, consider
- * the weighted_cpuload() scaled with the CPU capacity, so
+ * the cpu_runnable_load() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * Thus we're looking for max(load_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * to: load_i * capacity_j > load_j * capacity_i; where j is
* our previous maximum.
*/
- if (wl * busiest_capacity > busiest_load * capacity) {
- busiest_load = wl;
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
}
@@ -9210,7 +8966,7 @@ more_balance:
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
@@ -9260,9 +9016,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -9283,10 +9040,10 @@ out_one_pinned:
ld_moved = 0;
/*
- * idle_balance() disregards balance intervals, so we could repeatedly
- * reach this code, which would lead to balance_interval skyrocketting
- * in a short amount of time. Skip the balance_interval increase logic
- * to avoid that.
+ * newidle_balance() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketting in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9879,7 +9636,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- cpu_load_update_idle(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
@@ -9997,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -10005,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
+ update_misfit_status(NULL, this_rq);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10389,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10509,18 +10276,18 @@ err:
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
+ struct rq_flags rf;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
-
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
}
@@ -10662,7 +10429,9 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
+
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
@@ -10675,7 +10444,6 @@ const struct sched_class fair_sched_class = {
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -10690,6 +10458,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
@@ -10708,18 +10480,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
int node;
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
- if (p->numa_group) {
- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
}
+ rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
@@ -10737,3 +10513,83 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
+
+/*
+ * Helper functions to facilitate extracting info from tracepoints.
+ */
+
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_SMP
+ return cfs_rq ? &cfs_rq->avg : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
+
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
+{
+ if (!cfs_rq) {
+ if (str)
+ strlcpy(str, "(null)", len);
+ else
+ return NULL;
+ }
+
+ cfs_rq_tg_path(cfs_rq, str, len);
+ return str;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
+
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_rt : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
+
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_dl : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
+
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
+ return rq ? &rq->avg_irq : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
+
+int sched_trace_rq_cpu(struct rq *rq)
+{
+ return rq ? cpu_of(rq) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
+{
+#ifdef CONFIG_SMP
+ return rd ? rd->span : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..8dad5aa600ea 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -238,16 +238,16 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- check_pgt_cache();
rmb();
+ local_irq_disable();
+
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick_protected();
+ tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
- local_irq_disable();
arch_cpu_idle_enter();
/*
@@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void play_idle(unsigned long duration_ms)
+void play_idle(unsigned long duration_us)
{
struct idle_timer it;
@@ -323,7 +323,7 @@ void play_idle(unsigned long duration_ms)
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
- WARN_ON_ONCE(!duration_ms);
+ WARN_ON_ONCE(!duration_us);
rcu_sleep_check();
preempt_disable();
@@ -333,7 +333,8 @@ void play_idle(unsigned long duration_ms)
it.done = 0;
hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+ hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC),
+ HRTIMER_MODE_REL_PINNED);
while (!READ_ONCE(it.done))
do_idle();
@@ -374,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next)
{
- put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next = rq->idle;
- return rq->idle;
+ if (prev)
+ put_prev_task(rq, prev);
+
+ set_next_task_idle(rq, next);
+
+ return next;
}
/*
@@ -397,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
raw_spin_lock_irq(&rq->lock);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -413,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
@@ -451,13 +457,13 @@ const struct sched_class idle_sched_class = {
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 123ea07a3f3b..9fcb2a695a41 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -14,11 +14,25 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
+bool housekeeping_enabled(enum hk_flags flags)
+{
+ return !!(housekeeping_flags & flags);
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
int housekeeping_any_cpu(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
+ int cpu;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (housekeeping_flags & flags) {
+ cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ }
+ }
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
#include "sched.h"
#include "pelt.h"
+#include <trace/events/sched.h>
+
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ trace_pelt_se_tp(se);
return 1;
}
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
+ trace_pelt_se_tp(se);
return 1;
}
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
+ trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
+ trace_pelt_rt_tp(rq);
return 1;
}
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
+ trace_pelt_dl_tp(rq);
return 1;
}
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* reflect the real amount of computation
*/
running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
- running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
/*
* We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
1,
1);
- if (ret)
+ if (ret) {
___update_load_avg(&rq->avg_irq, 1, 1);
+ trace_pelt_irq_tp(rq);
+ }
return ret;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
* Scale the elapsed time to reflect the real amount of
* computation
*/
- delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..517e3719027e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (!rcu_access_pointer(group->poll_kworker)) {
struct sched_param param = {
- .sched_priority = MAX_RT_PRIO - 1,
+ .sched_priority = 1,
};
struct kthread_worker *kworker;
@@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_unlock(&group->trigger_lock);
return ERR_CAST(kworker);
}
- sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
kthread_init_delayed_work(&group->poll_work,
psi_poll_work);
rcu_assign_pointer(group->poll_kworker, kworker);
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
@@ -1190,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- buf_size = min(nbytes, (sizeof(buf) - 1));
+ buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..ebaa4e619684 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
@@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
* to update the period.
*/
hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
- hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -1498,12 +1499,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ rt_queue_push_tasks(rq);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1543,56 +1554,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_rt_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
-
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_queued check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
+ WARN_ON_ONCE(prev || rf);
if (!rt_rq->rt_queued)
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
- set_next_task(rq, p);
-
- rt_queue_push_tasks(rq);
-
- /*
- * If prev task was rt, put_prev_task() has already updated the
- * utilization. We only care of the case where we start to schedule a
- * rt task
- */
- if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_rt(rq, p);
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_rt(rq);
@@ -1604,6 +1578,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
#ifdef CONFIG_SMP
@@ -1614,7 +1600,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1751,7 +1737,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2304,8 +2290,10 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
- if (p->rt.timeout > next)
- p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ if (p->rt.timeout > next) {
+ posix_cputimers_rt_watchdog(&p->posix_cputimers,
+ p->se.sum_exec_runtime);
+ }
}
}
#else
@@ -2354,11 +2342,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2380,6 +2363,7 @@ const struct sched_class rt_sched_class = {
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
@@ -2391,7 +2375,6 @@ const struct sched_class rt_sched_class = {
.switched_from = switched_from_rt,
#endif
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
@@ -2400,6 +2383,10 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
-static const u32 runnable_avg_yN_inv[] = {
+static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..b3cb895d14a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
-#ifdef CONFIG_SMP
-extern void cpu_load_update_active(struct rq *this_rq);
-#else
-static inline void cpu_load_update_active(struct rq *this_rq) { }
-#endif
-
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -341,11 +335,11 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
- short idle;
- short period_active;
+ u8 idle;
+ u8 period_active;
+ u8 distribute_running;
+ u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@@ -354,8 +348,6 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
-
- bool distribute_running;
#endif
};
@@ -399,6 +391,16 @@ struct task_group {
#endif
struct cfs_bandwidth cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* The two decimal precision [%] value requested from user-space */
+ unsigned int uclamp_pct[UCLAMP_CNT];
+ /* Clamp values requested for a task group */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /* Effective clamp values used for a task group */
+ struct uclamp_se uclamp[UCLAMP_CNT];
+#endif
+
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -489,7 +491,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -562,8 +565,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -783,9 +784,6 @@ struct root_domain {
struct perf_domain __rcu *pd;
};
-extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -797,6 +795,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+ unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+ unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ * utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ * maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+ unsigned int value;
+ struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -818,8 +858,6 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
- #define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -830,11 +868,16 @@ struct rq {
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- /* capture load from *all* tasks on this CPU: */
- struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+#ifdef CONFIG_UCLAMP_TASK
+ /* Utilization clamp values based on CPU's RUNNABLE tasks */
+ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
+ unsigned int uclamp_flags;
+#define UCLAMP_FLAG_IDLE 0x01
+#endif
+
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -1222,16 +1265,18 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-#endif
-
-#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1410,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
#else
static inline void sched_ttwu_pending(void) { }
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1649,6 +1698,10 @@ extern const u32 sched_prio_to_wmult[40];
struct sched_class {
const struct sched_class *next;
+#ifdef CONFIG_UCLAMP_TASK
+ int uclamp_enabled;
+#endif
+
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
@@ -1657,17 +1710,21 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
+ * Both @prev and @rf are optional and may be NULL, in which case the
+ * caller must already have invoked put_prev_task(rq, prev, rf).
+ *
+ * Otherwise it is the responsibility of the pick_next_task() to call
+ * put_prev_task() on the @prev task or something equivalent, IFF it
+ * returns a next task.
*
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
+ * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+ * higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1682,7 +1739,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
@@ -1712,12 +1768,14 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- prev->sched_class->put_prev_task(rq, prev);
+ WARN_ON_ONCE(rq->curr != prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- curr->sched_class->set_curr_task(rq);
+ WARN_ON_ONCE(rq->curr != next);
+ next->sched_class->set_next_task(rq, next);
}
#ifdef CONFIG_SMP
@@ -1900,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1952,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2222,6 +2280,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_UCLAMP_TASK
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+static __always_inline
+unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ if (p) {
+ min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
+ max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ /*
+ * Since CPU's {min,max}_util clamps are MAX aggregated considering
+ * RUNNABLE tasks with _different_ clamps, we can end up with an
+ * inversion. Fix it now when the clamps are applied.
+ */
+ if (unlikely(min_util >= max_util))
+ return min_util;
+
+ return clamp(util, min_util, max_util);
+}
+
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return uclamp_util_with(rq, util, NULL);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ return util;
+}
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return util;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -2237,7 +2337,6 @@ static inline unsigned long capacity_orig_of(int cpu)
}
#endif
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2352,11 @@ enum schedutil_type {
ENERGY_UTIL,
};
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
-{
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
-
- return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
-}
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
@@ -2290,11 +2385,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
- return cfs;
+ return 0;
}
-#endif
+#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index aa0de240fb41..ba683fe81a6e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(rq), delta = 0;
- if (unlikely(sched_info_on()))
+ if (sched_info_on()) {
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
+ }
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on())) {
+ if (sched_info_on()) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(sched_info_on()))
+ if (sched_info_on())
__sched_info_switch(rq, prev, next);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..7e1cee4e65b2 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+{
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
+ WARN_ON_ONCE(prev || rf);
+
if (!stop || !task_on_rq_queued(stop))
return NULL;
- put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
+ set_next_task_stop(rq, stop);
return stop;
}
@@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
@@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1344,11 +1345,6 @@ sd_init(struct sched_domain_topology_level *tl,
.imbalance_pct = 125,
.cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
@@ -1400,17 +1396,14 @@ sd_init(struct sched_domain_topology_level *tl,
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
@@ -1419,8 +1412,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
}
/*
@@ -1734,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
}
}
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ int i, j = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ return nr_cpu_ids;
+}
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -1884,10 +1895,10 @@ static struct sched_domain_topology_level
unsigned long cap;
/* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+ cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(NULL, i) != cap) {
+ if (arch_scale_cpu_capacity(i) != cap) {
asym = true;
break;
}
@@ -1902,7 +1913,7 @@ static struct sched_domain_topology_level
* to everyone.
*/
for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+ unsigned long max_capacity = arch_scale_cpu_capacity(i);
int tl_id = 0;
for_each_sd_topology(tl) {
@@ -1912,7 +1923,7 @@ static struct sched_domain_topology_level
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
- capacity = arch_scale_cpu_capacity(NULL, j);
+ capacity = arch_scale_cpu_capacity(j);
if (capacity <= max_capacity)
continue;
@@ -2159,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* ndoms_new == 0 is a special case for destroying existing domains,
* and it will not create the default domain.
*
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
int i, j, n;
int new_topology;
- mutex_lock(&sched_domains_mutex);
+ lockdep_assert_held(&sched_domains_mutex);
/* Always unregister in case we don't destroy any domains: */
unregister_sched_domain_sysctl();
@@ -2193,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j))
+ dattrs_equal(dattr_cur, i, dattr_new, j)) {
+ struct root_domain *rd;
+
+ /*
+ * This domain won't be destroyed and as such
+ * its dl_bw->total_bw needs to be cleared. It
+ * will be recomputed in function
+ * update_tasks_root_domain().
+ */
+ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+ dl_clear_root_domain(rd);
goto match1;
+ }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2251,6 +2273,15 @@ match3:
ndoms_cur = ndoms_new;
register_sched_domain_sysctl();
+}
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index fa0f9adfb752..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
- spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
-
- while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
- }
+ } while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
/**