aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-10-12 12:56:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-10-12 12:56:01 -0700
commitedaa5ddf3833669a25654d42c0fb653dfdd906df (patch)
tree156c29b0375581dfa7c7e578d8d61924b3eb81b2 /kernel
parentMerge tag 'x86-entry-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentsched/deadline: Unthrottle PI boosted threads while enqueuing (diff)
downloadlinux-dev-edaa5ddf3833669a25654d42c0fb653dfdd906df.tar.xz
linux-dev-edaa5ddf3833669a25654d42c0fb653dfdd906df.zip
Merge tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - reorganize & clean up the SD* flags definitions and add a bunch of sanity checks. These new checks caught quite a few bugs or at least inconsistencies, resulting in another set of patches. - rseq updates, add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ - add a new tracepoint to improve CPU capacity tracking - improve overloaded SMP system load-balancing behavior - tweak SMT balancing - energy-aware scheduling updates - NUMA balancing improvements - deadline scheduler fixes and improvements - CPU isolation fixes - misc cleanups, simplifications and smaller optimizations * tag 'sched-core-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (42 commits) sched/deadline: Unthrottle PI boosted threads while enqueuing sched/debug: Add new tracepoint to track cpu_capacity sched/fair: Tweak pick_next_entity() rseq/selftests: Test MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ rseq/selftests,x86_64: Add rseq_offset_deref_addv() rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ sched/fair: Use dst group while checking imbalance for NUMA balancer sched/fair: Reduce busy load balance interval sched/fair: Minimize concurrent LBs between domain level sched/fair: Reduce minimal imbalance threshold sched/fair: Relax constraint on task's load during load balance sched/fair: Remove the force parameter of update_tg_load_avg() sched/fair: Fix wrong cpu selecting from isolated domain sched: Remove unused inline function uclamp_bucket_base_value() sched/rt: Disable RT_RUNTIME_SHARE by default sched/deadline: Fix stale throttling on de-/boosted tasks sched/numa: Use runnable_avg to classify node sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL MAINTAINERS: Add myself as SCHED_DEADLINE reviewer sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/core.c13
-rw-r--r--kernel/sched/deadline.c34
-rw-r--r--kernel/sched/debug.c56
-rw-r--r--kernel/sched/fair.c103
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/membarrier.c136
-rw-r--r--kernel/sched/topology.c69
7 files changed, 305 insertions, 108 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d95dc3f4644..8160ab5263f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
return clamp_value / UCLAMP_BUCKET_DELTA;
}
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
+ unsigned int task_flags;
+
if (!tsk->state)
return;
+ task_flags = tsk->flags;
/*
* If a worker went to sleep, notify and ask workqueue whether
* it wants to wake up a task to maintain concurrency.
@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
- if (tsk->flags & PF_WQ_WORKER)
+ if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28cd05d..6d93f4518734 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+ p->dl.dl_throttled = 0;
+ }
} else if (!dl_prio(p->normal_prio)) {
/*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceeds its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
*/
+ p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..0655524700d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
entry->proc_handler = proc_handler;
}
+static int sd_ctl_doflags(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned long flags = *(unsigned long *)table->data;
+ size_t data_size = 0;
+ size_t len = 0;
+ char *tmp;
+ int idx;
+
+ if (write)
+ return 0;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ char *name = sd_flag_debug[idx].name;
+
+ /* Name plus whitespace */
+ data_size += strlen(name) + 1;
+ }
+
+ if (*ppos > data_size) {
+ *lenp = 0;
+ return 0;
+ }
+
+ tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ char *name = sd_flag_debug[idx].name;
+
+ len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
+ }
+
+ tmp += *ppos;
+ len -= *ppos;
+
+ if (len > *lenp)
+ len = *lenp;
+ if (len)
+ memcpy(buffer, tmp, len);
+ if (len < *lenp) {
+ ((char *)buffer)[len] = '\n';
+ len++;
+ }
+
+ *lenp = len;
+ *ppos += len;
+
+ kfree(tmp);
+
+ return 0;
+}
+
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536add..aa4c6227cd6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
@@ -1504,6 +1504,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
+ unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
@@ -1547,19 +1548,22 @@ struct task_numa_env {
};
static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
- ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
- ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
+ ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ update_load_add(&cfs_rq->load, se->load.weight);
}
@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
}
}
@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second;
}
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
se = cfs_rq->next;
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ se = cfs_rq->last;
+ }
clear_buddies(cfs_rq, se);
@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6274,7 +6279,7 @@ symmetric:
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap - util;
+ spare_cap = cpu_cap;
+ lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
- if (load/2 > env->imbalance &&
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+
+ if ((load >> env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@@ -8957,7 +8969,7 @@ next_group:
}
}
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
unsigned int imbalance_min;
@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
* tasks that remain local when the source domain is almost idle.
*/
imbalance_min = 2;
- if (src_nr_running <= imbalance_min)
+ if (nr_running <= imbalance_min)
return 0;
return imbalance;
@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+ return rq ?
+#ifdef CONFIG_SMP
+ rq->cpu_capacity
+#else
+ SCHED_CAPACITY_SCALE
+#endif
+ : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f391..68d369cba9e4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..e23e74d52db5 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,14 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+ (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_rseq(void *info)
+{
+ rseq_preempt(current);
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
return 0;
}
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
{
- int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
} else {
+ WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
cpus_read_lock();
- rcu_read_lock();
- for_each_online_cpu(cpu) {
+
+ if (cpu_id >= 0) {
struct task_struct *p;
- /*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
- */
- if (cpu == raw_smp_processor_id())
- continue;
- p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p && p->mm == mm)
- __cpumask_set_cpu(cpu, tmpmask);
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ if (cpu_id == raw_smp_processor_id())
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even through we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
}
- rcu_read_unlock();
preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ if (cpu_id >= 0)
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ else
+ smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable();
- free_cpumask_var(tmpmask);
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
}
/*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
/**
* sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O
* sys_membarrier() O O O
*/
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
- if (unlikely(flags))
- return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited(0);
+ return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
- return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
default:
return -EINVAL;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1bd7e3af904f..dd7770226086 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
return sched_debug_enabled;
}
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
cpumask_clear(groupmask);
@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
if (~cflags & pflags)
return 0;
@@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
+ int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
- int dflags = 0;
-
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;