aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c319
1 files changed, 199 insertions, 120 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a61a3b8eaa9..8298b2c240ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11,6 +11,7 @@
#include <linux/nospec.h>
#include <linux/kcov.h>
+#include <linux/scs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -20,6 +21,7 @@
#include "../smpboot.h"
#include "pelt.h"
+#include "smp.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -219,6 +221,13 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
+static inline void
+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
+{
+ csd->flags = 0;
+ csd->func = func;
+ csd->info = rq;
+}
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -314,16 +323,14 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
+
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq->hrtick_csd.flags = 0;
- rq->hrtick_csd.func = __hrtick_start;
- rq->hrtick_csd.info = rq;
+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
#endif
-
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
@@ -632,29 +639,23 @@ void wake_up_nohz_cpu(int cpu)
wake_up_idle_cpu(cpu);
}
-static inline bool got_nohz_idle_kick(void)
+static void nohz_csd_func(void *info)
{
- int cpu = smp_processor_id();
-
- if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle_cpu(cpu) && !need_resched())
- return true;
+ struct rq *rq = info;
+ int cpu = cpu_of(rq);
+ unsigned int flags;
/*
- * We can't run Idle Load Balance on this CPU for this time so we
- * cancel it and clear NOHZ_BALANCE_KICK
+ * Release the rq::nohz_csd.
*/
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
- return false;
-}
-
-#else /* CONFIG_NO_HZ_COMMON */
+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+ WARN_ON(!(flags & NOHZ_KICK_MASK));
-static inline bool got_nohz_idle_kick(void)
-{
- return false;
+ rq->idle_balance = idle_cpu(cpu);
+ if (rq->idle_balance && !need_resched()) {
+ rq->nohz_idle_balance = flags;
+ raise_softirq_irqoff(SCHED_SOFTIRQ);
+ }
}
#endif /* CONFIG_NO_HZ_COMMON */
@@ -1110,8 +1111,7 @@ static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max;
@@ -1232,13 +1232,8 @@ static void uclamp_fork(struct task_struct *p)
return;
for_each_clamp_id(clamp_id) {
- unsigned int clamp_value = uclamp_none(clamp_id);
-
- /* By default, RT tasks always get 100% boost */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- clamp_value = uclamp_none(UCLAMP_MAX);
-
- uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ uclamp_se_set(&p->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
}
}
@@ -1544,7 +1539,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- sched_ttwu_pending();
+ flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2278,16 +2273,23 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-void sched_ttwu_pending(void)
+void sched_ttwu_pending(void *arg)
{
+ struct llist_node *llist = arg;
struct rq *rq = this_rq();
- struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
+ /*
+ * rq::ttwu_pending racy indication of out-standing wakeups.
+ * Races such that false-negatives are possible, since they
+ * are shorter lived that false-positives would be.
+ */
+ WRITE_ONCE(rq->ttwu_pending, 0);
+
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -2297,56 +2299,30 @@ void sched_ttwu_pending(void)
rq_unlock_irqrestore(rq, &rf);
}
-void scheduler_ipi(void)
+void send_call_function_single_ipi(int cpu)
{
- /*
- * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
- * TIF_NEED_RESCHED remotely (for the first time) will also send
- * this IPI.
- */
- preempt_fold_need_resched();
-
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
- return;
-
- /*
- * Not all reschedule IPI handlers call irq_enter/irq_exit, since
- * traditionally all their work was done from the interrupt return
- * path. Now that we actually do some work, we need to make sure
- * we do call them.
- *
- * Some archs already do call them, luckily irq_enter/exit nest
- * properly.
- *
- * Arguably we should visit all archs and update all handlers,
- * however a fair share of IPIs are still resched only so this would
- * somewhat pessimize the simple resched case.
- */
- irq_enter();
- sched_ttwu_pending();
+ struct rq *rq = cpu_rq(cpu);
- /*
- * Check if someone kicked us for doing the nohz idle load balance.
- */
- if (unlikely(got_nohz_idle_kick())) {
- this_rq()->idle_balance = 1;
- raise_softirq_irqoff(SCHED_SOFTIRQ);
- }
- irq_exit();
+ if (!set_nr_if_polling(rq->idle))
+ arch_send_call_function_single_ipi(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+/*
+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
+ * necessary. The wakee CPU on receipt of the IPI will queue the task
+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
+ * of the wakeup instead of the waker.
+ */
+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
- if (!set_nr_if_polling(rq->idle))
- smp_send_reschedule(cpu);
- else
- trace_sched_wake_idle_without_ipi(cpu);
- }
+ WRITE_ONCE(rq->ttwu_pending, 1);
+ __smp_call_single_queue(cpu, &p->wake_entry);
}
void wake_up_if_idle(int cpu)
@@ -2377,6 +2353,38 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
+
+static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+{
+ /*
+ * If the CPU does not share cache, then queue the task on the
+ * remote rqs wakelist to avoid accessing remote data.
+ */
+ if (!cpus_share_cache(smp_processor_id(), cpu))
+ return true;
+
+ /*
+ * If the task is descheduling and the only running task on the
+ * CPU then use the wakelist to offload the task activation to
+ * the soon-to-be-idle CPU as the current CPU is likely busy.
+ * nr_running is checked to avoid unnecessary task stacking.
+ */
+ if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+ return true;
+
+ return false;
+}
+
+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
+{
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
+ __ttwu_queue_wakelist(p, cpu, wake_flags);
+ return true;
+ }
+
+ return false;
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
@@ -2385,11 +2393,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
struct rq_flags rf;
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* Sync clocks across CPUs */
- ttwu_queue_remote(p, cpu, wake_flags);
+ if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
- }
#endif
rq_lock(rq, &rf);
@@ -2566,12 +2571,22 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
+ *
+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#ifdef CONFIG_SMP
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -2595,6 +2610,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/*
* If the owning (remote) CPU is still in the middle of schedule() with
+ * this task as prev, considering queueing p on the remote CPUs wake_list
+ * which potentially sends an IPI instead of spinning on p->on_cpu to
+ * let the waker make forward progress. This is safe because IRQs are
+ * disabled and the IPI will deliver after on_cpu is cleared.
+ */
+ if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+ goto unlock;
+
+ /*
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
@@ -2604,28 +2629,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- p->sched_contributes_to_load = !!task_contributes_to_load(p);
- p->state = TASK_WAKING;
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-
-#else /* CONFIG_SMP */
-
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2640,6 +2649,52 @@ out:
}
/**
+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
+ * @p: Process for which the function is to be invoked.
+ * @func: Function to invoke.
+ * @arg: Argument to function.
+ *
+ * If the specified task can be quickly locked into a definite state
+ * (either sleeping or on a given runqueue), arrange to keep it in that
+ * state while invoking @func(@arg). This function can use ->on_rq and
+ * task_curr() to work out what the state is, if required. Given that
+ * @func can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
+ *
+ * Returns:
+ * @false if the task slipped out from under the locks.
+ * @true if the task was locked onto a runqueue or is sleeping.
+ * However, @func can override this by returning @false.
+ */
+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg)
+{
+ bool ret = false;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ lockdep_assert_irqs_enabled();
+ raw_spin_lock_irq(&p->pi_lock);
+ if (p->on_rq) {
+ rq = __task_rq_lock(p, &rf);
+ if (task_rq(p) == rq)
+ ret = func(p, arg);
+ rq_unlock(rq, &rf);
+ } else {
+ switch (p->state) {
+ case TASK_RUNNING:
+ case TASK_WAKING:
+ break;
+ default:
+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
+ if (!p->on_rq)
+ ret = func(p, arg);
+ }
+ }
+ raw_spin_unlock_irq(&p->pi_lock);
+ return ret;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -2707,6 +2762,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
+#ifdef CONFIG_SMP
+ p->wake_entry_type = CSD_TYPE_TTWU;
+#endif
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -2723,7 +2781,7 @@ void set_numabalancing_state(bool enabled)
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -2797,8 +2855,8 @@ static void __init init_schedstats(void)
}
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
@@ -3882,6 +3940,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
+
+ if (task_scs_end_corrupted(prev))
+ panic("corrupted shadow stack detected inside scheduler\n");
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -3904,6 +3965,28 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
schedstat_inc(this_rq()->sched_count);
}
+static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
+ struct rq_flags *rf)
+{
+#ifdef CONFIG_SMP
+ const struct sched_class *class;
+ /*
+ * We must do the balancing pass before put_prev_task(), such
+ * that when we release the rq->lock the task is in the same
+ * state as before we took rq->lock.
+ *
+ * We can terminate the balance pass as soon as we know there is
+ * a runnable task of @class priority or higher.
+ */
+ for_class_range(class, prev->sched_class, &idle_sched_class) {
+ if (class->balance(rq, prev, rf))
+ break;
+ }
+#endif
+
+ put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
@@ -3937,22 +4020,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
restart:
-#ifdef CONFIG_SMP
- /*
- * We must do the balancing pass before put_next_task(), such
- * that when we release the rq->lock the task is in the same
- * state as before we took rq->lock.
- *
- * We can terminate the balance pass as soon as we know there is
- * a runnable task of @class priority or higher.
- */
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
- break;
- }
-#endif
-
- put_prev_task(rq, prev);
+ put_prev_task_balance(rq, prev, rf);
for_each_class(class) {
p = class->pick_next_task(rq);
@@ -4642,7 +4710,7 @@ int idle_cpu(int cpu)
return 0;
#ifdef CONFIG_SMP
- if (!llist_empty(&rq->wake_list))
+ if (rq->ttwu_pending)
return 0;
#endif
@@ -6045,6 +6113,7 @@ void init_idle(struct task_struct *idle, int cpu)
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
+ scs_task_reset(idle);
kasan_unpoison_task_stack(idle);
#ifdef CONFIG_SMP
@@ -6195,13 +6264,14 @@ void idle_task_exit(void)
struct mm_struct *mm = current->active_mm;
BUG_ON(cpu_online(smp_processor_id()));
+ BUG_ON(current != this_rq()->idle);
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
- current->active_mm = &init_mm;
finish_arch_post_lock_switch();
}
- mmdrop(mm);
+
+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
/*
@@ -6491,7 +6561,6 @@ int sched_cpu_dying(unsigned int cpu)
struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
- sched_ttwu_pending();
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
@@ -6594,6 +6663,8 @@ void __init sched_init(void)
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -6646,7 +6717,6 @@ void __init sched_init(void)
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
@@ -6668,7 +6738,6 @@ void __init sched_init(void)
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -6696,6 +6765,8 @@ void __init sched_init(void)
#ifdef CONFIG_NO_HZ_COMMON
rq->last_blocked_load_update_tick = jiffies;
atomic_set(&rq->nohz_flags, 0);
+
+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
@@ -7390,6 +7461,8 @@ static DEFINE_MUTEX(cfs_constraints_mutex);
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
@@ -7418,6 +7491,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
return -EINVAL;
/*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota != RUNTIME_INF && quota > max_cfs_runtime)
+ return -EINVAL;
+
+ /*
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/