aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c1261
1 files changed, 795 insertions, 466 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9a02820fc10b..cb2aa2b54c7a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,26 +6,94 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-#undef CREATE_TRACE_POINTS
-
-#include "sched.h"
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
-#include <linux/nospec.h>
#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
+#include <asm/irq_regs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
+#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
+#include "stats.h"
+
+#include "../workqueue_internal.h"
+#include "../../io_uring/io-wq.h"
+#include "../smpboot.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -36,6 +104,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -74,17 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1;
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-#ifdef CONFIG_PREEMPT_RT
-const_debug unsigned int sysctl_sched_nr_migrate = 8;
-#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
-#endif
-
-/*
- * period over which we measure -rt task CPU usage in us.
- * default: 1s
- */
-unsigned int sysctl_sched_rt_period = 1000000;
+const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -144,7 +203,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
return false;
/* flip prio, so high prio is leftmost */
- if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+ if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
return true;
return false;
@@ -181,15 +240,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
rq->core->core_task_seq++;
- if (!sched_core_enqueued(p))
- return;
+ if (sched_core_enqueued(p)) {
+ rb_erase(&p->core_node, &rq->core_tree);
+ RB_CLEAR_NODE(&p->core_node);
+ }
- rb_erase(&p->core_node, &rq->core_tree);
- RB_CLEAR_NODE(&p->core_node);
+ /*
+ * Migrating the last task off the cpu, with the cpu in forced idle
+ * state. Reschedule to create an accounting edge for forced idle,
+ * and re-examine whether the core is still in forced idle state.
+ */
+ if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+ rq->core->core_forceidle_count && rq->curr == rq->idle)
+ resched_curr(rq);
}
/*
@@ -280,6 +347,8 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
+ cpu_rq(cpu)->core->core_forceidle_start = 0;
+
sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -288,10 +357,7 @@ static void __sched_core_flip(bool enabled)
/*
* Toggle the offline CPUs.
*/
- cpumask_copy(&sched_core_mask, cpu_possible_mask);
- cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
-
- for_each_cpu(cpu, &sched_core_mask)
+ for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
cpu_rq(cpu)->core_enabled = enabled;
cpus_read_unlock();
@@ -364,18 +430,12 @@ void sched_core_put(void)
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
/*
- * part of the period that we allow rt tasks to run in us.
- * default: 0.95s
- */
-int sysctl_sched_rt_runtime = 950000;
-
-
-/*
* Serialization rules:
*
* Lock order:
@@ -415,8 +475,7 @@ int sysctl_sched_rt_runtime = 950000;
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
- * - sched_move_task()/
- * cpu_cgroup_fork(): p->sched_task_group
+ * - sched_move_task(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
@@ -534,10 +593,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -642,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
+ psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -807,15 +867,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -824,7 +880,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -839,30 +895,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
- val = old;
}
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -1013,13 +1067,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -1035,7 +1089,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -1243,10 +1297,10 @@ static void set_load_weight(struct task_struct *p, bool update_load)
static DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
-unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
-unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
* By default RT tasks run at the maximum performance point/capacity of the
@@ -1263,7 +1317,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
-unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1393,33 +1447,6 @@ static void uclamp_update_util_min_rt_default(struct task_struct *p)
task_rq_unlock(rq, p, &rf);
}
-static void uclamp_sync_util_min_rt_default(void)
-{
- struct task_struct *g, *p;
-
- /*
- * copy_process() sysctl_uclamp
- * uclamp_min_rt = X;
- * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
- * // link thread smp_mb__after_spinlock()
- * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
- * sched_post_fork() for_each_process_thread()
- * __uclamp_sync_rt() __uclamp_sync_rt()
- *
- * Ensures that either sched_post_fork() will observe the new
- * uclamp_min_rt or for_each_process_thread() will observe the new
- * task.
- */
- read_lock(&tasklist_lock);
- smp_mb__after_spinlock();
- read_unlock(&tasklist_lock);
-
- rcu_read_lock();
- for_each_process_thread(g, p)
- uclamp_update_util_min_rt_default(p);
- rcu_read_unlock();
-}
-
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
@@ -1699,6 +1726,11 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css)
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+#endif
+
+#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_UCLAMP_TASK
+#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
@@ -1716,7 +1748,34 @@ static void uclamp_update_root_tg(void)
static void uclamp_update_root_tg(void) { }
#endif
-int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+static void uclamp_sync_util_min_rt_default(void)
+{
+ struct task_struct *g, *p;
+
+ /*
+ * copy_process() sysctl_uclamp
+ * uclamp_min_rt = X;
+ * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
+ * // link thread smp_mb__after_spinlock()
+ * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
+ * sched_post_fork() for_each_process_thread()
+ * __uclamp_sync_rt() __uclamp_sync_rt()
+ *
+ * Ensures that either sched_post_fork() will observe the new
+ * uclamp_min_rt or for_each_process_thread() will observe the new
+ * task.
+ */
+ read_lock(&tasklist_lock);
+ smp_mb__after_spinlock();
+ read_unlock(&tasklist_lock);
+
+ rcu_read_lock();
+ for_each_process_thread(g, p)
+ uclamp_update_util_min_rt_default(p);
+ rcu_read_unlock();
+}
+
+static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
@@ -1780,6 +1839,8 @@ done:
return result;
}
+#endif
+#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
@@ -2005,7 +2066,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
- sched_core_dequeue(rq, p);
+ sched_core_dequeue(rq, p, flags);
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -2114,7 +2175,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- else if (p->sched_class > rq->curr->sched_class)
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2261,7 +2322,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
- BUG_ON(task_cpu(p) != new_cpu);
+ WARN_ON_ONCE(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
@@ -2332,7 +2393,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -2711,7 +2772,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
+ if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
/*
* MIGRATE_ENABLE gets here because 'p == current', but for
* anything else we cannot do is_migration_disabled(), punt
@@ -3187,12 +3248,12 @@ out:
/*
* wait_task_inactive - wait for a thread to unschedule.
*
- * If @match_state is nonzero, it's the @p->state value just checked and
- * not expected to change. If it changes, i.e. @p might have woken up,
- * then return zero. When we succeed in waiting for @p to be off its CPU,
- * we return a positive number (its total switch count). If a second call
- * a short while later returns the same number, the caller can be sure that
- * @p has remained unscheduled the whole time.
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
@@ -3223,12 +3284,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
+ * But we don't care, since "task_on_cpu()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
- while (task_running(rq, p)) {
- if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
+ while (task_on_cpu(rq, p)) {
+ if (!(READ_ONCE(p->__state) & match_state))
return 0;
cpu_relax();
}
@@ -3240,10 +3301,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
- running = task_running(rq, p);
+ running = task_on_cpu(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
- if (!match_state || READ_ONCE(p->__state) == match_state)
+ if (READ_ONCE(p->__state) & match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
@@ -3735,7 +3796,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3744,6 +3805,10 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpu_active(cpu))
return false;
+ /* Ensure the task will still be allowed to run on the CPU. */
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ return false;
+
/*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
@@ -3751,13 +3816,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3765,10 +3838,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -4090,7 +4160,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -4191,6 +4261,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
}
/**
+ * cpu_curr_snapshot - Return a snapshot of the currently running task
+ * @cpu: The CPU on which to snapshot the task.
+ *
+ * Returns the task_struct pointer of the task "currently" running on
+ * the specified CPU. If the same task is running on that CPU throughout,
+ * the return value will be a pointer to that task's task_struct structure.
+ * If the CPU did any context switches even vaguely concurrently with the
+ * execution of this function, the return value will be a pointer to the
+ * task_struct structure of a randomly chosen task that was running on
+ * that CPU somewhere around the time that this function was executing.
+ *
+ * If the specified CPU was offline, the return value is whatever it
+ * is, perhaps a pointer to the task_struct structure of that CPU's idle
+ * task, but there is no guarantee. Callers wishing a useful return
+ * value must take some action to ensure that the specified CPU remains
+ * online throughout.
+ *
+ * This function executes full memory barriers before and after fetching
+ * the pointer, which permits the caller to confine this function's fetch
+ * with respect to the caller's accesses to other shared variables.
+ */
+struct task_struct *cpu_curr_snapshot(int cpu)
+{
+ struct task_struct *t;
+
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ t = rcu_dereference(cpu_curr(cpu));
+ smp_mb(); /* Pairing determined by caller's synchronization design. */
+ return t;
+}
+
+/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
@@ -4268,7 +4370,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -4276,13 +4380,33 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
+static void reset_memory_tiering(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ pgdat->nbp_threshold = 0;
+ pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ pgdat->nbp_th_start = jiffies_to_msecs(jiffies);
+ }
+}
+
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4292,8 +4416,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ (state & NUMA_BALANCING_MEMORY_TIERING))
+ reset_memory_tiering();
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -4341,7 +4470,7 @@ out:
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
-int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
+static int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -4363,6 +4492,52 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
#endif /* CONFIG_PROC_SYSCTL */
#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_core_sysctls[] = {
+#ifdef CONFIG_SCHEDSTATS
+ {
+ .procname = "sched_schedstats",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_schedstats,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SCHEDSTATS */
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_min_rt_default",
+ .data = &sysctl_sched_uclamp_util_min_rt_default,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif /* CONFIG_UCLAMP_TASK */
+ {}
+};
+static int __init sched_core_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_core_sysctls);
+ return 0;
+}
+late_initcall(sched_core_sysctl_init);
+#endif /* CONFIG_SYSCTL */
+
/*
* fork()/clone()-time setup:
*/
@@ -4413,6 +4588,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4428,18 +4604,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4450,7 +4631,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}
@@ -4612,7 +4796,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
@@ -4638,10 +4823,10 @@ static inline void finish_task(struct task_struct *prev)
#ifdef CONFIG_SMP
-static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
- struct callback_head *next;
+ struct balance_callback *next;
lockdep_assert_rq_held(rq);
@@ -4657,28 +4842,58 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
static void balance_push(struct rq *rq);
-struct callback_head balance_push_callback = {
+/*
+ * balance_push_callback is a right abuse of the callback interface and plays
+ * by significantly different rules.
+ *
+ * Where the normal balance_callback's purpose is to be ran in the same context
+ * that queued it (only later, when it's safe to drop rq->lock again),
+ * balance_push_callback is specifically targeted at __schedule().
+ *
+ * This abuse is tolerated because it places all the unlikely/odd cases behind
+ * a single test, namely: rq->balance_callback == NULL.
+ */
+struct balance_callback balance_push_callback = {
.next = NULL,
- .func = (void (*)(struct callback_head *))balance_push,
+ .func = balance_push,
};
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *
+__splice_balance_callbacks(struct rq *rq, bool split)
{
- struct callback_head *head = rq->balance_callback;
+ struct balance_callback *head = rq->balance_callback;
+
+ if (likely(!head))
+ return NULL;
lockdep_assert_rq_held(rq);
- if (head)
+ /*
+ * Must not take balance_push_callback off the list when
+ * splice_balance_callbacks() and balance_callbacks() are not
+ * in the same rq->lock section.
+ *
+ * In that case it would be possible for __schedule() to interleave
+ * and observe the list empty.
+ */
+ if (split && head == &balance_push_callback)
+ head = NULL;
+ else
rq->balance_callback = NULL;
return head;
}
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return __splice_balance_callbacks(rq, true);
+}
+
static void __balance_callbacks(struct rq *rq)
{
- do_balance_callbacks(rq, splice_balance_callbacks(rq));
+ do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
@@ -4695,12 +4910,12 @@ static inline void __balance_callbacks(struct rq *rq)
{
}
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return NULL;
}
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
}
@@ -4814,7 +5029,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -4959,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -5247,6 +5463,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
+ sched_core_tick(rq);
rq_unlock(rq, &rf);
@@ -5358,7 +5575,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5379,7 +5596,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5590,7 +5807,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class <= &fair_sched_class &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5653,12 +5870,15 @@ static inline struct task_struct *pick_task(struct rq *rq)
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+static void queue_core_balance(struct rq *rq);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *next, *p, *max = NULL;
const struct cpumask *smt_mask;
bool fi_before = false;
+ bool core_clock_updated = (rq == rq->core);
unsigned long cookie;
int i, cpu, occ = 0;
struct rq *rq_i;
@@ -5701,7 +5921,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
rq->core_pick = NULL;
- return next;
+ goto out;
}
put_prev_task_balance(rq, prev, rf);
@@ -5711,10 +5931,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* reset state */
rq->core->core_cookie = 0UL;
- if (rq->core->core_forceidle) {
+ if (rq->core->core_forceidle_count) {
+ if (!core_clock_updated) {
+ update_rq_clock(rq->core);
+ core_clock_updated = true;
+ }
+ sched_core_account_forceidle(rq);
+ /* reset after accounting force idle */
+ rq->core->core_forceidle_start = 0;
+ rq->core->core_forceidle_count = 0;
+ rq->core->core_forceidle_occupation = 0;
need_sync = true;
fi_before = true;
- rq->core->core_forceidle = false;
}
/*
@@ -5743,7 +5971,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
- goto done;
+ goto out_set_next;
}
}
@@ -5756,7 +5984,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
- if (i != cpu)
+ /*
+ * Current cpu always has its clock updated on entrance to
+ * pick_next_task(). If the current cpu is not the core,
+ * the core may also have been updated above.
+ */
+ if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
p = rq_i->core_pick = pick_task(rq_i);
@@ -5786,7 +6019,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (p == rq_i->idle) {
if (rq_i->nr_running) {
- rq->core->core_forceidle = true;
+ rq->core->core_forceidle_count++;
if (!fi_before)
rq->core->core_forceidle_seq++;
}
@@ -5795,6 +6028,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
}
+ if (schedstat_enabled() && rq->core->core_forceidle_count) {
+ rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_occupation = occ;
+ }
+
rq->core->core_pick_seq = rq->core->core_task_seq;
next = rq->core_pick;
rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5831,8 +6069,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* 1 0 1
* 1 1 0
*/
- if (!(fi_before && rq->core->core_forceidle))
- task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+ if (!(fi_before && rq->core->core_forceidle_count))
+ task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
rq_i->core_pick->core_occupation = occ;
@@ -5852,8 +6090,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
resched_curr(rq_i);
}
-done:
+out_set_next:
set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
return next;
}
@@ -5882,7 +6124,7 @@ static bool try_steal_cookie(int this, int that)
if (p == src->core_pick || p == src->curr)
goto next;
- if (!cpumask_test_cpu(this, &p->cpus_mask))
+ if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
@@ -5946,9 +6188,9 @@ static void sched_core_balance(struct rq *rq)
preempt_enable();
}
-static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
-void queue_core_balance(struct rq *rq)
+static void queue_core_balance(struct rq *rq)
{
if (!sched_core_enabled(rq))
return;
@@ -6036,11 +6278,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
goto unlock;
/* copy the shared state to the new leader */
- core_rq->core_task_seq = rq->core_task_seq;
- core_rq->core_pick_seq = rq->core_pick_seq;
- core_rq->core_cookie = rq->core_cookie;
- core_rq->core_forceidle = rq->core_forceidle;
- core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle_count = rq->core_forceidle_count;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+ core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+ /*
+ * Accounting edge for forced idle is handled in pick_next_task().
+ * Don't need another one here, since the hotplug thread shouldn't
+ * have a cookie.
+ */
+ core_rq->core_forceidle_start = 0;
/* install new leader */
for_each_cpu(t, smt_mask) {
@@ -6178,10 +6428,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
/*
* We must load prev->state once (task_struct::state is volatile), such
- * that:
- *
- * - we form a control dependency vs deactivate_task() below.
- * - ptrace_{,un}freeze_traced() can change ->state underneath us.
+ * that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
@@ -6191,7 +6438,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
prev->sched_contributes_to_load =
(prev_state & TASK_UNINTERRUPTIBLE) &&
!(prev_state & TASK_NOLOAD) &&
- !(prev->flags & PF_FROZEN);
+ !(prev_state & TASK_FROZEN);
if (prev->sched_contributes_to_load)
rq->nr_uninterruptible++;
@@ -6250,7 +6497,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -6298,15 +6545,18 @@ static inline void sched_submit_work(struct task_struct *tsk)
io_wq_worker_sleeping(tsk);
}
- if (tsk_is_pi_blocked(tsk))
- return;
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_flush_plug(tsk->plug, true);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6358,7 +6608,7 @@ void __sched schedule_idle(void)
} while (need_resched());
}
-#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
+#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
@@ -6443,17 +6693,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
#endif
-
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -6508,147 +6772,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
-
-#endif /* CONFIG_PREEMPTION */
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-#include <linux/entry-common.h>
-
-/*
- * SC:cond_resched
- * SC:might_resched
- * SC:preempt_schedule
- * SC:preempt_schedule_notrace
- * SC:irqentry_exit_cond_resched
- *
- *
- * NONE:
- * cond_resched <- __cond_resched
- * might_resched <- RET0
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * VOLUNTARY:
- * cond_resched <- __cond_resched
- * might_resched <- __cond_resched
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * FULL:
- * cond_resched <- RET0
- * might_resched <- RET0
- * preempt_schedule <- preempt_schedule
- * preempt_schedule_notrace <- preempt_schedule_notrace
- * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
- */
-
-enum {
- preempt_dynamic_undefined = -1,
- preempt_dynamic_none,
- preempt_dynamic_voluntary,
- preempt_dynamic_full,
-};
-
-int preempt_dynamic_mode = preempt_dynamic_undefined;
-
-int sched_dynamic_mode(const char *str)
-{
- if (!strcmp(str, "none"))
- return preempt_dynamic_none;
-
- if (!strcmp(str, "voluntary"))
- return preempt_dynamic_voluntary;
-
- if (!strcmp(str, "full"))
- return preempt_dynamic_full;
-
- return -EINVAL;
-}
-
-void sched_dynamic_update(int mode)
-{
- /*
- * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
- * the ZERO state, which is invalid.
- */
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-
- switch (mode) {
- case preempt_dynamic_none:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: none\n");
- break;
-
- case preempt_dynamic_voluntary:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: voluntary\n");
- break;
-
- case preempt_dynamic_full:
- static_call_update(cond_resched, (void *)&__static_call_return0);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
- break;
- }
-
- preempt_dynamic_mode = mode;
-}
-
-static int __init setup_preempt_mode(char *str)
-{
- int mode = sched_dynamic_mode(str);
- if (mode < 0) {
- pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
- return 0;
- }
-
- sched_dynamic_update(mode);
- return 1;
-}
-__setup("preempt=", setup_preempt_mode);
-
-static void __init preempt_dynamic_init(void)
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
{
- if (preempt_dynamic_mode == preempt_dynamic_undefined) {
- if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
- sched_dynamic_update(preempt_dynamic_none);
- } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
- sched_dynamic_update(preempt_dynamic_voluntary);
- } else {
- /* Default static call setting, nothing to do */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
- preempt_dynamic_mode = preempt_dynamic_full;
- pr_info("Dynamic Preempt: full\n");
- }
- }
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
+#endif
-#else /* !CONFIG_PREEMPT_DYNAMIC */
-
-static inline void preempt_dynamic_init(void) { }
-
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPTION */
/*
* This is the entry point to schedule() from kernel preemption
@@ -6903,17 +7047,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -7042,12 +7198,14 @@ struct task_struct *idle_task(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p)
{
- unsigned long dl_util, util, irq;
+ unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
+ max = arch_scale_cpu_capacity(cpu);
+
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
@@ -7127,10 +7285,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
- ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -7192,6 +7349,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7199,7 +7419,7 @@ static int __sched_setscheduler(struct task_struct *p,
int oldpolicy = -1, policy = attr->sched_policy;
int retval, oldprio, newprio, queued, running;
const struct sched_class *prev_class;
- struct callback_head *head;
+ struct balance_callback *head;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -7233,58 +7453,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -8155,11 +8328,35 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
#endif
/*
@@ -8179,9 +8376,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -8199,9 +8394,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
@@ -8219,9 +8412,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
@@ -8230,6 +8421,166 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
+
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
+
+ return -EINVAL;
+}
+
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
+
+void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
+ }
+
+ sched_dynamic_update(mode);
+ return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
/**
* yield - yield the current processor to other threads.
*
@@ -8307,7 +8658,7 @@ again:
if (curr->sched_class != p->sched_class)
goto out_unlock;
- if (task_running(p_rq, p) || !task_is_running(p))
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
goto out_unlock;
yielded = curr->sched_class->yield_to_task(rq, p);
@@ -8338,9 +8689,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- if (current->plug)
- blk_flush_plug(current->plug, true);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -8521,9 +8870,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
free, task_pid_nr(p), ppid,
- (unsigned long)task_thread_info(p)->flags);
+ read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -8549,7 +8898,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p)
* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
* TASK_KILLABLE).
*/
- if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
+ if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
return false;
return true;
@@ -8602,14 +8951,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
__sched_fork(0, idle);
- /*
- * The idle task doesn't need the kthread struct to function, but it
- * is dressed up as a per-CPU kthread and thus needs to play the part
- * if we want to avoid special-casing it in code that deals with per-CPU
- * kthreads.
- */
- set_kthread_struct(idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
@@ -8675,7 +9016,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -8684,7 +9025,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
}
int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_cpus_allowed)
+ const struct cpumask *cs_effective_cpus)
{
int ret = 0;
@@ -8703,8 +9044,13 @@ int task_can_attach(struct task_struct *p,
}
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
+ cs_effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids))
+ return -EINVAL;
+ ret = dl_cpu_busy(cpu, p);
+ }
out:
return ret;
@@ -8988,8 +9334,10 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
+ int ret = dl_cpu_busy(cpu, NULL);
+
+ if (ret)
+ return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
@@ -9019,6 +9367,7 @@ int sched_cpu_activate(unsigned int cpu)
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
@@ -9097,10 +9446,12 @@ int sched_cpu_deactivate(unsigned int cpu)
if (!sched_smp_initialized)
return 0;
+ sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
+ sched_update_numa(cpu, true);
return ret;
}
sched_domains_numa_masks_clear(cpu);
@@ -9203,7 +9554,7 @@ int sched_cpu_dying(unsigned int cpu)
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -9215,7 +9566,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
@@ -9259,20 +9610,17 @@ LIST_HEAD(task_groups);
static struct kmem_cache *task_group_cache __read_mostly;
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
-
void __init sched_init(void)
{
unsigned long ptr = 0;
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
- &fair_sched_class + 1 != &rt_sched_class ||
- &rt_sched_class + 1 != &dl_sched_class);
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
@@ -9305,17 +9653,8 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_CPUMASK_OFFSTACK
- for_each_possible_cpu(i) {
- per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
- cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- }
-#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
@@ -9412,7 +9751,9 @@ void __init sched_init(void)
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
- rq->core_forceidle = false;
+ rq->core_forceidle_count = 0;
+ rq->core_forceidle_occupation = 0;
+ rq->core_forceidle_start = 0;
rq->core_cookie = 0UL;
#endif
@@ -9427,6 +9768,14 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current);
/*
+ * The idle task doesn't need the kthread struct to function, but it
+ * is dressed up as a per-CPU kthread and thus needs to play the part
+ * if we want to avoid special-casing it in code that deals with per-CPU
+ * kthreads.
+ */
+ WARN_ON(!set_kthread_struct(current));
+
+ /*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
@@ -9812,7 +10161,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk, int type)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9828,7 +10177,7 @@ static void sched_change_group(struct task_struct *tsk, int type)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
- tsk->sched_class->task_change_group(tsk, type);
+ tsk->sched_class->task_change_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -9859,7 +10208,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, TASK_MOVE_GROUP);
+ sched_change_group(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9937,53 +10286,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
-/*
- * This is called before wake_up_new_task(), therefore we really only
- * have to set its group bits, all the other stuff does not apply.
- */
-static void cpu_cgroup_fork(struct task_struct *task)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(task, &rf);
-
- update_rq_clock(rq);
- sched_change_group(task, TASK_SET_GROUP);
-
- task_rq_unlock(rq, task, &rf);
-}
-
+#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
- int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
-#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#endif
- /*
- * Serialize against wake_up_new_task() such that if it's
- * running, we're sure to observe its full state.
- */
- raw_spin_lock_irq(&task->pi_lock);
- /*
- * Avoid calling sched_move_task() before wake_up_new_task()
- * has happened. This would lead to problems with PELT, due to
- * move wanting to detach+attach while we're not attached yet.
- */
- if (READ_ONCE(task->__state) == TASK_NEW)
- ret = -EINVAL;
- raw_spin_unlock_irq(&task->pi_lock);
-
- if (ret)
- break;
}
- return ret;
+ return 0;
}
+#endif
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -10819,8 +11134,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
- .fork = cpu_cgroup_fork,
+#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
+#endif
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
@@ -10832,6 +11148,19 @@ struct cgroup_subsys cpu_cgrp_subsys = {
void dump_cpu_task(int cpu)
{
+ if (cpu == smp_processor_id() && in_hardirq()) {
+ struct pt_regs *regs;
+
+ regs = get_irq_regs();
+ if (regs) {
+ show_regs(regs);
+ return;
+ }
+ }
+
+ if (trigger_single_cpu_backtrace(cpu))
+ return;
+
pr_info("Task dump for CPU %d:\n", cpu);
sched_show_task(cpu_curr(cpu));
}