aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c786
1 files changed, 439 insertions, 347 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..7f2cae4620c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)
return rq;
}
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -249,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay)
}
}
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- hrtick_clear(cpu_rq(cpu));
- return NOTIFY_OK;
- }
-
- return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
- hotcpu_notifier(hotplug_hrtick, 0);
-}
#else
/*
* Called to set the hrtick timer state.
@@ -288,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED);
}
-
-static inline void init_hrtick(void)
-{
-}
#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
@@ -315,10 +353,6 @@ static inline void hrtick_clear(struct rq *rq)
static inline void init_rq_hrtick(struct rq *rq)
{
}
-
-static inline void init_hrtick(void)
-{
-}
#endif /* CONFIG_SCHED_HRTICK */
/*
@@ -400,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* wakeup due to that.
*
* This cmpxchg() implies a full barrier, which pairs with the write
- * barrier implied by the wakeup in wake_up_list().
+ * barrier implied by the wakeup in wake_up_q().
*/
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
@@ -499,7 +533,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -596,17 +633,8 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * FIFO realtime policy runs the highest priority task (after DEADLINE).
- * Other runnable tasks are of a lower priority. The scheduler tick
- * isn't needed.
- */
- fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
- if (fifo_nr_running)
- return true;
-
- /*
- * Round-robin realtime tasks time slice with other tasks at the same
- * realtime priority.
+ * If there are more than one RR tasks, we need the tick to effect the
+ * actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
@@ -615,8 +643,20 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
}
- /* Normal multitasking need periodic preemption checks */
- if (rq->cfs.nr_running > 1)
+ /*
+ * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
+ * forced preemption between FIFO tasks.
+ */
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
+ return true;
+
+ /*
+ * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
+ * if there's more than one we need the tick for involuntary
+ * preemption.
+ */
+ if (rq->nr_running > 1)
return false;
return true;
@@ -1082,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{
- unsigned long flags;
- struct rq *rq;
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
/*
* Must re-check here, to close a race against __kthread_bind(),
@@ -1101,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
ret = -EINVAL;
goto out;
}
do_set_cpus_allowed(p, new_mask);
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-cpu threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ p->nr_cpus_allowed != 1);
+ }
+
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
@@ -1125,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = move_queued_task(rq, p, dest_cpu);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, rf.cookie);
}
out:
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -1314,8 +1372,8 @@ out:
*/
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
- unsigned long flags;
int running, queued;
+ struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
@@ -1350,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
@@ -1431,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ * - cpu_active must be a subset of cpu_online
+ *
+ * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ * see __set_cpus_allowed_ptr(). At this point the newly online
+ * cpu isn't yet part of the sched domains, and balancing will not
+ * see it.
+ *
+ * - on cpu-down we clear cpu_active() to mask the sched domains and
+ * avoid the load balancer to place new tasks on the to be removed
+ * cpu. Existing tasks will remain running there and will be taken
+ * off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -1449,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
if (!cpu_active(dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1461,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_online(dest_cpu))
- continue;
if (!cpu_active(dest_cpu))
continue;
goto out;
@@ -1512,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = cpumask_any(tsk_cpus_allowed(p));
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1601,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
/*
* Mark the task runnable and perform wakeup-preemption.
*/
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct pin_cookie cookie)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -1614,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
p->sched_class->task_woken(rq, p);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
}
if (rq->idle_stamp) {
@@ -1634,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
}
static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct pin_cookie cookie)
{
+ int en_flags = ENQUEUE_WAKEUP;
+
lockdep_assert_held(&rq->lock);
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
+
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
#endif
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
- ttwu_do_wakeup(rq, p, wake_flags);
+ ttwu_activate(rq, p, en_flags);
+ ttwu_do_wakeup(rq, p, wake_flags, cookie);
}
/*
@@ -1655,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
*/
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
+ struct rq_flags rf;
struct rq *rq;
int ret = 0;
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags);
+ ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
ret = 1;
}
- __task_rq_unlock(rq);
+ __task_rq_unlock(rq, &rf);
return ret;
}
@@ -1675,6 +1757,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct pin_cookie cookie;
struct task_struct *p;
unsigned long flags;
@@ -1682,15 +1765,21 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- lockdep_pin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
while (llist) {
+ int wake_flags = 0;
+
p = llist_entry(llist, struct task_struct, wake_entry);
llist = llist_next(llist);
- ttwu_do_activate(rq, p, 0);
+
+ if (p->sched_remote_wakeup)
+ wake_flags = WF_MIGRATED;
+
+ ttwu_do_activate(rq, p, wake_flags, cookie);
}
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1732,10 +1821,12 @@ void scheduler_ipi(void)
irq_exit();
}
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
if (!set_nr_if_polling(rq->idle))
smp_send_reschedule(cpu);
@@ -1774,22 +1865,23 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
}
#endif /* CONFIG_SMP */
-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ struct pin_cookie cookie;
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
- ttwu_queue_remote(p, cpu);
+ ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, 0);
- lockdep_unpin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, wake_flags, cookie);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
}
@@ -1958,9 +2050,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
@@ -1968,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
}
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu);
+ ttwu_queue(p, cpu, wake_flags);
stat:
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
@@ -1986,7 +2075,7 @@ out:
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task.
*/
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
{
struct rq *rq = task_rq(p);
@@ -2003,11 +2092,11 @@ static void try_to_wake_up_local(struct task_struct *p)
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
}
if (!(p->state & TASK_NORMAL))
@@ -2018,7 +2107,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_do_wakeup(rq, p, 0);
+ ttwu_do_wakeup(rq, p, 0, cookie);
if (schedstat_enabled())
ttwu_stat(p, smp_processor_id(), 0);
out:
@@ -2378,7 +2467,8 @@ static int dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
- if (new_bw == p->dl.dl_bw)
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
/*
@@ -2417,12 +2507,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
*/
void wake_up_new_task(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2431,8 +2521,10 @@ void wake_up_new_task(struct task_struct *p)
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Post initialize new task's util average when its cfs_rq is set */
+ post_init_entity_util_avg(&p->se);
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2443,12 +2535,12 @@ void wake_up_new_task(struct task_struct *p)
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
p->sched_class->task_woken(rq, p);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, rf.cookie);
}
#endif
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2710,7 +2802,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next, struct pin_cookie cookie)
{
struct mm_struct *mm, *oldmm;
@@ -2730,7 +2822,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
@@ -2742,7 +2834,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2864,7 +2956,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
*/
unsigned long long task_sched_runtime(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
u64 ns;
@@ -2884,7 +2976,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return p->se.sum_exec_runtime;
#endif
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
@@ -2895,7 +2987,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return ns;
}
@@ -2915,7 +3007,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
- update_cpu_load_active(rq);
+ cpu_load_update_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
@@ -2958,6 +3050,20 @@ u64 scheduler_tick_max_deferment(void)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
void preempt_count_add(int val)
{
@@ -2976,17 +3082,21 @@ void preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val) {
- unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = ip;
-#endif
- trace_preempt_off(CALLER_ADDR0, ip);
- }
+ preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -3003,13 +3113,15 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+ preempt_latency_stop(val);
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
#endif
/*
@@ -3062,7 +3174,7 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
@@ -3073,20 +3185,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
*/
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev);
+ p = fair_sched_class.pick_next_task(rq, prev, cookie);
if (unlikely(p == RETRY_TASK))
goto again;
/* assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev);
+ p = idle_sched_class.pick_next_task(rq, prev, cookie);
return p;
}
again:
for_each_class(class) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
@@ -3140,6 +3252,7 @@ static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
+ struct pin_cookie cookie;
struct rq *rq;
int cpu;
@@ -3173,7 +3286,7 @@ static void __sched notrace __schedule(bool preempt)
*/
smp_mb__before_spinlock();
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3195,7 +3308,7 @@ static void __sched notrace __schedule(bool preempt)
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
- try_to_wake_up_local(to_wakeup);
+ try_to_wake_up_local(to_wakeup, cookie);
}
}
switch_count = &prev->nvcsw;
@@ -3204,7 +3317,7 @@ static void __sched notrace __schedule(bool preempt)
if (task_on_rq_queued(prev))
update_rq_clock(rq);
- next = pick_next_task(rq, prev);
+ next = pick_next_task(rq, prev, cookie);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -3215,9 +3328,9 @@ static void __sched notrace __schedule(bool preempt)
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next); /* unlocks the rq */
+ rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
} else {
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3284,8 +3397,23 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
preempt_disable_notrace();
+ preempt_latency_start(1);
__schedule(true);
+ preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
/*
@@ -3337,7 +3465,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
return;
do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
preempt_disable_notrace();
+ preempt_latency_start(1);
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -3347,6 +3489,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
__schedule(true);
exception_exit(prev_ctx);
+ preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
@@ -3403,12 +3546,13 @@ EXPORT_SYMBOL(default_wake_function);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- struct rq *rq;
const struct sched_class *prev_class;
+ struct rq_flags rf;
+ struct rq *rq;
BUG_ON(prio > MAX_PRIO);
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
/*
* Idle task boosting is a nono in general. There is one
@@ -3484,7 +3628,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
preempt_disable(); /* avoid rq from going away on us */
- __task_rq_unlock(rq);
+ __task_rq_unlock(rq, &rf);
balance_callback(rq);
preempt_enable();
@@ -3494,7 +3638,7 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, queued;
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3503,7 +3647,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3534,7 +3678,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
out_unlock:
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
@@ -3831,11 +3975,11 @@ static int __sched_setscheduler(struct task_struct *p,
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
- unsigned long flags;
const struct sched_class *prev_class;
- struct rq *rq;
+ struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ struct rq *rq;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -3930,13 +4074,13 @@ recheck:
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EINVAL;
}
@@ -3953,7 +4097,7 @@ recheck:
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return 0;
}
change:
@@ -3967,7 +4111,7 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EPERM;
}
#endif
@@ -3982,7 +4126,7 @@ change:
*/
if (!cpumask_subset(span, &p->cpus_allowed) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EPERM;
}
}
@@ -3992,7 +4136,7 @@ change:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
goto recheck;
}
@@ -4002,7 +4146,7 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
@@ -4047,7 +4191,7 @@ change:
check_class_changed(rq, p, prev_class, oldprio);
preempt_disable(); /* avoid rq from going away on us */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4900,10 +5044,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
{
struct task_struct *p;
unsigned int time_slice;
- unsigned long flags;
+ struct rq_flags rf;
+ struct timespec t;
struct rq *rq;
int retval;
- struct timespec t;
if (pid < 0)
return -EINVAL;
@@ -4918,11 +5062,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
time_slice = 0;
if (p->sched_class->get_rr_interval)
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -4998,7 +5142,8 @@ void show_state_filter(unsigned long state_filter)
touch_all_softlockup_watchdogs();
#ifdef CONFIG_SCHED_DEBUG
- sysrq_sched_debug_show();
+ if (!state_filter)
+ sysrq_sched_debug_show();
#endif
rcu_read_unlock();
/*
@@ -5160,6 +5305,8 @@ out:
#ifdef CONFIG_SMP
+static bool sched_smp_initialized __read_mostly;
+
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
int migrate_task_to(struct task_struct *p, int target_cpu)
@@ -5185,11 +5332,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- struct rq *rq;
- unsigned long flags;
bool queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -5204,7 +5351,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -5220,7 +5367,7 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ switch_mm_irqs_off(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
mmdrop(mm);
@@ -5268,6 +5415,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
+ struct pin_cookie cookie;
int dest_cpu;
/*
@@ -5299,8 +5447,8 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task assumes pinned rq->lock.
*/
- lockdep_pin_lock(&rq->lock);
- next = pick_next_task(rq, &fake_task);
+ cookie = lockdep_pin_lock(&rq->lock);
+ next = pick_next_task(rq, &fake_task, cookie);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5313,7 +5461,7 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
@@ -5374,127 +5522,13 @@ static void set_rq_offline(struct rq *rq)
}
}
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static void set_cpu_rq_start_time(unsigned int cpu)
{
- int cpu = (long)hcpu;
- unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
- break;
-
- case CPU_ONLINE:
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
- set_rq_online(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- sched_ttwu_pending();
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- migrate_tasks(rq);
- BUG_ON(rq->nr_running != 1); /* the migration thread */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
- case CPU_DEAD:
- calc_load_migrate(rq);
- break;
-#endif
- }
-
- update_max_interval();
-
- return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else. This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
- .notifier_call = migration_call,
- .priority = CPU_PRI_MIGRATION,
-};
-
-static void set_cpu_rq_start_time(void)
-{
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
rq->age_stamp = sched_clock_cpu(cpu);
}
-static int sched_cpu_active(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
- set_cpu_rq_start_time();
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- set_cpu_active(cpu, true);
- return NOTIFY_OK;
-
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int __init migration_init(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- /* Initialize migration for the boot CPU */
- err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
- migration_call(&migration_notifier, CPU_ONLINE, cpu);
- register_cpu_notifier(&migration_notifier);
-
- /* Register cpu active notifiers */
- cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
- cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
- return 0;
-}
-early_initcall(migration_init);
-
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
@@ -6642,10 +6676,10 @@ static void sched_init_numa(void)
init_numa_topology_type();
}
-static void sched_domains_numa_masks_set(int cpu)
+static void sched_domains_numa_masks_set(unsigned int cpu)
{
- int i, j;
int node = cpu_to_node(cpu);
+ int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
@@ -6655,51 +6689,20 @@ static void sched_domains_numa_masks_set(int cpu)
}
}
-static void sched_domains_numa_masks_clear(int cpu)
+static void sched_domains_numa_masks_clear(unsigned int cpu)
{
int i, j;
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++)
cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- sched_domains_numa_masks_set(cpu);
- break;
-
- case CPU_DEAD:
- sched_domains_numa_masks_clear(cpu);
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- return NOTIFY_OK;
-}
#else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- return 0;
-}
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7089,13 +7092,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
* If we come here as part of a suspend/resume, don't touch cpusets because we
* want to restore it back to its original state upon resume anyway.
*/
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static void cpuset_cpu_active(void)
{
- switch (action) {
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED_FROZEN:
-
+ if (cpuhp_tasks_frozen) {
/*
* num_cpus_frozen tracks how many CPUs are involved in suspend
* resume sequence. As long as this is not the last online
@@ -7105,35 +7104,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
num_cpus_frozen--;
if (likely(num_cpus_frozen)) {
partition_sched_domains(1, NULL, NULL);
- break;
+ return;
}
-
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
-
- case CPU_ONLINE:
- cpuset_update_active_cpus(true);
- break;
- default:
- return NOTIFY_DONE;
}
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
}
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int cpuset_cpu_inactive(unsigned int cpu)
{
unsigned long flags;
- long cpu = (long)hcpu;
struct dl_bw *dl_b;
bool overflow;
int cpus;
- switch (action) {
- case CPU_DOWN_PREPARE:
+ if (!cpuhp_tasks_frozen) {
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -7145,19 +7134,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
rcu_read_unlock_sched();
if (overflow)
- return notifier_from_errno(-EBUSY);
+ return -EBUSY;
cpuset_update_active_cpus(false);
- break;
- case CPU_DOWN_PREPARE_FROZEN:
+ } else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
- break;
- default:
- return NOTIFY_DONE;
}
- return NOTIFY_OK;
+ return 0;
}
+int sched_cpu_activate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ set_cpu_active(cpu, true);
+
+ if (sched_smp_initialized) {
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
+ }
+
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all cpus have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ update_max_interval();
+
+ return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+ int ret;
+
+ set_cpu_active(cpu, false);
+ /*
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+ * users of this state to go away such that all new such users will
+ * observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
+ if (!sched_smp_initialized)
+ return 0;
+
+ ret = cpuset_cpu_inactive(cpu);
+ if (ret) {
+ set_cpu_active(cpu, true);
+ return ret;
+ }
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
+}
+
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
+ update_max_interval();
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+ set_cpu_rq_start_time(cpu);
+ sched_rq_cpu_starting(cpu);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /* Handle pending wakeups and then migrate everything off */
+ sched_ttwu_pending();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ migrate_tasks(rq);
+ BUG_ON(rq->nr_running != 1);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ calc_load_migrate(rq);
+ update_max_interval();
+ nohz_balance_exit_idle(cpu);
+ hrtick_clear(rq);
+ return 0;
+}
+#endif
+
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7179,12 +7269,6 @@ void __init sched_init_smp(void)
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
- hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
- hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
- init_hrtick();
-
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
BUG();
@@ -7193,7 +7277,16 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_smp_initialized = true;
+}
+
+static int __init migration_init(void)
+{
+ sched_rq_cpu_starting(smp_processor_id());
+ return 0;
}
+early_initcall(migration_init);
+
#else
void __init sched_init_smp(void)
{
@@ -7328,8 +7421,6 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
- rq->last_load_update_tick = jiffies;
-
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7348,12 +7439,13 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
#endif
-#endif
+#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
}
@@ -7391,7 +7483,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time();
+ set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
@@ -7636,10 +7728,10 @@ void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
int queued, running;
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &flags);
+ rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
@@ -7671,7 +7763,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
- task_rq_unlock(rq, tsk, &flags);
+ task_rq_unlock(rq, tsk, &rf);
}
#endif /* CONFIG_CGROUP_SCHED */
@@ -7891,7 +7983,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i, ret = 0;
+ int i;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
@@ -7903,7 +7995,7 @@ static int sched_rt_global_constraints(void)
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return ret;
+ return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */