aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c6
-rw-r--r--kernel/sched/clock.c13
-rw-r--r--kernel/sched/completion.c19
-rw-r--r--kernel/sched/core.c123
-rw-r--r--kernel/sched/deadline.c33
-rw-r--r--kernel/sched/idle.c58
-rw-r--r--kernel/sched/sched.h76
-rw-r--r--kernel/sched/stats.c11
9 files changed, 211 insertions, 130 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index ab32b7b0db5c..46be87024875 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,5 +1,5 @@
ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = -pg
+CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 8a2e230fb86a..eae160dd669d 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
* so we don't have to move tasks around upon policy change,
* or flail around trying to allocate bandwidth on the fly.
* A bandwidth exception in __sched_setscheduler() allows
- * the policy change to proceed. Thereafter, task_group()
- * returns &root_task_group, so zero bandwidth is required.
+ * the policy change to proceed.
*/
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
@@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
if (tg != &root_task_group)
return false;
- if (p->sched_class != &fair_sched_class)
- return false;
-
/*
* We can only assume the task group can't go away on us if
* autogroup_move_group() can see us on ->thread_group list.
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c27e4f8f4879..c0a205101c23 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -420,3 +420,16 @@ u64 local_clock(void)
EXPORT_SYMBOL_GPL(cpu_clock);
EXPORT_SYMBOL_GPL(local_clock);
+
+/*
+ * Running clock - returns the time that has elapsed while a guest has been
+ * running.
+ * On a guest this value should be local_clock minus the time the guest was
+ * suspended by the hypervisor (for any reason).
+ * On bare metal this function should return the same as local_clock.
+ * Architectures and sub-architectures can override this.
+ */
+u64 __weak running_clock(void)
+{
+ return local_clock();
+}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 7052d3fd4e7b..8d0f35debf35 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
* first without taking the lock so we can
* return early in the blocking case.
*/
- if (!ACCESS_ONCE(x->done))
+ if (!READ_ONCE(x->done))
return 0;
spin_lock_irqsave(&x->wait.lock, flags);
@@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
*/
bool completion_done(struct completion *x)
{
- return !!ACCESS_ONCE(x->done);
+ if (!READ_ONCE(x->done))
+ return false;
+
+ /*
+ * If ->done, we need to wait for complete() to release ->wait.lock
+ * otherwise we can end up freeing the completion before complete()
+ * is done referencing it.
+ *
+ * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
+ * the loads of ->done and ->wait.lock such that we cannot observe
+ * the lock before complete() acquires it while observing the ->done
+ * after it's acquired the lock.
+ */
+ smp_rmb();
+ spin_unlock_wait(&x->wait.lock);
+ return true;
}
EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f37fe7f77a4..f0f831e8a345 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -307,66 +307,6 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;
/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
- return rq;
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static void __task_rq_unlock(struct rq *rq)
- __releases(rq->lock)
-{
- raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
- __releases(rq->lock)
- __releases(p->pi_lock)
-{
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
-
-/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static struct rq *this_rq_lock(void)
@@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
{
do {
__preempt_count_add(PREEMPT_ACTIVE);
@@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
*/
-void __sched io_schedule(void)
-{
- struct rq *rq = raw_rq();
-
- delayacct_blkio_start();
- atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
- schedule();
- current->in_iowait = 0;
- atomic_dec(&rq->nr_iowait);
- delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = raw_rq();
+ int old_iowait = current->in_iowait;
+ struct rq *rq;
long ret;
+ current->in_iowait = 1;
+ if (old_iowait)
+ blk_schedule_flush_plug(current);
+ else
+ blk_flush_plug(current);
+
delayacct_blkio_start();
+ rq = raw_rq();
atomic_inc(&rq->nr_iowait);
- blk_flush_plug(current);
- current->in_iowait = 1;
ret = schedule_timeout(timeout);
- current->in_iowait = 0;
+ current->in_iowait = old_iowait;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
+
return ret;
}
+EXPORT_SYMBOL(io_schedule_timeout);
/**
* sys_sched_get_priority_max - return maximum RT priority.
@@ -5462,9 +5395,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
- char str[256];
- cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5477,7 +5408,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
- printk(KERN_CONT "span %s level %s\n", str, sd->name);
+ printk(KERN_CONT "span %*pbl level %s\n",
+ cpumask_pr_args(sched_domain_span(sd)), sd->name);
if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5522,9 +5454,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
- printk(KERN_CONT " %s", str);
+ printk(KERN_CONT " %*pbl",
+ cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
printk(KERN_CONT " (cpu_capacity = %d)",
group->sgc->capacity);
@@ -7644,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
{
struct task_struct *g, *p;
+ /*
+ * Autogroups do not have RT tasks; see autogroup_create().
+ */
+ if (task_group_is_autogroup(tg))
+ return 0;
+
for_each_process_thread(g, p) {
if (rt_task(p) && task_group(p) == tg)
return 1;
@@ -7736,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
{
int i, err = 0;
+ /*
+ * Disallowing the root group RT runtime is BAD, it would disallow the
+ * kernel creating (and or operating) RT threads.
+ */
+ if (tg == &root_task_group && rt_runtime == 0)
+ return -EINVAL;
+
+ /* No period doesn't make any sense. */
+ if (rt_period == 0)
+ return -EINVAL;
+
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7792,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (rt_period == 0)
- return -EINVAL;
-
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a027799ae130..3fa8fa6d9403 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
+ unsigned long flags;
struct rq *rq;
-again:
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (rq != task_rq(p)) {
- /* Task was moved, retrying. */
- raw_spin_unlock(&rq->lock);
- goto again;
- }
+ rq = task_rq_lock(current, &flags);
/*
* We need to take care of several possible races here:
@@ -541,6 +535,26 @@ again:
sched_clock_tick();
update_rq_clock(rq);
+
+ /*
+ * If the throttle happened during sched-out; like:
+ *
+ * schedule()
+ * deactivate_task()
+ * dequeue_task_dl()
+ * update_curr_dl()
+ * start_dl_timer()
+ * __dequeue_task_dl()
+ * prev->on_rq = 0;
+ *
+ * We can be both throttled and !queued. Replenish the counter
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+ replenish_dl_entity(dl_se, dl_se);
+ goto unlock;
+ }
+
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
@@ -555,7 +569,7 @@ again:
push_dl_task(rq);
#endif
unlock:
- raw_spin_unlock(&rq->lock);
+ task_rq_unlock(rq, current, &flags);
return HRTIMER_NORESTART;
}
@@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
+ update_rq_clock(rq);
update_curr_dl(rq);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index aaf1c1d5cf5d..80014a178342 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -7,6 +7,7 @@
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
+#include <linux/suspend.h>
#include <asm/tlb.h>
@@ -81,6 +82,7 @@ static void cpuidle_idle_call(void)
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
unsigned int broadcast;
+ bool reflect;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -104,25 +106,37 @@ static void cpuidle_idle_call(void)
*/
rcu_idle_enter();
+ if (cpuidle_not_available(drv, dev))
+ goto use_default;
+
/*
- * Ask the cpuidle framework to choose a convenient idle state.
- * Fall back to the default arch idle method on errors.
+ * Suspend-to-idle ("freeze") is a system state in which all user space
+ * has been frozen, all I/O devices have been suspended and the only
+ * activity happens here and in iterrupts (if any). In that case bypass
+ * the cpuidle governor and go stratight for the deepest idle state
+ * available. Possibly also suspend the local tick and the entire
+ * timekeeping to prevent timer interrupts from kicking us out of idle
+ * until a proper wakeup interrupt happens.
*/
- next_state = cpuidle_select(drv, dev);
- if (next_state < 0) {
-use_default:
- /*
- * We can't use the cpuidle framework, let's use the default
- * idle routine.
- */
- if (current_clr_polling_and_test())
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state >= 0) {
local_irq_enable();
- else
- arch_cpu_idle();
+ goto exit_idle;
+ }
- goto exit_idle;
+ reflect = false;
+ next_state = cpuidle_find_deepest_state(drv, dev);
+ } else {
+ reflect = true;
+ /*
+ * Ask the cpuidle framework to choose a convenient idle state.
+ */
+ next_state = cpuidle_select(drv, dev);
}
-
+ /* Fall back to the default arch idle method on errors. */
+ if (next_state < 0)
+ goto use_default;
/*
* The idle task must be scheduled, it is pointless to
@@ -167,7 +181,8 @@ use_default:
/*
* Give the governor an opportunity to reflect on the outcome
*/
- cpuidle_reflect(dev, entered_state);
+ if (reflect)
+ cpuidle_reflect(dev, entered_state);
exit_idle:
__current_set_polling();
@@ -180,6 +195,19 @@ exit_idle:
rcu_idle_exit();
start_critical_timings();
+ return;
+
+use_default:
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine.
+ */
+ if (current_clr_polling_and_test())
+ local_irq_enable();
+ else
+ arch_cpu_idle();
+
+ goto exit_idle;
}
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0870db23d79c..dc0f435a2779 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ return rq;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
+}
+
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
+{
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index a476bea17fbc..87e2c9f0c33e 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -15,11 +15,6 @@
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
- int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
- char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
- if (mask_str == NULL)
- return -ENOMEM;
if (v == (void *)1) {
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
@@ -50,9 +45,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ seq_printf(seq, "domain%d %*pb", dcount++,
+ cpumask_pr_args(sched_domain_span(sd)));
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %u %u %u %u %u %u %u %u",
@@ -76,7 +70,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
rcu_read_unlock();
#endif
}
- kfree(mask_str);
return 0;
}