aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/irq/Kconfig3
-rw-r--r--kernel/irq/handle.c111
-rw-r--r--kernel/lockdep.c18
-rw-r--r--kernel/params.c65
-rw-r--r--kernel/perf_event.c46
-rw-r--r--kernel/printk.c100
-rw-r--r--kernel/sched.c26
-rw-r--r--kernel/sched_autogroup.c32
-rw-r--r--kernel/sched_autogroup.h4
-rw-r--r--kernel/sched_debug.c42
-rw-r--r--kernel/sched_fair.c126
-rw-r--r--kernel/smp.c62
-rw-r--r--kernel/sys.c3
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/trace/trace_irqsoff.c8
-rw-r--r--kernel/workqueue.c20
17 files changed, 388 insertions, 288 deletions
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
config GENERIC_HARDIRQS
def_bool y
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
# Select this to disable the deprecated stuff
config GENERIC_HARDIRQS_NO_DEPRECATED
def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
return retval;
}
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-
-#ifdef CONFIG_ENABLE_WARN_DEPRECATED
-# warning __do_IRQ is deprecated. Please convert to proper flow handlers
-#endif
-
-/**
- * __do_IRQ - original all in one highlevel IRQ handler
- * @irq: the interrupt number
- *
- * __do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * This is the original x86 implementation which is used for every
- * interrupt type.
- */
-unsigned int __do_IRQ(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- unsigned int status;
-
- kstat_incr_irqs_this_cpu(irq, desc);
-
- if (CHECK_IRQ_PER_CPU(desc->status)) {
- irqreturn_t action_ret;
-
- /*
- * No locking required for CPU-local interrupts:
- */
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- if (likely(!(desc->status & IRQ_DISABLED))) {
- action_ret = handle_IRQ_event(irq, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
- }
- desc->irq_data.chip->end(irq);
- return 1;
- }
-
- raw_spin_lock(&desc->lock);
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- /*
- * REPLAY is when Linux resends an IRQ that was dropped earlier
- * WAITING is used by probe to mark irqs that are being tested
- */
- status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
- status |= IRQ_PENDING; /* we _want_ to handle it */
-
- /*
- * If the IRQ is disabled for whatever reason, we cannot
- * use the action we have.
- */
- action = NULL;
- if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
- action = desc->action;
- status &= ~IRQ_PENDING; /* we commit to handling */
- status |= IRQ_INPROGRESS; /* we are handling it */
- }
- desc->status = status;
-
- /*
- * If there is no IRQ handler or it was disabled, exit early.
- * Since we set PENDING, if another processor is handling
- * a different instance of this same irq, the other processor
- * will take care of it.
- */
- if (unlikely(!action))
- goto out;
-
- /*
- * Edge triggered interrupts need to remember
- * pending events.
- * This applies to any hw interrupts that allow a second
- * instance of the same irq to arrive while we are in do_IRQ
- * or in the handler. But the code here only handles the _second_
- * instance of the irq, not the third or fourth. So it is mostly
- * useful for irq hardware that does not mask cleanly in an
- * SMP environment.
- */
- for (;;) {
- irqreturn_t action_ret;
-
- raw_spin_unlock(&desc->lock);
-
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- if (likely(!(desc->status & IRQ_PENDING)))
- break;
- desc->status &= ~IRQ_PENDING;
- }
- desc->status &= ~IRQ_INPROGRESS;
-
-out:
- /*
- * The ->end() handler has to deal with interrupts which got
- * disabled while the handler was running.
- */
- desc->irq_data.chip->end(irq);
- raw_spin_unlock(&desc->lock);
-
- return 1;
-}
-#endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
}
/*
- * Debugging helper: via this flag we know that we are in
- * 'early bootup code', and will warn about any invalid irqs-on event:
- */
-static int early_boot_irqs_enabled;
-
-void early_boot_irqs_off(void)
-{
- early_boot_irqs_enabled = 0;
-}
-
-void early_boot_irqs_on(void)
-{
- early_boot_irqs_enabled = 1;
-}
-
-/*
* Hardirqs will be enabled:
*/
void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
- if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
return;
if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static void __init kernel_add_sysfs_param(const char *name,
- struct kernel_param *kparam,
- unsigned int name_skip)
+static struct module_kobject * __init locate_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
kobj = kset_find_obj(module_kset, name);
if (kobj) {
- /* We already have one. Remove params so we can add more. */
mk = to_module_kobject(kobj);
- /* We need to remove it before adding parameters. */
- sysfs_remove_group(&mk->kobj, &mk->mp->grp);
} else {
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
"%s", name);
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR "Module '%s' failed add to sysfs, "
- "error number %d\n", name, err);
- printk(KERN_ERR "The system will be unstable now.\n");
- return;
+ printk(KERN_ERR
+ "Module '%s' failed add to sysfs, error number %d\n",
+ name, err);
+ printk(KERN_ERR
+ "The system will be unstable now.\n");
+ return NULL;
}
- /* So that exit path is even. */
+
+ /* So that we hold reference in both cases. */
kobject_get(&mk->kobj);
}
+ return mk;
+}
+
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+ int err;
+
+ mk = locate_module_kobject(name);
+ if (!mk)
+ return;
+
+ /* We need to remove old parameters before adding more. */
+ if (mk->mp)
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+
/* These should not fail at boot. */
err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
}
}
+ssize_t __modver_version_show(struct module_attribute *mattr,
+ struct module *mod, char *buf)
+{
+ struct module_version_attribute *vattr =
+ container_of(mattr, struct module_version_attribute, mattr);
+
+ return sprintf(buf, "%s\n", vattr->version);
+}
+
+extern struct module_version_attribute __start___modver[], __stop___modver[];
+
+static void __init version_sysfs_builtin(void)
+{
+ const struct module_version_attribute *vattr;
+ struct module_kobject *mk;
+ int err;
+
+ for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+ mk = locate_module_kobject(vattr->module_name);
+ if (mk) {
+ err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+ kobject_uevent(&mk->kobj, KOBJ_ADD);
+ kobject_put(&mk->kobj);
+ }
+ }
+}
/* module-related sysfs stuff */
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
}
module_sysfs_initialized = 1;
+ version_sysfs_builtin();
param_sysfs_builtin();
return 0;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 84522c796987..126a302c481c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /*
- * Can't attach events to a dying task.
- */
- err = -ESRCH;
- if (task->flags & PF_EXITING)
- goto errout;
-
/* Reuse ptrace permission checks for now. */
err = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2268,14 +2261,27 @@ retry:
get_ctx(ctx);
- if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
- /*
- * We raced with some other task; use
- * the context they set.
- */
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task().
+ * we must see PF_EXITING, it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
put_task_struct(task);
kfree(ctx);
- goto retry;
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
}
}
@@ -5374,6 +5380,8 @@ free_dev:
goto out;
}
+static struct lock_class_key cpuctx_mutex;
+
int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;
@@ -5422,6 +5430,7 @@ skip_type:
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
@@ -6127,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* scheduled, so we are now safe from rescheduling changing
* our context.
*/
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
@@ -6440,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
unsigned long flags;
int ret = 0;
- child->perf_event_ctxp[ctxn] = NULL;
-
- mutex_init(&child->perf_event_mutex);
- INIT_LIST_HEAD(&child->perf_event_list);
-
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -6533,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child)
{
int ctxn, ret;
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret)
diff --git a/kernel/printk.c b/kernel/printk.c
index 53d9a9ec88e6..2ddbdc73aade 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -97,7 +97,7 @@ static int console_locked, console_suspended;
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
* It is also used in interesting ways to provide interlocking in
- * release_console_sem().
+ * console_unlock();.
*/
static DEFINE_SPINLOCK(logbuf_lock);
@@ -501,7 +501,7 @@ static void _call_console_drivers(unsigned start,
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
- * The console_sem must be held.
+ * The console_lock must be held.
*/
static void call_console_drivers(unsigned start, unsigned end)
{
@@ -604,11 +604,11 @@ static int have_callable_console(void)
*
* This is printk(). It can be called from any context. We want it to work.
*
- * We try to grab the console_sem. If we succeed, it's easy - we log the output and
+ * We try to grab the console_lock. If we succeed, it's easy - we log the output and
* call the console drivers. If we fail to get the semaphore we place the output
* into the log buffer and return. The current holder of the console_sem will
- * notice the new output in release_console_sem() and will send it to the
- * consoles before releasing the semaphore.
+ * notice the new output in console_unlock(); and will send it to the
+ * consoles before releasing the lock.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
@@ -659,19 +659,19 @@ static inline int can_use_console(unsigned int cpu)
/*
* Try to get console ownership to actually show the kernel
* messages from a 'printk'. Return true (and with the
- * console_semaphore held, and 'console_locked' set) if it
+ * console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*
* This gets called with the 'logbuf_lock' spinlock held and
* interrupts disabled. It should return with 'lockbuf_lock'
* released but interrupts still disabled.
*/
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
int retval = 0;
- if (!try_acquire_console_sem()) {
+ if (console_trylock()) {
retval = 1;
/*
@@ -827,12 +827,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* actual magic (print out buffers, wake up klogd,
* etc).
*
- * The acquire_console_semaphore_for_printk() function
+ * The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
*/
- if (acquire_console_semaphore_for_printk(this_cpu))
- release_console_sem();
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
lockdep_on();
out_restore_irqs:
@@ -993,7 +993,7 @@ void suspend_console(void)
if (!console_suspend_enabled)
return;
printk("Suspending console(s) (use no_console_suspend to debug)\n");
- acquire_console_sem();
+ console_lock();
console_suspended = 1;
up(&console_sem);
}
@@ -1004,7 +1004,7 @@ void resume_console(void)
return;
down(&console_sem);
console_suspended = 0;
- release_console_sem();
+ console_unlock();
}
/**
@@ -1027,21 +1027,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
- acquire_console_sem();
- release_console_sem();
+ console_lock();
+ console_unlock();
}
return NOTIFY_OK;
}
/**
- * acquire_console_sem - lock the console system for exclusive use.
+ * console_lock - lock the console system for exclusive use.
*
- * Acquires a semaphore which guarantees that the caller has
+ * Acquires a lock which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
-void acquire_console_sem(void)
+void console_lock(void)
{
BUG_ON(in_interrupt());
down(&console_sem);
@@ -1050,21 +1050,29 @@ void acquire_console_sem(void)
console_locked = 1;
console_may_schedule = 1;
}
-EXPORT_SYMBOL(acquire_console_sem);
+EXPORT_SYMBOL(console_lock);
-int try_acquire_console_sem(void)
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Tried to acquire a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
{
if (down_trylock(&console_sem))
- return -1;
+ return 0;
if (console_suspended) {
up(&console_sem);
- return -1;
+ return 0;
}
console_locked = 1;
console_may_schedule = 0;
- return 0;
+ return 1;
}
-EXPORT_SYMBOL(try_acquire_console_sem);
+EXPORT_SYMBOL(console_trylock);
int is_console_locked(void)
{
@@ -1095,20 +1103,20 @@ void wake_up_klogd(void)
}
/**
- * release_console_sem - unlock the console system
+ * console_unlock - unlock the console system
*
- * Releases the semaphore which the caller holds on the console system
+ * Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
- * While the semaphore was held, console output may have been buffered
- * by printk(). If this is the case, release_console_sem() emits
- * the output prior to releasing the semaphore.
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
*
* If there is output waiting for klogd, we wake it up.
*
- * release_console_sem() may be called from any context.
+ * console_unlock(); may be called from any context.
*/
-void release_console_sem(void)
+void console_unlock(void)
{
unsigned long flags;
unsigned _con_start, _log_end;
@@ -1141,7 +1149,7 @@ void release_console_sem(void)
if (wake_klogd)
wake_up_klogd();
}
-EXPORT_SYMBOL(release_console_sem);
+EXPORT_SYMBOL(console_unlock);
/**
* console_conditional_schedule - yield the CPU if required
@@ -1150,7 +1158,7 @@ EXPORT_SYMBOL(release_console_sem);
* if this CPU should yield the CPU to another task, do
* so here.
*
- * Must be called within acquire_console_sem().
+ * Must be called within console_lock();.
*/
void __sched console_conditional_schedule(void)
{
@@ -1171,14 +1179,14 @@ void console_unblank(void)
if (down_trylock(&console_sem) != 0)
return;
} else
- acquire_console_sem();
+ console_lock();
console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
- release_console_sem();
+ console_unlock();
}
/*
@@ -1189,7 +1197,7 @@ struct tty_driver *console_device(int *index)
struct console *c;
struct tty_driver *driver = NULL;
- acquire_console_sem();
+ console_lock();
for_each_console(c) {
if (!c->device)
continue;
@@ -1197,7 +1205,7 @@ struct tty_driver *console_device(int *index)
if (driver)
break;
}
- release_console_sem();
+ console_unlock();
return driver;
}
@@ -1208,17 +1216,17 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags &= ~CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags |= CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_start);
@@ -1340,7 +1348,7 @@ void register_console(struct console *newcon)
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
- acquire_console_sem();
+ console_lock();
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
newcon->next = console_drivers;
console_drivers = newcon;
@@ -1352,14 +1360,14 @@ void register_console(struct console *newcon)
}
if (newcon->flags & CON_PRINTBUFFER) {
/*
- * release_console_sem() will print out the buffered messages
+ * console_unlock(); will print out the buffered messages
* for us.
*/
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
- release_console_sem();
+ console_unlock();
console_sysfs_notify();
/*
@@ -1396,7 +1404,7 @@ int unregister_console(struct console *console)
return braille_unregister_console(console);
#endif
- acquire_console_sem();
+ console_lock();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
@@ -1418,7 +1426,7 @@ int unregister_console(struct console *console)
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
- release_console_sem();
+ console_unlock();
console_sysfs_notify();
return res;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index ea3e5eff3878..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
-
- /* BKL stats */
- unsigned int bkl_count;
#endif
};
@@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
struct task_group *tg;
struct cgroup_subsys_state *css;
+ if (p->flags & PF_EXITING)
+ return &root_task_group;
+
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
@@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(this_rq(), rq_sched_info.bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
@@ -4871,7 +4871,8 @@ recheck:
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EPERM;
@@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
}
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
@@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.destroy = cpu_cgroup_destroy,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b8f84c..9fb656283157 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
{
struct autogroup *ag = container_of(kref, struct autogroup, kref);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* We've redirected RT tasks to the root task group... */
+ ag->tg->rt_se = NULL;
+ ag->tg->rt_rq = NULL;
+#endif
sched_destroy_group(ag->tg);
}
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
return ag;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
static inline struct autogroup *autogroup_create(void)
{
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Autogroup RT tasks are redirected to the root task group
+ * so we don't have to move tasks around upon policy change,
+ * or flail around trying to allocate bandwidth on the fly.
+ * A bandwidth exception in __sched_setscheduler() allows
+ * the policy change to proceed. Thereafter, task_group()
+ * returns &root_task_group, so zero bandwidth is required.
+ */
+ free_rt_sched_group(tg);
+ tg->rt_se = root_task_group.rt_se;
+ tg->rt_rq = root_task_group.rt_rq;
+#endif
tg->autogroup = ag;
return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return tg != &root_task_group && tg->autogroup;
+}
+
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
@@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (!enabled || !tg->autogroup)
+ return 0;
+
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..7b859ffe5dad 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return 0;
+}
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+static DEFINE_SPINLOCK(sched_debug_lock);
+
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
#endif
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+ if (autogroup_path(tg, group_path, PATH_MAX))
+ return group_path;
+
+ /*
+ * May be NULL if the underlying cgroup isn't fully-created yet
+ */
+ if (!tg->css.cgroup) {
+ group_path[0] = '\0';
+ return group_path;
+ }
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
+}
+#endif
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
+#ifdef CONFIG_CGROUP_SCHED
+ SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
SEQ_printf(m, "\n");
}
@@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
#ifdef CONFIG_X86
{
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
P(ttwu_count);
P(ttwu_local);
- P(bkl_count);
+ SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
+ rq->rq_sched_info.bkl_count);
#undef P
+#undef P64
#endif
+ spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ rcu_read_lock();
print_rq(m, rq, cpu);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&sched_debug_lock, flags);
}
static const char *sched_tunable_scaling_names[] = {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
int global_update)
{
@@ -721,10 +722,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
- if (!cfs_rq)
+ if (cfs_rq->tg == &root_task_group)
return;
- now = rq_of(cfs_rq)->clock;
+ now = rq_of(cfs_rq)->clock_task;
delta = now - cfs_rq->load_stamp;
/* truncate load history at 4 idle periods */
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ long load_weight, load, shares;
+
+ load = cfs_rq->load.weight + weight_delta;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -782,41 +828,20 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
struct task_group *tg;
struct sched_entity *se;
- long load_weight, load, shares;
-
- if (!cfs_rq)
- return;
+ long shares;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se)
return;
-
- load = cfs_rq->load.weight + weight_delta;
-
- load_weight = atomic_read(&tg->load_weight);
- load_weight -= cfs_rq->load_contribution;
- load_weight += load;
-
- shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
reweight_entity(cfs_rq_of(se), se, shares);
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
- }
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
@@ -1062,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se = __pick_next_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
+ if (delta < 0)
+ return;
+
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
@@ -1362,27 +1390,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long S, rw, s, a, b;
+ long lw, w;
- S = se->my_q->tg->shares;
- s = se->load.weight;
- rw = se->my_q->load.weight;
+ tg = se->my_q->tg;
+ w = se->my_q->load.weight;
- a = S*(rw + wl);
- b = S*rw + s*wg;
+ /* use this cpu's instantaneous contribution */
+ lw = atomic_read(&tg->load_weight);
+ lw -= se->my_q->load_contribution;
+ lw += w + wg;
- wl = s*(a-b);
+ wl += w;
- if (likely(b))
- wl /= b;
+ if (lw > 0 && wl < lw)
+ wl = (wl * tg->shares) / lw;
+ else
+ wl = tg->shares;
- /*
- * Assume the group is already running and will
- * thus already be accounted for in the weight.
- *
- * That is, moving shares between CPUs, does not
- * alter the group weight.
- */
+ /* zero point is MIN_SHARES */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+ wl -= se->load.weight;
wg = 0;
}
@@ -1401,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- unsigned long this_load, load;
+ s64 this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
struct task_group *tg;
@@ -1440,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load) {
- unsigned long this_eff_load, prev_eff_load;
+ if (this_load > 0) {
+ s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e069987..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
+ void (*func) (void *info);
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
+ /*
+ * Since we walk the list without any locks, we might
+ * see an entry that was completed, removed from the
+ * list and is in the process of being reused.
+ *
+ * We must check that the cpu is in the cpumask before
+ * checking the refs, and both must be set before
+ * executing the callback on this cpu.
+ */
+
+ if (!cpumask_test_cpu(cpu, data->cpumask))
+ continue;
+
+ smp_rmb();
+
+ if (atomic_read(&data->refs) == 0)
continue;
+ func = data->csd.func; /* for later warn */
data->csd.func(data->csd.info);
+ /*
+ * If the cpu mask is not still set then it enabled interrupts,
+ * we took another smp interrupt, and executed the function
+ * twice on this cpu. In theory that copy decremented refs.
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
+ WARN(1, "%pS enabled interrupts and double executed\n",
+ func);
+ continue;
+ }
+
refs = atomic_dec_return(&data->refs);
WARN_ON(refs < 0);
- if (!refs) {
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
- }
if (refs)
continue;
+ WARN_ON(!cpumask_empty(data->cpumask));
+
+ raw_spin_lock(&call_function.lock);
+ list_del_rcu(&data->csd.list);
+ raw_spin_unlock(&call_function.lock);
+
csd_unlock(&data->csd);
}
@@ -430,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
* can't happen.
*/
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress);
+ && !oops_in_progress && !early_boot_irqs_disabled);
/* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -454,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
data->csd.func = func;
data->csd.info = info;
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
+
+ /*
+ * To ensure the interrupt handler gets an complete view
+ * we order the cpumask and refs writes and order the read
+ * of them in the interrupt handler. In addition we may
+ * only clear our own cpu bit from the mask.
+ */
+ smp_wmb();
+
atomic_set(&data->refs, cpumask_weight(data->cpumask));
raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -533,17 +572,20 @@ void ipi_call_unlock_irq(void)
#endif /* USE_GENERIC_SMP_HELPERS */
/*
- * Call a function on all processors
+ * Call a function on all processors. May be used during early boot while
+ * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
*/
int on_each_cpu(void (*func) (void *info), void *info, int wait)
{
+ unsigned long flags;
int ret = 0;
preempt_disable();
ret = smp_call_function(func, info, wait);
- local_irq_disable();
+ local_irq_save(flags);
func(info);
- local_irq_enable();
+ local_irq_restore(flags);
preempt_enable();
return ret;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 31b71a276b40..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1385,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
const struct cred *cred = current_cred(), *tcred;
tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
+ if (current != task &&
+ (cred->uid != tcred->euid ||
cred->uid != tcred->suid ||
cred->uid != tcred->uid ||
cred->gid != tcred->egid ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bc86bb32e126..0f1bd83db985 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */
+/* Note: sysrq code uses it's own private copy */
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *lenp,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
}
local_irq_enable();
- printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
- smp_processor_id());
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
}
/*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
}
#ifdef CONFIG_NO_HZ
- if (tick_nohz_enabled)
+ if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
+ }
#endif
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
* Stubs:
*/
-void early_boot_irqs_off(void)
-{
-}
-
-void early_boot_irqs_on(void)
-{
-}
-
void trace_softirqs_on(unsigned long ip)
{
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec82f88a..11869faa6819 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
worker->flags &= ~flags;
- /* if transitioning out of NOT_RUNNING, increment nr_running */
+ /*
+ * If transitioning out of NOT_RUNNING, increment nr_running. Note
+ * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
+ * of multiple flags, not a single flag.
+ */
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -1840,7 +1844,7 @@ __acquires(&gcwq->lock)
spin_unlock_irq(&gcwq->lock);
work_clear_pending(work);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
f(work);
@@ -2384,8 +2388,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
insert_wq_barrier(cwq, barr, work, worker);
spin_unlock_irq(&gcwq->lock);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ /*
+ * If @max_active is 1 or rescuer is in use, flushing another work
+ * item on the same workqueue may lead to deadlock. Make sure the
+ * flusher is not running on the same workqueue by verifying write
+ * access.
+ */
+ if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ else
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_release(&cwq->wq->lockdep_map);
+
return true;
already_gone:
spin_unlock_irq(&gcwq->lock);