about | summary | refs | log | tree | commit | diff | stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- drivers/acpi/nvs.c (renamed from kernel/power/nvs.c) 19
-rw-r--r-- kernel/Makefile 5
-rw-r--r-- kernel/audit.c 2
-rw-r--r-- kernel/capability.c 2
-rw-r--r-- kernel/cgroup.c 45
-rw-r--r-- kernel/cpuset.c 7
-rw-r--r-- kernel/cred.c 16
-rw-r--r-- kernel/debug/kdb/kdb_main.c 2
-rw-r--r-- kernel/exit.c 14
-rw-r--r-- kernel/fork.c 41
-rw-r--r-- kernel/freezer.c 9
-rw-r--r-- kernel/futex.c 62
-rw-r--r-- kernel/hrtimer.c 2
-rw-r--r-- kernel/irq/Kconfig 3
-rw-r--r-- kernel/irq/handle.c 111
-rw-r--r-- kernel/irq/internals.h 6
-rw-r--r-- kernel/irq/irqdesc.c 51
-rw-r--r-- kernel/irq/manage.c 2
-rw-r--r-- kernel/irq/migration.c 14
-rw-r--r-- kernel/irq/resend.c 2
-rw-r--r-- kernel/kexec.c 2
-rw-r--r-- kernel/latencytop.c 23
-rw-r--r-- kernel/lockdep.c 18
-rw-r--r-- kernel/module.c 16
-rw-r--r-- kernel/panic.c 1
-rw-r--r-- kernel/params.c 65
-rw-r--r-- kernel/perf_event.c 182
-rw-r--r-- kernel/power/Kconfig 5
-rw-r--r-- kernel/power/Makefile 6
-rw-r--r-- kernel/power/hibernate.c 11
-rw-r--r-- kernel/power/main.c 2
-rw-r--r-- kernel/power/process.c 14
-rw-r--r-- kernel/power/snapshot.c 7
-rw-r--r-- kernel/power/suspend.c 6
-rw-r--r-- kernel/power/swap.c 7
-rw-r--r-- kernel/printk.c 184
-rw-r--r-- kernel/ptrace.c 8
-rw-r--r-- kernel/rcutiny.c 3
-rw-r--r-- kernel/sched.c 28
-rw-r--r-- kernel/sched_autogroup.c 32
-rw-r--r-- kernel/sched_autogroup.h 4
-rw-r--r-- kernel/sched_debug.c 42
-rw-r--r-- kernel/sched_fair.c 126
-rw-r--r-- kernel/sched_rt.c 2
-rw-r--r-- kernel/smp.c 75
-rw-r--r-- kernel/softirq.c 19
-rw-r--r-- kernel/srcu.c 15
-rw-r--r-- kernel/sys.c 9
-rw-r--r-- kernel/sysctl.c 34
-rw-r--r-- kernel/sysctl_binary.c 2
-rw-r--r-- kernel/taskstats.c 2
-rw-r--r-- kernel/time.c 4
-rw-r--r-- kernel/time/clocksource.c 10
-rw-r--r-- kernel/time/ntp.c 425
-rw-r--r-- kernel/time/tick-broadcast.c 10
-rw-r--r-- kernel/time/tick-common.c 6
-rw-r--r-- kernel/time/tick-internal.h 3
-rw-r--r-- kernel/time/tick-sched.c 7
-rw-r--r-- kernel/time/timekeeping.c 47
-rw-r--r-- kernel/time/timer_list.c 4
-rw-r--r-- kernel/timer.c 8
-rw-r--r-- kernel/trace/Makefile 2
-rw-r--r-- kernel/trace/blktrace.c 60
-rw-r--r-- kernel/trace/trace.c 6
-rw-r--r-- kernel/trace/trace_entries.h 2
-rw-r--r-- kernel/trace/trace_events.c 12
-rw-r--r-- kernel/trace/trace_export.c 6
-rw-r--r-- kernel/trace/trace_irqsoff.c 8
-rw-r--r-- kernel/trace/trace_syscalls.c 52
-rw-r--r-- kernel/tracepoint.c 31
-rw-r--r-- kernel/user_namespace.c 15
-rw-r--r-- kernel/watchdog.c 53
-rw-r--r-- kernel/workqueue.c 57
73 files changed, 1439 insertions, 754 deletions
diff --git a/kernel/power/nvs.c b/drivers/acpi/nvs.c
index 1836db60bbb6..fa5a1df42b79 100644
--- a/kernel/power/nvs.c
+++ b/drivers/acpi/nvs.c
@@ -1,7 +1,7 @@
/*
- * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ * nvs.c - Routines for saving and restoring ACPI NVS memory region
*
- * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ * Copyright (C) 2008-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file is released under the GPLv2.
*/
@@ -11,7 +11,9 @@
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <linux/acpi_io.h>
+#include <acpi/acpiosxf.h>
/*
* Platforms, like ACPI, may want us to save some memory used by them during
@@ -105,7 +107,7 @@ int suspend_nvs_alloc(void)
/**
* suspend_nvs_save - save NVS memory regions
*/
-void suspend_nvs_save(void)
+int suspend_nvs_save(void)
{
struct nvs_page *entry;
@@ -113,9 +115,16 @@ void suspend_nvs_save(void)
list_for_each_entry(entry, &nvs_list, node)
if (entry->data) {
- entry->kaddr = ioremap(entry->phys_start, entry->size);
+ entry->kaddr = acpi_os_ioremap(entry->phys_start,
+ entry->size);
+ if (!entry->kaddr) {
+ suspend_nvs_free();
+ return -ENOMEM;
+ }
memcpy(entry->data, entry->kaddr, entry->size);
}
+
+ return 0;
}
/**
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o
@@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
# Info from config_data can be extracted from /proc/config*
targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
quiet_cmd_ikconfiggz = IKCFG $@
diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd dissapeared\n");
+ audit_log_lost("auditd disappeared\n");
audit_pid = 0;
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
BUG();
}
- if (security_capable(cap) == 0) {
+ if (security_capable(current_cred(), cap) == 0) {
current->flags |= PF_SUPERPRIV;
return 1;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 51cddc11cd85..b24d7027b83c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -763,9 +763,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
* -> cgroup_mkdir.
*/
-static struct dentry *cgroup_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd);
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
@@ -862,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
iput(inode);
}
+static int cgroup_delete(const struct dentry *d)
+{
+ return 1;
+}
+
static void remove_dir(struct dentry *d)
{
struct dentry *parent = dget(d->d_parent);
@@ -912,7 +916,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
parent = dentry->d_parent;
spin_lock(&parent->d_lock);
- spin_lock(&dentry->d_lock);
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
list_del_init(&dentry->d_u.d_child);
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
@@ -1451,6 +1455,11 @@ static int cgroup_set_super(struct super_block *sb, void *data)
static int cgroup_get_rootdir(struct super_block *sb)
{
+ static const struct dentry_operations cgroup_dops = {
+ .d_iput = cgroup_diput,
+ .d_delete = cgroup_delete,
+ };
+
struct inode *inode =
cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
struct dentry *dentry;
@@ -1468,6 +1477,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
return -ENOMEM;
}
sb->s_root = dentry;
+ /* for everything else we want ->d_op set */
+ sb->s_d_op = &cgroup_dops;
return 0;
}
@@ -2197,6 +2208,14 @@ static const struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+ d_add(dentry, NULL);
+ return NULL;
+}
+
/*
* Check if a file is a control file
*/
@@ -2207,26 +2226,6 @@ static inline struct cftype *__file_cft(struct file *file)
return __d_cft(file->f_dentry);
}
-static int cgroup_delete_dentry(const struct dentry *dentry)
-{
- return 1;
-}
-
-static struct dentry *cgroup_lookup(struct inode *dir,
- struct dentry *dentry, struct nameidata *nd)
-{
- static const struct dentry_operations cgroup_dentry_operations = {
- .d_delete = cgroup_delete_dentry,
- .d_iput = cgroup_diput,
- };
-
- if (dentry->d_name.len > NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
- d_set_d_op(dentry, &cgroup_dentry_operations);
- d_add(dentry, NULL);
- return NULL;
-}
-
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..e92e98189032 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
return -ENODEV;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out;
+ }
switch (cft->private) {
case FILE_CPULIST:
@@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
+out:
cgroup_unlock();
return retval;
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
#endif
atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
return new;
error:
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_uid(new->user);
get_group_info(new->group_info);
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
put_cred(old);
validate_creds(new);
return new;
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
if (cred->magic != CRED_MAGIC)
return true;
#ifdef CONFIG_SECURITY_SELINUX
- if (selinux_is_enabled()) {
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index a6e729766821..bd3e8e29caa3 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2914,7 +2914,7 @@ static void __init kdb_cmd_init(void)
}
}
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
void __init kdb_init(int lvl)
{
static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/exit.c b/kernel/exit.c
index 89c74861a3da..f9a45ebcc7b1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
exit_fs(tsk);
check_stack_usage();
exit_thread();
+
+ /*
+ * Flush inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ *
+ * because of cgroup mode, must be called before cgroup_exit()
+ */
+ perf_event_exit_task(tsk);
+
cgroup_exit(tsk, 1);
if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- */
- perf_event_exit_task(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
+#include <linux/khugepaged.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
retval = ksm_fork(mm, oldmm);
if (retval)
goto out;
+ retval = khugepaged_fork(mm, oldmm);
+ if (retval)
+ goto out;
prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
@@ -529,6 +533,9 @@ void __mmdrop(struct mm_struct *mm)
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ VM_BUG_ON(mm->pmd_huge_pte);
+#endif
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -543,6 +550,7 @@ void mmput(struct mm_struct *mm)
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
@@ -669,6 +677,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
mm->token_priority = 0;
mm->last_interval = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ mm->pmd_huge_pte = NULL;
+#endif
+
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -910,6 +922,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
@@ -1410,23 +1423,6 @@ long do_fork(unsigned long clone_flags,
}
/*
- * We hope to recycle these flags after 2.6.26
- */
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- static int __read_mostly count = 100;
-
- if (count > 0 && printk_ratelimit()) {
- char comm[TASK_COMM_LEN];
-
- count--;
- printk(KERN_INFO "fork(): process `%s' used deprecated "
- "clone flags 0x%lx\n",
- get_task_comm(comm, current),
- clone_flags & CLONE_STOPPED);
- }
- }
-
- /*
* When called from kernel_thread, don't do user tracing stuff.
*/
if (likely(user_mode(regs)))
@@ -1464,16 +1460,7 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
- if (unlikely(clone_flags & CLONE_STOPPED)) {
- /*
- * We'll start up with an immediate SIGSTOP.
- */
- sigaddset(&p->pending.signal, SIGSTOP);
- set_tsk_thread_flag(p, TIF_SIGPENDING);
- __set_task_state(p, TASK_STOPPED);
- } else {
- wake_up_new_task(p, clone_flags);
- }
+ wake_up_new_task(p, clone_flags);
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
}
if (should_send_signal(p)) {
- if (!signal_pending(p))
- fake_signal_wake_up(p);
+ fake_signal_wake_up(p);
+ /*
+ * fake_signal_wake_up() goes through p's scheduler
+ * lock and guarantees that TASK_STOPPED/TRACED ->
+ * TASK_RUNNING transition can't race with task state
+ * testing in try_to_freeze_tasks().
+ */
} else if (sig_only) {
return false;
} else {
diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..b766d28accd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page;
+ struct page *page, *page_head;
int err;
/*
@@ -265,11 +265,46 @@ again:
if (err < 0)
return err;
- page = compound_head(page);
- lock_page(page);
- if (!page->mapping) {
- unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ page_head = page;
+ if (unlikely(PageTail(page))) {
put_page(page);
+ /* serialize against __split_huge_page_splitting() */
+ local_irq_disable();
+ if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ page_head = compound_head(page);
+ /*
+ * page_head is valid pointer but we must pin
+ * it before taking the PG_lock and/or
+ * PG_compound_lock. The moment we re-enable
+ * irqs __split_huge_page_splitting() can
+ * return and the head page can be freed from
+ * under us. We can't take the PG_lock and/or
+ * PG_compound_lock on a page that could be
+ * freed from under us.
+ */
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ goto again;
+ }
+ }
+#else
+ page_head = compound_head(page);
+ if (page != page_head) {
+ get_page(page_head);
+ put_page(page);
+ }
+#endif
+
+ lock_page(page_head);
+ if (!page_head->mapping) {
+ unlock_page(page_head);
+ put_page(page_head);
goto again;
}
@@ -280,20 +315,20 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (PageAnon(page_head)) {
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page->mapping->host;
- key->shared.pgoff = page->index;
+ key->shared.inode = page_head->mapping->host;
+ key->shared.pgoff = page_head->index;
}
get_futex_key_refs(key);
- unlock_page(page);
- put_page(page);
+ unlock_page(page_head);
+ put_page(page_head);
return 0;
}
@@ -791,10 +826,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
- * This happens when we have stolen the lock and the original
- * pending owner did not enqueue itself back on the rt_mutex.
- * Thats not a tragedy. We know that way, that a lock waiter
- * is on the fly. We make the futex_q waiter the pending owner.
+ * It is possible that the next waiter (the one that brought
+ * this owner to the kernel) timed out and is no longer
+ * waiting on the lock.
*/
if (!new_owner)
new_owner = this->task;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 45da2b6920ab..0c8d7c048615 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1745,7 +1745,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
}
/*
- * A NULL parameter means "inifinte"
+ * A NULL parameter means "infinite"
*/
if (!expires) {
schedule();
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
config GENERIC_HARDIRQS
def_bool y
-config GENERIC_HARDIRQS_NO__DO_IRQ
- def_bool y
-
# Select this to disable the deprecated stuff
config GENERIC_HARDIRQS_NO_DEPRECATED
def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
return retval;
}
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-
-#ifdef CONFIG_ENABLE_WARN_DEPRECATED
-# warning __do_IRQ is deprecated. Please convert to proper flow handlers
-#endif
-
-/**
- * __do_IRQ - original all in one highlevel IRQ handler
- * @irq: the interrupt number
- *
- * __do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * This is the original x86 implementation which is used for every
- * interrupt type.
- */
-unsigned int __do_IRQ(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
- unsigned int status;
-
- kstat_incr_irqs_this_cpu(irq, desc);
-
- if (CHECK_IRQ_PER_CPU(desc->status)) {
- irqreturn_t action_ret;
-
- /*
- * No locking required for CPU-local interrupts:
- */
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- if (likely(!(desc->status & IRQ_DISABLED))) {
- action_ret = handle_IRQ_event(irq, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
- }
- desc->irq_data.chip->end(irq);
- return 1;
- }
-
- raw_spin_lock(&desc->lock);
- if (desc->irq_data.chip->ack)
- desc->irq_data.chip->ack(irq);
- /*
- * REPLAY is when Linux resends an IRQ that was dropped earlier
- * WAITING is used by probe to mark irqs that are being tested
- */
- status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
- status |= IRQ_PENDING; /* we _want_ to handle it */
-
- /*
- * If the IRQ is disabled for whatever reason, we cannot
- * use the action we have.
- */
- action = NULL;
- if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
- action = desc->action;
- status &= ~IRQ_PENDING; /* we commit to handling */
- status |= IRQ_INPROGRESS; /* we are handling it */
- }
- desc->status = status;
-
- /*
- * If there is no IRQ handler or it was disabled, exit early.
- * Since we set PENDING, if another processor is handling
- * a different instance of this same irq, the other processor
- * will take care of it.
- */
- if (unlikely(!action))
- goto out;
-
- /*
- * Edge triggered interrupts need to remember
- * pending events.
- * This applies to any hw interrupts that allow a second
- * instance of the same irq to arrive while we are in do_IRQ
- * or in the handler. But the code here only handles the _second_
- * instance of the irq, not the third or fourth. So it is mostly
- * useful for irq hardware that does not mask cleanly in an
- * SMP environment.
- */
- for (;;) {
- irqreturn_t action_ret;
-
- raw_spin_unlock(&desc->lock);
-
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- if (likely(!(desc->status & IRQ_PENDING)))
- break;
- desc->status &= ~IRQ_PENDING;
- }
- desc->status &= ~IRQ_INPROGRESS;
-
-out:
- /*
- * The ->end() handler has to deal with interrupts which got
- * disabled while the handler was running.
- */
- desc->irq_data.chip->end(irq);
- raw_spin_unlock(&desc->lock);
-
- return 1;
-}
-#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..99c3bc8a6fb4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -3,6 +3,12 @@
*/
#include <linux/irqdesc.h>
+#ifdef CONFIG_SPARSE_IRQ
+# define IRQ_BITMAP_BITS (NR_IRQS + 8196)
+#else
+# define IRQ_BITMAP_BITS NR_IRQS
+#endif
+
extern int noirqdebug;
#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..2039bea31bdf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
{
+ int cpu;
+
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
desc->irq_count = 0;
desc->irqs_unhandled = 0;
desc->name = NULL;
- memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
desc_smp_init(desc, node);
}
@@ -91,7 +94,7 @@ int nr_irqs = NR_IRQS;
EXPORT_SYMBOL_GPL(nr_irqs);
static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
#ifdef CONFIG_SPARSE_IRQ
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
if (!desc)
return NULL;
/* allocate based on nr_cpu_ids */
- desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
- gfp, node);
+ desc->kstat_irqs = alloc_percpu(unsigned int);
if (!desc->kstat_irqs)
goto err_desc;
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
return desc;
err_kstat:
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
err_desc:
kfree(desc);
return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
mutex_unlock(&sparse_irq_lock);
free_masks(desc);
- kfree(desc->kstat_irqs);
+ free_percpu(desc->kstat_irqs);
kfree(desc);
}
@@ -215,6 +217,15 @@ int __init early_irq_init(void)
initcnt = arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
+ if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
+ nr_irqs = IRQ_BITMAP_BITS;
+
+ if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
+ initcnt = IRQ_BITMAP_BITS;
+
+ if (initcnt > nr_irqs)
+ nr_irqs = initcnt;
+
for (i = 0; i < initcnt; i++) {
desc = alloc_desc(i, node);
set_bit(i, allocated_irqs);
@@ -234,7 +245,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
}
};
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
int __init early_irq_init(void)
{
int count, i, node = first_online_node;
@@ -250,7 +260,8 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].irq_data.irq = i;
desc[i].irq_data.chip = &no_irq_chip;
- desc[i].kstat_irqs = kstat_irqs_all[i];
+ /* TODO : do this allocation on-demand ... */
+ desc[i].kstat_irqs = alloc_percpu(unsigned int);
alloc_masks(desc + i, GFP_KERNEL, node);
desc_smp_init(desc + i, node);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +286,22 @@ static void free_desc(unsigned int irq)
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
+#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
+ struct irq_desc *desc;
+ unsigned int i;
+
+ for (i = 0; i < cnt; i++) {
+ desc = irq_to_desc(start + i);
+ if (desc && !desc->kstat_irqs) {
+ unsigned int __percpu *stats = alloc_percpu(unsigned int);
+
+ if (!stats)
+ return -1;
+ if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
+ free_percpu(stats);
+ }
+ }
+#endif
return start;
}
#endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +418,9 @@ void dynamic_irq_cleanup(unsigned int irq)
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
struct irq_desc *desc = irq_to_desc(irq);
- return desc ? desc->kstat_irqs[cpu] : 0;
+
+ return desc && desc->kstat_irqs ?
+ *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +430,10 @@ unsigned int kstat_irqs(unsigned int irq)
int cpu;
int sum = 0;
- if (!desc)
+ if (!desc || !desc->kstat_irqs)
return 0;
for_each_possible_cpu(cpu)
- sum += desc->kstat_irqs[cpu];
+ sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..9033c1c70828 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1100,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (retval)
kfree(action);
-#ifdef CONFIG_DEBUG_SHIRQ
+#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..441fd629ff04 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
void move_native_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
+ bool masked;
if (likely(!(desc->status & IRQ_MOVE_PENDING)))
return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
if (unlikely(desc->status & IRQ_DISABLED))
return;
- desc->irq_data.chip->irq_mask(&desc->irq_data);
+ /*
+ * Be careful vs. already masked interrupts. If this is a
+ * threaded interrupt with ONESHOT set, we can end up with an
+ * interrupt storm.
+ */
+ masked = desc->status & IRQ_MASKED;
+ if (!masked)
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
move_masked_irq(irq);
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ if (!masked)
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
}
-
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..dc49358b73fa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
#ifdef CONFIG_HARDIRQS_SW_RESEND
/* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
/*
* Run software resends of IRQ's
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
* just verifies it is an address we can use.
*
* Since the kernel does everything in page size chunks ensure
- * the destination addreses are page aligned. Too many
+ * the destination addresses are page aligned. Too many
* special cases crop of when we don't do this. The most
* insidious is getting overlapping destination addresses
* simply because addresses are changed to page size
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < MAXLR; i++) {
- if (latency_record[i].backtrace[0]) {
+ struct latency_record *lr = &latency_record[i];
+
+ if (lr->backtrace[0]) {
int q;
- seq_printf(m, "%i %lu %lu ",
- latency_record[i].count,
- latency_record[i].time,
- latency_record[i].max);
+ seq_printf(m, "%i %lu %lu",
+ lr->count, lr->time, lr->max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
- char sym[KSYM_SYMBOL_LEN];
- char *c;
- if (!latency_record[i].backtrace[q])
+ unsigned long bt = lr->backtrace[q];
+ if (!bt)
break;
- if (latency_record[i].backtrace[q] == ULONG_MAX)
+ if (bt == ULONG_MAX)
break;
- sprint_symbol(sym, latency_record[i].backtrace[q]);
- c = strchr(sym, '+');
- if (c)
- *c = 0;
- seq_printf(m, "%s ", sym);
+ seq_printf(m, " %ps", (void *)bt);
}
seq_printf(m, "\n");
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2292,22 +2292,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
}
/*
- * Debugging helper: via this flag we know that we are in
- * 'early bootup code', and will warn about any invalid irqs-on event:
- */
-static int early_boot_irqs_enabled;
-
-void early_boot_irqs_off(void)
-{
- early_boot_irqs_enabled = 0;
-}
-
-void early_boot_irqs_on(void)
-{
- early_boot_irqs_enabled = 1;
-}
-
-/*
* Hardirqs will be enabled:
*/
void trace_hardirqs_on_caller(unsigned long ip)
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
- if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
return;
if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/module.c b/kernel/module.c
index 34e00b708fad..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
#endif
#ifdef CONFIG_TRACEPOINTS
- mod->tracepoints = section_objs(info, "__tracepoints",
- sizeof(*mod->tracepoints),
- &mod->num_tracepoints);
+ mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+ sizeof(*mod->tracepoints_ptrs),
+ &mod->num_tracepoints);
#endif
#ifdef HAVE_JUMP_LABEL
mod->jump_entries = section_objs(info, "__jump_table",
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
- struct tracepoint *tp)
+ struct tracepoint * const *tp)
{
}
EXPORT_SYMBOL(module_layout);
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void)
mutex_lock(&module_mutex);
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
mutex_unlock(&module_mutex);
}
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
else if (iter_mod > iter->module)
iter->tracepoint = NULL;
found = tracepoint_get_iter_range(&iter->tracepoint,
- iter_mod->tracepoints,
- iter_mod->tracepoints
+ iter_mod->tracepoints_ptrs,
+ iter_mod->tracepoints_ptrs
+ iter_mod->num_tracepoints);
if (found) {
iter->module = iter_mod;
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
params[i].ops->free(params[i].arg);
}
-static void __init kernel_add_sysfs_param(const char *name,
- struct kernel_param *kparam,
- unsigned int name_skip)
+static struct module_kobject * __init locate_module_kobject(const char *name)
{
struct module_kobject *mk;
struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
kobj = kset_find_obj(module_kset, name);
if (kobj) {
- /* We already have one. Remove params so we can add more. */
mk = to_module_kobject(kobj);
- /* We need to remove it before adding parameters. */
- sysfs_remove_group(&mk->kobj, &mk->mp->grp);
} else {
mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
"%s", name);
if (err) {
kobject_put(&mk->kobj);
- printk(KERN_ERR "Module '%s' failed add to sysfs, "
- "error number %d\n", name, err);
- printk(KERN_ERR "The system will be unstable now.\n");
- return;
+ printk(KERN_ERR
+ "Module '%s' failed add to sysfs, error number %d\n",
+ name, err);
+ printk(KERN_ERR
+ "The system will be unstable now.\n");
+ return NULL;
}
- /* So that exit path is even. */
+
+ /* So that we hold reference in both cases. */
kobject_get(&mk->kobj);
}
+ return mk;
+}
+
+static void __init kernel_add_sysfs_param(const char *name,
+ struct kernel_param *kparam,
+ unsigned int name_skip)
+{
+ struct module_kobject *mk;
+ int err;
+
+ mk = locate_module_kobject(name);
+ if (!mk)
+ return;
+
+ /* We need to remove old parameters before adding more. */
+ if (mk->mp)
+ sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+
/* These should not fail at boot. */
err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
}
}
+/*
+ * Show callback for a module version attribute: recover the enclosing
+ * module_version_attribute from @mattr and print its version string,
+ * followed by a newline, into @buf. @mod is not used. Returns whatever
+ * sprintf() returns.
+ */
+ssize_t __modver_version_show(struct module_attribute *mattr,
+			      struct module *mod, char *buf)
+{
+	struct module_version_attribute *ver_attr;
+
+	ver_attr = container_of(mattr, struct module_version_attribute, mattr);
+	return sprintf(buf, "%s\n", ver_attr->version);
+}
+
+extern struct module_version_attribute __start___modver[], __stop___modver[];
+
+/*
+ * Walk the __modver table (__start___modver .. __stop___modver) and, for
+ * every entry, create its version attribute file under the kobject that
+ * locate_module_kobject() returns for the entry's module name. Entries
+ * for which no kobject can be located are skipped.
+ */
+static void __init version_sysfs_builtin(void)
+{
+	const struct module_version_attribute *vattr;
+	struct module_kobject *mk;
+	int err;
+
+	for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+		mk = locate_module_kobject(vattr->module_name);
+		if (!mk)
+			continue;
+
+		/* Do not drop a sysfs failure silently; report it once. */
+		err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+		WARN_ON_ONCE(err);
+		kobject_uevent(&mk->kobj, KOBJ_ADD);
+		/* Release the reference locate_module_kobject() left us holding. */
+		kobject_put(&mk->kobj);
+	}
+}
/* module-related sysfs stuff */
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
}
module_sysfs_initialized = 1;
+ version_sysfs_builtin();
param_sysfs_builtin();
return 0;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf1e8cc..656222fcf767 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,12 @@
#include <asm/irq_regs.h>
+/*
+ * Bitmask passed as the event_type argument of the context sched-in and
+ * sched-out helpers; EVENT_ALL is both bits together.
+ */
+enum event_type_t {
+	EVENT_FLEXIBLE	= 1 << 0,
+	EVENT_PINNED	= 1 << 1,
+	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
atomic_t perf_task_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
static atomic64_t perf_event_id;
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
+
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
return "pmu";
}
+/* Clock used for perf timestamps; simply forwards to local_clock(). */
+static inline u64 perf_clock(void)
+{
+	u64 now = local_clock();
+
+	return now;
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
put_ctx(ctx);
}
-static inline u64 perf_clock(void)
-{
- return local_clock();
-}
-
/*
* Update the record of the current time in a context.
*/
@@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
ctx->timestamp = now;
}
+/*
+ * Time value to use when stamping @event: the ->time of the context the
+ * event points to, or 0 when the event has no context.
+ */
+static u64 perf_event_time(struct perf_event *event)
+{
+	struct perf_event_context *evctx = event->ctx;
+
+	if (!evctx)
+		return 0;
+
+	return evctx->time;
+}
+
/*
* Update the total_time_enabled and total_time_running fields for a event.
*/
@@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event)
return;
if (ctx->is_active)
- run_end = ctx->time;
+ run_end = perf_event_time(event);
else
run_end = event->tstamp_stopped;
@@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event)
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
- run_end = ctx->time;
+ run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
}
@@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
u64 delta;
/*
* An event which could not be activated because of
@@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event,
&& !event_filter_match(event)) {
delta = ctx->time - event->tstamp_stopped;
event->tstamp_running += delta;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event,
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = ctx->time;
+ event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -763,16 +782,33 @@ retry:
raw_spin_unlock_irq(&ctx->lock);
}
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
static int
event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
event->state = PERF_EVENT_STATE_ACTIVE;
event->oncpu = smp_processor_id();
+
+ /*
+ * Unthrottle events, since we scheduled we might have missed several
+ * ticks already, also for a heavily scheduling task there is little
+ * guarantee it'll get a tick in a timely manner.
+ */
+ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+ perf_log_throttle(event, 1);
+ event->hw.interrupts = 0;
+ }
+
/*
* The new state must be visible before we turn it on in the hardware:
*/
@@ -784,9 +820,9 @@ event_sched_in(struct perf_event *event,
return -EAGAIN;
}
- event->tstamp_running += ctx->time - event->tstamp_stopped;
+ event->tstamp_running += tstamp - event->tstamp_stopped;
- event->shadow_ctx_time = ctx->time - ctx->timestamp;
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -898,11 +934,13 @@ static int group_can_go_on(struct perf_event *event,
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
+ u64 tstamp = perf_event_time(event);
+
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = ctx->time;
- event->tstamp_running = ctx->time;
- event->tstamp_stopped = ctx->time;
+ event->tstamp_enabled = tstamp;
+ event->tstamp_running = tstamp;
+ event->tstamp_stopped = tstamp;
}
/*
@@ -937,7 +975,7 @@ static void __perf_install_in_context(void *info)
add_event_to_ctx(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
@@ -1042,14 +1080,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = ctx->time - event->total_time_enabled;
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled =
- ctx->time - sub->total_time_enabled;
- }
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}
}
@@ -1082,7 +1119,7 @@ static void __perf_event_enable(void *info)
goto unlock;
__perf_event_mark_enabled(event, ctx);
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
goto unlock;
/*
@@ -1193,12 +1230,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
return 0;
}
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
@@ -1435,7 +1466,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, 1))
@@ -1467,7 +1498,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1580,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task)
}
}
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_event *event, int enable);
-
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
u64 frequency = event->attr.sample_freq;
@@ -1694,7 +1721,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
continue;
hwc = &event->hw;
@@ -1885,11 +1912,12 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
+ if (ctx->is_active)
+ update_context_time(ctx);
update_event_times(event);
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
raw_spin_unlock(&ctx->lock);
-
- event->pmu->read(event);
}
static inline u64 perf_event_count(struct perf_event *event)
@@ -1983,8 +2011,7 @@ static int alloc_callchain_buffers(void)
* accessed from NMI. Use a temporary manual per cpu allocation
* until that gets sorted out.
*/
- size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
- num_possible_cpus();
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
entries = kzalloc(size, GFP_KERNEL);
if (!entries)
@@ -2185,13 +2212,6 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /*
- * Can't attach events to a dying task.
- */
- err = -ESRCH;
- if (task->flags & PF_EXITING)
- goto errout;
-
/* Reuse ptrace permission checks for now. */
err = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2212,14 +2232,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
unsigned long flags;
int ctxn, err;
- if (!task && cpu != -1) {
+ if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
- if (cpu < 0 || cpu >= nr_cpumask_bits)
- return ERR_PTR(-EINVAL);
-
/*
* We could be clever and allow to attach a event to an
* offline CPU and activate it when the CPU comes up, but
@@ -2255,14 +2272,27 @@ retry:
get_ctx(ctx);
- if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
- /*
- * We raced with some other task; use
- * the context they set.
- */
+ err = 0;
+ mutex_lock(&task->perf_event_mutex);
+ /*
+ * If it has already passed perf_event_exit_task(),
+ * we must see PF_EXITING; it takes this mutex too.
+ */
+ if (task->flags & PF_EXITING)
+ err = -ESRCH;
+ else if (task->perf_event_ctxp[ctxn])
+ err = -EAGAIN;
+ else
+ rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ mutex_unlock(&task->perf_event_mutex);
+
+ if (unlikely(err)) {
put_task_struct(task);
kfree(ctx);
- goto retry;
+
+ if (err == -EAGAIN)
+ goto retry;
+ goto errout;
}
}
@@ -3893,7 +3923,7 @@ static int perf_event_task_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm || event->attr.mmap ||
@@ -4030,7 +4060,7 @@ static int perf_event_comm_match(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if (event->attr.comm)
@@ -4178,7 +4208,7 @@ static int perf_event_mmap_match(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return 0;
- if (event->cpu != -1 && event->cpu != smp_processor_id())
+ if (!event_filter_match(event))
return 0;
if ((!executable && event->attr.mmap_data) ||
@@ -4648,7 +4678,7 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
@@ -5361,6 +5391,8 @@ free_dev:
goto out;
}
+static struct lock_class_key cpuctx_mutex;
+
int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;
@@ -5409,6 +5441,7 @@ skip_type:
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
@@ -5525,6 +5558,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct hw_perf_event *hwc;
long err;
+ if ((unsigned)cpu >= nr_cpu_ids) {
+ if (!task || cpu != -1)
+ return ERR_PTR(-EINVAL);
+ }
+
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -5573,7 +5611,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!overflow_handler && parent_event)
overflow_handler = parent_event->overflow_handler;
-
+
event->overflow_handler = overflow_handler;
if (attr->disabled)
@@ -6109,7 +6147,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* scheduled, so we are now safe from rescheduling changing
* our context.
*/
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
@@ -6422,11 +6460,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
unsigned long flags;
int ret = 0;
- child->perf_event_ctxp[ctxn] = NULL;
-
- mutex_init(&child->perf_event_mutex);
- INIT_LIST_HEAD(&child->perf_event_list);
-
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
@@ -6478,7 +6511,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
child_ctx = child->perf_event_ctxp[ctxn];
@@ -6486,12 +6518,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.
- * Note that if the parent is a clone, it could get
- * uncloned at any point, but that doesn't matter
- * because the list of events and the generation
- * count can't have changed since we took the mutex.
+ *
+ * Note that if the parent is a clone, holding
+ * parent_ctx->lock prevents it from being uncloned.
*/
- cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+ cloned_ctx = parent_ctx->parent_ctx;
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6502,6 +6533,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
get_ctx(child_ctx->parent_ctx);
}
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
@@ -6516,6 +6548,10 @@ int perf_event_init_task(struct task_struct *child)
{
int ctxn, ret;
+ memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ mutex_init(&child->perf_event_mutex);
+ INIT_LIST_HEAD(&child->perf_event_list);
+
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
depends on PM_ADVANCED_DEBUG
default n
-config SUSPEND_NVS
- bool
-
config SUSPEND
bool "Suspend to RAM and standby"
depends on PM && ARCH_SUSPEND_POSSIBLE
- select SUSPEND_NVS if HAS_IOMEM
default y
---help---
Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
select LZO_COMPRESS
select LZO_DECOMPRESS
- select SUSPEND_NVS if HAS_IOMEM
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..c350e18b53e3 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
-
-ifeq ($(CONFIG_PM_DEBUG),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
@@ -10,6 +7,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
-obj-$(CONFIG_SUSPEND_NVS) += nvs.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..1832bd264219 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,18 +51,18 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
-static struct platform_hibernation_ops *hibernation_ops;
+static const struct platform_hibernation_ops *hibernation_ops;
/**
* hibernation_set_ops - set the global hibernate operations
* @ops: the hibernation operations to use in subsequent hibernation transitions
*/
-void hibernation_set_ops(struct platform_hibernation_ops *ops)
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
&& ops->prepare && ops->finish && ops->enter && ops->pre_restore
- && ops->restore_cleanup)) {
+ && ops->restore_cleanup && ops->leave)) {
WARN_ON(1);
return;
}
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
goto Enable_irqs;
}
- if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
+ if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
goto Power_up;
in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
local_irq_disable();
sysdev_suspend(PMSG_HIBERNATE);
- if (!pm_check_wakeup_events()) {
+ if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
}
@@ -647,6 +647,7 @@ int hibernate(void)
swsusp_free();
if (!error)
power_down();
+ in_suspend = 0;
pm_restore_gfp_mask();
} else {
pr_debug("PM: Image restored successfully.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..701853042c28 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
static int __init pm_start_workqueue(void)
{
- pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
+ pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
return pm_wq ? 0 : -ENOMEM;
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
*/
#define TIMEOUT (20 * HZ)
-static inline int freezeable(struct task_struct * p)
+static inline int freezable(struct task_struct * p)
{
if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- if (frozen(p) || !freezeable(p))
+ if (frozen(p) || !freezable(p))
continue;
if (!freeze_task(p, sig_only))
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
* perturb a task in TASK_STOPPED or TASK_TRACED.
* It is "frozen enough". If the task does wake
* up, it will immediately call try_to_freeze.
+ *
+ * Because freeze_task() goes through p's
+ * scheduler lock after setting TIF_FREEZE, it's
+ * guaranteed that either we see TASK_RUNNING or
+ * try_to_stop() after schedule() in ptrace/signal
+ * stop sees TIF_FREEZE.
*/
if (!task_is_stopped_or_traced(p) &&
!freezer_should_skip(p))
@@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
if (!todo || time_after(jiffies, end_time))
break;
- if (!pm_check_wakeup_events()) {
+ if (pm_wakeup_pending()) {
wakeup = true;
break;
}
@@ -161,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- if (!freezeable(p))
+ if (!freezable(p))
continue;
if (nosig_only && should_send_signal(p))
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..64db648ff911 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1519,11 +1519,8 @@ static int
swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
unsigned int nr_pages, unsigned int nr_highmem)
{
- int error = 0;
-
if (nr_highmem > 0) {
- error = get_highmem_buffer(PG_ANY);
- if (error)
+ if (get_highmem_buffer(PG_ANY))
goto err_out;
if (nr_highmem > alloc_highmem) {
nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
err_out:
swsusp_free();
- return error;
+ return -ENOMEM;
}
asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 031d5e3a6197..de6f86bfa303 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_MEM] = "mem",
};
-static struct platform_suspend_ops *suspend_ops;
+static const struct platform_suspend_ops *suspend_ops;
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Pointer to ops structure.
*/
-void suspend_set_ops(struct platform_suspend_ops *ops)
+void suspend_set_ops(const struct platform_suspend_ops *ops)
{
mutex_lock(&pm_mutex);
suspend_ops = ops;
@@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state)
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
- if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
+ if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
error = suspend_ops->enter(state);
events_check_enabled = false;
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c7e4832b9be..7c97c3a0eee3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -224,7 +224,7 @@ static int swsusp_swap_check(void)
return res;
root_swap = res;
- res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
+ res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
if (res)
return res;
@@ -888,7 +888,7 @@ out_finish:
/**
* swsusp_read - read the hibernation image.
* @flags_p: flags passed by the "frozen" kernel in the image header should
- * be written into this memeory location
+ * be written into this memory location
*/
int swsusp_read(unsigned int *flags_p)
@@ -930,7 +930,8 @@ int swsusp_check(void)
{
int error;
- hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+ hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+ FMODE_READ, NULL);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
diff --git a/kernel/printk.c b/kernel/printk.c
index f64b8997fc76..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,6 +39,7 @@
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/rculist.h>
#include <asm/uaccess.h>
@@ -96,7 +97,7 @@ static int console_locked, console_suspended;
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
* It is also used in interesting ways to provide interlocking in
- * release_console_sem().
+ * console_unlock().
*/
static DEFINE_SPINLOCK(logbuf_lock);
@@ -261,25 +262,47 @@ int dmesg_restrict = 1;
int dmesg_restrict;
#endif
+/*
+ * Return nonzero when syslog action @type requires a capability check:
+ * always when dmesg_restrict is set, otherwise for every action except
+ * SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER.
+ */
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+
+	/* Unless restricted, "read all" and "get buffer size" are open to everybody. */
+	switch (type) {
+	case SYSLOG_ACTION_READ_ALL:
+	case SYSLOG_ACTION_SIZE_BUFFER:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+/*
+ * Decide whether the caller may perform syslog action @type.
+ * @from_file is true when the request comes through /proc/kmsg.
+ * Returns 0 when the action is allowed and -EPERM otherwise.
+ */
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * A /proc/kmsg request other than the open itself was already
+	 * capability-checked at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (!syslog_action_restricted(type))
+		return 0;
+
+	if (capable(CAP_SYSLOG))
+		return 0;
+
+	/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
+		  "but no CAP_SYSLOG (deprecated).\n");
+	return 0;
+}
+
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
unsigned i, j, limit, count;
int do_clear = 0;
char c;
- int error = 0;
+ int error;
- /*
- * If this is from /proc/kmsg we only do the capabilities checks
- * at open time.
- */
- if (type == SYSLOG_ACTION_OPEN || !from_file) {
- if (dmesg_restrict && !capable(CAP_SYSLOG))
- goto warn; /* switch to return -EPERM after 2.6.39 */
- if ((type != SYSLOG_ACTION_READ_ALL &&
- type != SYSLOG_ACTION_SIZE_BUFFER) &&
- !capable(CAP_SYSLOG))
- goto warn; /* switch to return -EPERM after 2.6.39 */
- }
+ error = check_syslog_permissions(type, from_file);
+ if (error)
+ goto out;
error = security_syslog(type);
if (error)
@@ -422,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
}
out:
return error;
-warn:
- /* remove after 2.6.39 */
- if (capable(CAP_SYS_ADMIN))
- WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
- "but no CAP_SYSLOG (deprecated and denied).\n");
- return -EPERM;
}
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
@@ -500,7 +517,7 @@ static void _call_console_drivers(unsigned start,
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
- * The console_sem must be held.
+ * The console_lock must be held.
*/
static void call_console_drivers(unsigned start, unsigned end)
{
@@ -603,11 +620,11 @@ static int have_callable_console(void)
*
* This is printk(). It can be called from any context. We want it to work.
*
- * We try to grab the console_sem. If we succeed, it's easy - we log the output and
+ * We try to grab the console_lock. If we succeed, it's easy - we log the output and
* call the console drivers. If we fail to get the semaphore we place the output
* into the log buffer and return. The current holder of the console_sem will
- * notice the new output in release_console_sem() and will send it to the
- * consoles before releasing the semaphore.
+ * notice the new output in console_unlock() and will send it to the
+ * consoles before releasing the lock.
*
* One effect of this deferred printing is that code which calls printk() and
* then changes console_loglevel may break. This is because console_loglevel
@@ -658,19 +675,19 @@ static inline int can_use_console(unsigned int cpu)
/*
* Try to get console ownership to actually show the kernel
* messages from a 'printk'. Return true (and with the
- * console_semaphore held, and 'console_locked' set) if it
+ * console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*
* This gets called with the 'logbuf_lock' spinlock held and
* interrupts disabled. It should return with 'lockbuf_lock'
* released but interrupts still disabled.
*/
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(unsigned int cpu)
__releases(&logbuf_lock)
{
int retval = 0;
- if (!try_acquire_console_sem()) {
+ if (console_trylock()) {
retval = 1;
/*
@@ -826,12 +843,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* actual magic (print out buffers, wake up klogd,
* etc).
*
- * The acquire_console_semaphore_for_printk() function
+ * The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
* actually gets the semaphore or not.
*/
- if (acquire_console_semaphore_for_printk(this_cpu))
- release_console_sem();
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
lockdep_on();
out_restore_irqs:
@@ -992,7 +1009,7 @@ void suspend_console(void)
if (!console_suspend_enabled)
return;
printk("Suspending console(s) (use no_console_suspend to debug)\n");
- acquire_console_sem();
+ console_lock();
console_suspended = 1;
up(&console_sem);
}
@@ -1003,7 +1020,7 @@ void resume_console(void)
return;
down(&console_sem);
console_suspended = 0;
- release_console_sem();
+ console_unlock();
}
/**
@@ -1026,21 +1043,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
- acquire_console_sem();
- release_console_sem();
+ console_lock();
+ console_unlock();
}
return NOTIFY_OK;
}
/**
- * acquire_console_sem - lock the console system for exclusive use.
+ * console_lock - lock the console system for exclusive use.
*
- * Acquires a semaphore which guarantees that the caller has
+ * Acquires a lock which guarantees that the caller has
* exclusive access to the console system and the console_drivers list.
*
* Can sleep, returns nothing.
*/
-void acquire_console_sem(void)
+void console_lock(void)
{
BUG_ON(in_interrupt());
down(&console_sem);
@@ -1049,21 +1066,29 @@ void acquire_console_sem(void)
console_locked = 1;
console_may_schedule = 1;
}
-EXPORT_SYMBOL(acquire_console_sem);
+EXPORT_SYMBOL(console_lock);
-int try_acquire_console_sem(void)
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Tries to acquire a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
{
if (down_trylock(&console_sem))
- return -1;
+ return 0;
if (console_suspended) {
up(&console_sem);
- return -1;
+ return 0;
}
console_locked = 1;
console_may_schedule = 0;
- return 0;
+ return 1;
}
-EXPORT_SYMBOL(try_acquire_console_sem);
+EXPORT_SYMBOL(console_trylock);
int is_console_locked(void)
{
@@ -1094,20 +1119,20 @@ void wake_up_klogd(void)
}
/**
- * release_console_sem - unlock the console system
+ * console_unlock - unlock the console system
*
- * Releases the semaphore which the caller holds on the console system
+ * Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
- * While the semaphore was held, console output may have been buffered
- * by printk(). If this is the case, release_console_sem() emits
- * the output prior to releasing the semaphore.
+ * While the console_lock was held, console output may have been buffered
+ * by printk(). If this is the case, console_unlock() emits
+ * the output prior to releasing the lock.
*
* If there is output waiting for klogd, we wake it up.
*
- * release_console_sem() may be called from any context.
+ * console_unlock() may be called from any context.
*/
-void release_console_sem(void)
+void console_unlock(void)
{
unsigned long flags;
unsigned _con_start, _log_end;
@@ -1140,7 +1165,7 @@ void release_console_sem(void)
if (wake_klogd)
wake_up_klogd();
}
-EXPORT_SYMBOL(release_console_sem);
+EXPORT_SYMBOL(console_unlock);
/**
* console_conditional_schedule - yield the CPU if required
@@ -1149,7 +1174,7 @@ EXPORT_SYMBOL(release_console_sem);
* if this CPU should yield the CPU to another task, do
* so here.
*
- * Must be called within acquire_console_sem().
+ * Must be called within console_lock().
*/
void __sched console_conditional_schedule(void)
{
@@ -1170,14 +1195,14 @@ void console_unblank(void)
if (down_trylock(&console_sem) != 0)
return;
} else
- acquire_console_sem();
+ console_lock();
console_locked = 1;
console_may_schedule = 0;
for_each_console(c)
if ((c->flags & CON_ENABLED) && c->unblank)
c->unblank();
- release_console_sem();
+ console_unlock();
}
/*
@@ -1188,7 +1213,7 @@ struct tty_driver *console_device(int *index)
struct console *c;
struct tty_driver *driver = NULL;
- acquire_console_sem();
+ console_lock();
for_each_console(c) {
if (!c->device)
continue;
@@ -1196,7 +1221,7 @@ struct tty_driver *console_device(int *index)
if (driver)
break;
}
- release_console_sem();
+ console_unlock();
return driver;
}
@@ -1207,17 +1232,17 @@ struct tty_driver *console_device(int *index)
*/
void console_stop(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags &= ~CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- acquire_console_sem();
+ console_lock();
console->flags |= CON_ENABLED;
- release_console_sem();
+ console_unlock();
}
EXPORT_SYMBOL(console_start);
@@ -1339,7 +1364,7 @@ void register_console(struct console *newcon)
* Put this console in the list - keep the
* preferred driver at the head of the list.
*/
- acquire_console_sem();
+ console_lock();
if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
newcon->next = console_drivers;
console_drivers = newcon;
@@ -1351,14 +1376,14 @@ void register_console(struct console *newcon)
}
if (newcon->flags & CON_PRINTBUFFER) {
/*
- * release_console_sem() will print out the buffered messages
+ * console_unlock() will print out the buffered messages
* for us.
*/
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
- release_console_sem();
+ console_unlock();
console_sysfs_notify();
/*
@@ -1395,7 +1420,7 @@ int unregister_console(struct console *console)
return braille_unregister_console(console);
#endif
- acquire_console_sem();
+ console_lock();
if (console_drivers == console) {
console_drivers=console->next;
res = 0;
@@ -1417,7 +1442,7 @@ int unregister_console(struct console *console)
if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
- release_console_sem();
+ console_unlock();
console_sysfs_notify();
return res;
}
@@ -1502,7 +1527,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
/* Don't allow registering multiple times */
if (!dumper->registered) {
dumper->registered = 1;
- list_add_tail(&dumper->list, &dump_list);
+ list_add_tail_rcu(&dumper->list, &dump_list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1526,29 +1551,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
spin_lock_irqsave(&dump_list_lock, flags);
if (dumper->registered) {
dumper->registered = 0;
- list_del(&dumper->list);
+ list_del_rcu(&dumper->list);
err = 0;
}
spin_unlock_irqrestore(&dump_list_lock, flags);
+ synchronize_rcu();
return err;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
-static const char * const kmsg_reasons[] = {
- [KMSG_DUMP_OOPS] = "oops",
- [KMSG_DUMP_PANIC] = "panic",
- [KMSG_DUMP_KEXEC] = "kexec",
-};
-
-static const char *kmsg_to_str(enum kmsg_dump_reason reason)
-{
- if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
- return "unknown";
-
- return kmsg_reasons[reason];
-}
-
/**
* kmsg_dump - dump kernel log to kernel message dumpers.
* @reason: the reason (oops, panic etc) for dumping
@@ -1587,13 +1599,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
l2 = chars;
}
- if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
- printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
- kmsg_to_str(reason));
- return;
- }
- list_for_each_entry(dumper, &dump_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(dumper, &dump_list, list)
dumper->dump(dumper, reason, s1, l1, s2, l2);
- spin_unlock_irqrestore(&dump_list_lock, flags);
+ rcu_read_unlock();
}
#endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..e2302e40b360 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
-int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task)
{
int retval;
@@ -219,7 +219,7 @@ out:
* Performs checks and sets PT_PTRACED.
* Should be used by all ptrace implementations for PTRACE_TRACEME.
*/
-int ptrace_traceme(void)
+static int ptrace_traceme(void)
{
int ret = -EPERM;
@@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
return false;
}
-int ptrace_detach(struct task_struct *child, unsigned int data)
+static int ptrace_detach(struct task_struct *child, unsigned int data)
{
bool dead = false;
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
child->exit_code = data;
dead = __ptrace_detach(current, child);
if (!child->exit_state)
- wake_up_process(child);
+ wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
unsigned long flags;
for (;;) {
- wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+ wait_event_interruptible(rcu_kthread_wq,
+ have_rcu_kthread_work != 0);
morework = rcu_boost();
local_irq_save(flags);
work = have_rcu_kthread_work;
diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb0941fa84..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
-
- /* BKL stats */
- unsigned int bkl_count;
#endif
};
@@ -609,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
struct task_group *tg;
struct cgroup_subsys_state *css;
+ if (p->flags & PF_EXITING)
+ return &root_task_group;
+
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
tg = container_of(css, struct task_group, css);
@@ -2505,7 +2505,7 @@ out:
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task. this_rq() stays locked over invocation.
*/
@@ -3887,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
#ifdef CONFIG_SCHEDSTATS
if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), bkl_count);
+ schedstat_inc(this_rq(), rq_sched_info.bkl_count);
schedstat_inc(prev, sched_info.bkl_count);
}
#endif
@@ -4871,7 +4871,8 @@ recheck:
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0) {
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return -EPERM;
@@ -8882,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
}
}
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't run yet; this avoids
+ * trying to poke a half-freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ sched_move_task(task);
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
@@ -8954,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.destroy = cpu_cgroup_destroy,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
.early_init = 1,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b8f84c..9fb656283157 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
{
struct autogroup *ag = container_of(kref, struct autogroup, kref);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* We've redirected RT tasks to the root task group... */
+ ag->tg->rt_se = NULL;
+ ag->tg->rt_rq = NULL;
+#endif
sched_destroy_group(ag->tg);
}
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
return ag;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
static inline struct autogroup *autogroup_create(void)
{
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Autogroup RT tasks are redirected to the root task group
+ * so we don't have to move tasks around upon policy change,
+ * or flail around trying to allocate bandwidth on the fly.
+ * A bandwidth exception in __sched_setscheduler() allows
+ * the policy change to proceed. Thereafter, task_group()
+ * returns &root_task_group, so zero bandwidth is required.
+ */
+ free_rt_sched_group(tg);
+ tg->rt_se = root_task_group.rt_se;
+ tg->rt_rq = root_task_group.rt_rq;
+#endif
tg->autogroup = ag;
return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return tg != &root_task_group && tg->autogroup;
+}
+
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
@@ -231,6 +258,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (!enabled || !tg->autogroup)
+ return 0;
+
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..7b859ffe5dad 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return 0;
+}
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+static DEFINE_SPINLOCK(sched_debug_lock);
+
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -86,6 +88,26 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
#endif
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+ if (autogroup_path(tg, group_path, PATH_MAX))
+ return group_path;
+
+ /*
+ * May be NULL if the underlying cgroup isn't fully-created yet
+ */
+ if (!tg->css.cgroup) {
+ group_path[0] = '\0';
+ return group_path;
+ }
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
+}
+#endif
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
@@ -108,6 +130,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
+#ifdef CONFIG_CGROUP_SCHED
+ SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
SEQ_printf(m, "\n");
}
@@ -144,7 +169,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
@@ -191,7 +220,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +245,7 @@ extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
#ifdef CONFIG_X86
{
@@ -262,14 +296,20 @@ static void print_cpu(struct seq_file *m, int cpu)
P(ttwu_count);
P(ttwu_local);
- P(bkl_count);
+ SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
+ rq->rq_sched_info.bkl_count);
#undef P
+#undef P64
#endif
+ spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ rcu_read_lock();
print_rq(m, rq, cpu);
+ rcu_read_unlock();
+ spin_unlock_irqrestore(&sched_debug_lock, flags);
}
static const char *sched_tunable_scaling_names[] = {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
int global_update)
{
@@ -721,10 +722,10 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
- if (!cfs_rq)
+ if (cfs_rq->tg == &root_task_group)
return;
- now = rq_of(cfs_rq)->clock;
+ now = rq_of(cfs_rq)->clock_task;
delta = now - cfs_rq->load_stamp;
/* truncate load history at 4 idle periods */
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ long load_weight, load, shares;
+
+ load = cfs_rq->load.weight + weight_delta;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+ long weight_delta)
+{
+ return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -782,41 +828,20 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
{
struct task_group *tg;
struct sched_entity *se;
- long load_weight, load, shares;
-
- if (!cfs_rq)
- return;
+ long shares;
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
if (!se)
return;
-
- load = cfs_rq->load.weight + weight_delta;
-
- load_weight = atomic_read(&tg->load_weight);
- load_weight -= cfs_rq->load_contribution;
- load_weight += load;
-
- shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- if (shares > tg->shares)
- shares = tg->shares;
+#ifndef CONFIG_SMP
+ if (likely(se->load.weight == tg->shares))
+ return;
+#endif
+ shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
reweight_entity(cfs_rq_of(se), se, shares);
}
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
- update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
- }
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
@@ -1062,6 +1087,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se = __pick_next_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
+ if (delta < 0)
+ return;
+
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
@@ -1362,27 +1390,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long S, rw, s, a, b;
+ long lw, w;
- S = se->my_q->tg->shares;
- s = se->load.weight;
- rw = se->my_q->load.weight;
+ tg = se->my_q->tg;
+ w = se->my_q->load.weight;
- a = S*(rw + wl);
- b = S*rw + s*wg;
+ /* use this cpu's instantaneous contribution */
+ lw = atomic_read(&tg->load_weight);
+ lw -= se->my_q->load_contribution;
+ lw += w + wg;
- wl = s*(a-b);
+ wl += w;
- if (likely(b))
- wl /= b;
+ if (lw > 0 && wl < lw)
+ wl = (wl * tg->shares) / lw;
+ else
+ wl = tg->shares;
- /*
- * Assume the group is already running and will
- * thus already be accounted for in the weight.
- *
- * That is, moving shares between CPUs, does not
- * alter the group weight.
- */
+ /* zero point is MIN_SHARES */
+ if (wl < MIN_SHARES)
+ wl = MIN_SHARES;
+ wl -= se->load.weight;
wg = 0;
}
@@ -1401,7 +1429,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- unsigned long this_load, load;
+ s64 this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
struct task_group *tg;
@@ -1440,8 +1468,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- if (this_load) {
- unsigned long this_eff_load, prev_eff_load;
+ if (this_load > 0) {
+ s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
this_eff_load *= power_of(prev_cpu);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..ad6267714c84 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
- if (!task_has_rt_policy(curr))
+ if (curr->sched_class != &rt_sched_class)
return;
delta_exec = rq->clock_task - curr->se.exec_start;
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..9910744f0856 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
static struct {
struct list_head queue;
raw_spinlock_t lock;
@@ -193,23 +194,52 @@ void generic_smp_call_function_interrupt(void)
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
+ void (*func) (void *info);
- if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
+ /*
+ * Since we walk the list without any locks, we might
+ * see an entry that was completed, removed from the
+ * list and is in the process of being reused.
+ *
+ * We must check that the cpu is in the cpumask before
+ * checking the refs, and both must be set before
+ * executing the callback on this cpu.
+ */
+
+ if (!cpumask_test_cpu(cpu, data->cpumask))
+ continue;
+
+ smp_rmb();
+
+ if (atomic_read(&data->refs) == 0)
continue;
+ func = data->csd.func; /* for later warn */
data->csd.func(data->csd.info);
+ /*
+ * If the cpu mask is not still set then func enabled interrupts,
+ * we took another smp interrupt, and executed the function
+ * twice on this cpu. In theory that copy decremented refs.
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
+ WARN(1, "%pS enabled interrupts and double executed\n",
+ func);
+ continue;
+ }
+
refs = atomic_dec_return(&data->refs);
WARN_ON(refs < 0);
- if (!refs) {
- raw_spin_lock(&call_function.lock);
- list_del_rcu(&data->csd.list);
- raw_spin_unlock(&call_function.lock);
- }
if (refs)
continue;
+ WARN_ON(!cpumask_empty(data->cpumask));
+
+ raw_spin_lock(&call_function.lock);
+ list_del_rcu(&data->csd.list);
+ raw_spin_unlock(&call_function.lock);
+
csd_unlock(&data->csd);
}
@@ -429,7 +459,7 @@ void smp_call_function_many(const struct cpumask *mask,
* can't happen.
*/
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
- && !oops_in_progress);
+ && !oops_in_progress && !early_boot_irqs_disabled);
/* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -453,11 +483,21 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
data->csd.func = func;
data->csd.info = info;
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
+
+ /*
+ * To ensure the interrupt handler gets a complete view
+ * we order the cpumask and refs writes and order the read
+ * of them in the interrupt handler. In addition we may
+ * only clear our own cpu bit from the mask.
+ */
+ smp_wmb();
+
atomic_set(&data->refs, cpumask_weight(data->cpumask));
raw_spin_lock_irqsave(&call_function.lock, flags);
@@ -529,3 +569,24 @@ void ipi_call_unlock_irq(void)
{
raw_spin_unlock_irq(&call_function.lock);
}
+#endif /* USE_GENERIC_SMP_HELPERS */
+
+/*
+ * Call a function on all processors. May be used during early boot while
+ * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ preempt_disable();
+ ret = smp_call_function(func, info, wait);
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0823778f87fc..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
}
early_initcall(spawn_ksoftirqd);
-#ifdef CONFIG_SMP
-/*
- * Call a function on all processors
- */
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
-{
- int ret = 0;
-
- preempt_disable();
- ret = smp_call_function(func, info, wait);
- local_irq_disable();
- func(info);
- local_irq_enable();
- preempt_enable();
- return ret;
-}
-EXPORT_SYMBOL(on_each_cpu);
-#endif
-
/*
* [ These __weak aliases are kept in a separate compilation unit, so that
* GCC does not inline them incorrectly. ]
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -156,6 +156,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited(). We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections. If there are still some readers after 10 microseconds,
+ * we repeatedly block for one-jiffy (1/HZ-second) time periods. This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+
+/*
* Helper function for synchronize_srcu() and synchronize_srcu_expedited().
*/
static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
* will have finished executing. We initially give readers
* an arbitrarily chosen 10 microseconds to get out of their
* SRCU read-side critical sections, then loop waiting 1/HZ
- * seconds per iteration.
+ * seconds per iteration. The 10-microsecond value has done
+ * very well in testing.
*/
if (srcu_readers_active_idx(sp, idx))
- udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
+ udelay(SYNCHRONIZE_SRCU_READER_DELAY);
while (srcu_readers_active_idx(sp, idx))
schedule_timeout_interruptible(1);
diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcdb6c6c..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
#include <linux/kprobes.h>
#include <linux/user_namespace.h>
+#include <linux/kmsg_dump.h>
+
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
*/
void emergency_restart(void)
{
+ kmsg_dump(KMSG_DUMP_EMERG);
machine_emergency_restart();
}
EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
printk(KERN_EMERG "Restarting system.\n");
else
printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+ kmsg_dump(KMSG_DUMP_RESTART);
machine_restart(cmd);
}
EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
kernel_shutdown_prepare(SYSTEM_HALT);
sysdev_shutdown();
printk(KERN_EMERG "System halted.\n");
+ kmsg_dump(KMSG_DUMP_HALT);
machine_halt();
}
@@ -351,6 +356,7 @@ void kernel_power_off(void)
disable_nonboot_cpus();
sysdev_shutdown();
printk(KERN_EMERG "Power down.\n");
+ kmsg_dump(KMSG_DUMP_POWEROFF);
machine_power_off();
}
EXPORT_SYMBOL_GPL(kernel_power_off);
@@ -1379,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
const struct cred *cred = current_cred(), *tcred;
tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
+ if (current != task &&
+ (cred->uid != tcred->euid ||
cred->uid != tcred->suid ||
cred->uid != tcred->uid ||
cred->gid != tcred->egid ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1e3ced..0f1bd83db985 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/signal.h>
+#include <linux/printk.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
@@ -169,7 +170,8 @@ static int proc_taint(struct ctl_table *table, int write,
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */
+/* Note: sysrq code uses its own private copy */
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
static int sysrq_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -245,10 +247,6 @@ static struct ctl_table root_table[] = {
.mode = 0555,
.child = dev_table,
},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -710,6 +708,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "kptr_restrict",
+ .data = &kptr_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
#endif
{
.procname = "ngroups_max",
@@ -962,10 +969,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -1326,11 +1329,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
-
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -1486,10 +1484,6 @@ static struct ctl_table fs_table[] = {
.proc_handler = &pipe_proc_fn,
.extra1 = &pipe_min_size,
},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
{ }
};
@@ -2899,7 +2893,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
}
-#else /* CONFIG_PROC_FS */
+#else /* CONFIG_PROC_SYSCTL */
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2951,7 +2945,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
}
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_PROC_SYSCTL */
/*
* No sense putting this after each symbol definition, twice,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 4b2545a136ff..b875bedf7c9a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1192,7 +1192,7 @@ static ssize_t bin_dn_node_address(struct file *file,
buf[result] = '\0';
- /* Convert the decnet addresss to binary */
+ /* Convert the decnet address to binary */
result = -EIO;
nodep = strchr(buf, '.') + 1;
if (!nodep)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 69691eb4b715..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -348,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
return ret;
}
-#ifdef CONFIG_IA64
+#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
#define TASKSTATS_NEEDS_PADDING 1
#endif
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
* Avoid unnecessary multiplications/divisions in the
* two most common HZ cases:
*/
-unsigned int inline jiffies_to_msecs(const unsigned long j)
+inline unsigned int jiffies_to_msecs(const unsigned long j)
{
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
}
EXPORT_SYMBOL(jiffies_to_msecs);
-unsigned int inline jiffies_to_usecs(const unsigned long j)
+inline unsigned int jiffies_to_usecs(const unsigned long j)
{
#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index df140cd3ea47..6519cf62d9cd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* @shift: pointer to shift variable
* @from: frequency to convert from
* @to: frequency to convert to
- * @minsec: guaranteed runtime conversion range in seconds
+ * @maxsec: guaranteed runtime conversion range in seconds
*
* The function evaluates the shift/mult pair for the scaled math
* operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
* event @to is the counter frequency and @from is NSEC_PER_SEC.
*
- * The @minsec conversion range argument controls the time frame in
+ * The @maxsec conversion range argument controls the time frame in
* seconds which must be covered by the runtime conversion with the
* calculated mult and shift factors. This guarantees that no 64bit
* overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
* factors.
*/
void
-clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
{
u64 tmp;
u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
* Calculate the shift factor which is limiting the conversion
* range:
*/
- tmp = ((u64)minsec * from) >> 32;
+ tmp = ((u64)maxsec * from) >> 32;
while (tmp) {
tmp >>=1;
sftacc--;
@@ -679,7 +679,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{
- /* Intialize mult/shift and max_idle_ns */
+ /* Initialize mult/shift and max_idle_ns */
__clocksource_updatefreq_scale(cs, scale, freq);
/* Add clocksource to the clcoksource list */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
#include <linux/timex.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/module.h>
/*
* NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID 10 /* PPS signal watchdog max (s) */
+#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */
+#define PPS_INTMIN 2 /* min freq interval (s) (shift) */
+#define PPS_INTMAX 8 /* max freq interval (s) (shift) */
+#define PPS_INTCOUNT 4 /* number of consecutive good intervals to
+ increase pps_shift or consecutive bad
+ intervals to decrease it */
+#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */
+
+static int pps_valid; /* signal watchdog counter */
+static long pps_tf[3]; /* phase median filter */
+static long pps_jitter; /* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift; /* current interval duration (s) (shift) */
+static int pps_intcnt; /* interval counter */
+static s64 pps_freq; /* frequency offset (scaled ns/s) */
+static long pps_stabil; /* current stability (scaled ns/s) */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt; /* calibration intervals */
+static long pps_jitcnt; /* jitter limit exceeded */
+static long pps_stbcnt; /* stability limit exceeded */
+static long pps_errcnt; /* calibration errors */
+
+
+/*
+ * Return the part of the phase offset to apply during the next second.
+ * When PPS time synchronization is requested and the PPS signal is
+ * present, the PPS kernel consumer compensates the whole phase error
+ * immediately. Otherwise the offset is reduced by a fixed factor times
+ * the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+	const int pps_active = STA_PPSTIME | STA_PPSSIGNAL;
+
+	if ((time_status & pps_active) != pps_active)
+		return shift_right(offset, SHIFT_PLL + time_constant);
+
+	return offset;
+}
+
+/* Restart frequency calibration from the shortest interval (PPS_INTMIN). */
+static inline void pps_reset_freq_interval(void)
+{
+	/* the PPS calibration interval may end surprisingly early */
+	pps_intcnt = 0;
+	pps_shift = PPS_INTMIN;
+}
+
+/**
+ * pps_clear - Clears the PPS state variables
+ *
+ * Resets the calibration interval, the phase filter samples, the base
+ * of the frequency interval and the PPS frequency offset.
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_clear(void)
+{
+	pps_reset_freq_interval();
+
+	pps_tf[0] = pps_tf[1] = pps_tf[2] = 0;
+
+	pps_fbase.tv_sec = 0;
+	pps_fbase.tv_nsec = 0;
+
+	pps_freq = 0;
+}
+
+/*
+ * Account for one more second elapsed since the last PPS signal by
+ * counting pps_valid down. Once the counter has run out, indicate that
+ * the PPS signal is missing and drop the PPS state.
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_dec_valid(void)
+{
+	if (pps_valid > 0) {
+		pps_valid--;
+		return;
+	}
+
+	/* watchdog expired: no signal, and no stale error flags either */
+	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+			 STA_PPSWANDER | STA_PPSERROR);
+	pps_clear();
+}
+
+/* Set the PPS frequency offset (pps_freq) to @freq. */
+static inline void pps_set_freq(s64 freq)
+{
+ pps_freq = freq;
+}
+
+/*
+ * is_error_status - check whether an NTP status word reports an error
+ * @status: STA_* status bits to examine
+ *
+ * Like the !CONFIG_NTP_PPS variant, only @status is inspected; the
+ * global time_status is not consulted, so the result is correct for
+ * whatever status word the caller passes in.
+ *
+ * Returns non-zero if @status describes an error condition, 0 otherwise.
+ */
+static inline int is_error_status(int status)
+{
+	return (status & (STA_UNSYNC|STA_CLOCKERR))
+		/* PPS signal lost when either PPS time or
+		 * PPS frequency synchronization requested
+		 */
+		|| ((status & (STA_PPSFREQ|STA_PPSTIME))
+			&& !(status & STA_PPSSIGNAL))
+		/* PPS jitter exceeded when
+		 * PPS time synchronization requested */
+		|| ((status & (STA_PPSTIME|STA_PPSJITTER))
+			== (STA_PPSTIME|STA_PPSJITTER))
+		/* PPS wander exceeded or calibration error when
+		 * PPS frequency synchronization requested
+		 */
+		|| ((status & STA_PPSFREQ)
+			&& (status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+
+/* Copy the PPS frequency, jitter, stability and event counters to @txc. */
+static inline void pps_fill_timex(struct timex *txc)
+{
+	/* signal quality counters */
+	txc->calcnt = pps_calcnt;
+	txc->errcnt = pps_errcnt;
+	txc->jitcnt = pps_jitcnt;
+	txc->stbcnt = pps_stbcnt;
+
+	txc->shift = pps_shift;
+	txc->stabil = pps_stabil;
+
+	/* pps_jitter is kept in ns; scale it down unless STA_NANO is set */
+	if (time_status & STA_NANO)
+		txc->jitter = pps_jitter;
+	else
+		txc->jitter = pps_jitter / NSEC_PER_USEC;
+
+	txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+				   PPM_SCALE_INV, NTP_SCALE_SHIFT);
+}
+
+#else /* !CONFIG_NTP_PPS */
+
+/* Without PPS support the offset is always reduced by a fixed factor
+ * times the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+ return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+/* Empty !CONFIG_NTP_PPS versions of the PPS state helpers: they do nothing. */
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+
+/* Without PPS support only STA_UNSYNC and STA_CLOCKERR in @status count
+ * as errors. Returns the matching bits (non-zero on error).
+ */
+static inline int is_error_status(int status)
+{
+ return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+
+/* Report all PPS related fields of @txc as zero. */
+static inline void pps_fill_timex(struct timex *txc)
+{
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq = txc->jitter = txc->stabil = 0;
+	txc->shift = 0;
+	txc->jitcnt = txc->calcnt = txc->errcnt = txc->stbcnt = 0;
+}
+
+#endif /* CONFIG_NTP_PPS */
+
/*
* NTP methods:
*/
@@ -185,6 +342,9 @@ void ntp_clear(void)
tick_length = tick_length_base;
time_offset = 0;
+
+ /* Clear PPS state variables */
+ pps_clear();
}
/*
@@ -250,16 +410,16 @@ void second_overflow(void)
time_status |= STA_UNSYNC;
}
- /*
- * Compute the phase adjustment for the next second. The offset is
- * reduced by a fixed factor times the time constant.
- */
+ /* Compute the phase adjustment for the next second */
tick_length = tick_length_base;
- delta = shift_right(time_offset, SHIFT_PLL + time_constant);
+ delta = ntp_offset_chunk(time_offset);
time_offset -= delta;
tick_length += delta;
+ /* Check PPS signal */
+ pps_dec_valid();
+
if (!time_adjust)
return;
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
+ /* restart PPS frequency calibration */
+ pps_reset_freq_interval();
}
/*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
time_freq = txc->freq * PPM_SCALE;
time_freq = min(time_freq, MAXFREQ_SCALED);
time_freq = max(time_freq, -MAXFREQ_SCALED);
+ /* update pps_freq */
+ pps_set_freq(time_freq);
}
if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
}
result = time_state; /* mostly `TIME_OK' */
- if (time_status & (STA_UNSYNC|STA_CLOCKERR))
+ /* check for errors */
+ if (is_error_status(time_status))
result = TIME_ERROR;
txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
txc->tick = tick_usec;
txc->tai = time_tai;
- /* PPS is not implemented, so these are zero */
- txc->ppsfreq = 0;
- txc->jitter = 0;
- txc->shift = 0;
- txc->stabil = 0;
- txc->jitcnt = 0;
- txc->calcnt = 0;
- txc->errcnt = 0;
- txc->stbcnt = 0;
+ /* fill PPS status fields */
+ pps_fill_timex(txc);
write_sequnlock_irq(&xtime_lock);
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
return result;
}
+#ifdef CONFIG_NTP_PPS
+
+/* struct pps_normtime is essentially good old struct timespec, but it is
+ * semantically different (which is the reason why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+struct pps_normtime {
+ __kernel_time_t sec; /* seconds */
+ long nsec; /* nanoseconds, may be negative (see range above) */
+};
+
+/* Convert a timespec into a pps_normtime whose nsec lies in the
+   ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+	struct pps_normtime result;
+
+	result.sec = ts.tv_sec;
+	result.nsec = ts.tv_nsec;
+
+	/* more than half a second: express it as a negative offset
+	 * from the following second */
+	if (result.nsec > (NSEC_PER_SEC >> 1)) {
+		result.sec++;
+		result.nsec -= NSEC_PER_SEC;
+	}
+
+	return result;
+}
+
+/* Return the current phase correction (the newest filter sample) and
+ * store in @jitter the absolute difference between the two newest
+ * samples.
+ */
+static inline long pps_phase_filter_get(long *jitter)
+{
+	long diff = pps_tf[0] - pps_tf[1];
+
+	*jitter = (diff < 0) ? -diff : diff;
+
+	/* TODO: test various filters */
+	return pps_tf[0];
+}
+
+/* add the sample to the phase filter
+ *
+ * pps_tf[] holds the three most recent samples, newest first: the
+ * existing entries are shifted up by one and @err is stored in slot 0.
+ */
+static inline void pps_phase_filter_add(long err)
+{
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = err;
+}
+
+/* Shorten the frequency calibration interval.
+ * After PPS_INTCOUNT consecutive unstable intervals it is halved, but
+ * never below the PPS_INTMIN shift; at the minimum the counter simply
+ * saturates at -PPS_INTCOUNT.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+	pps_intcnt--;
+	if (pps_intcnt > -PPS_INTCOUNT)
+		return;
+
+	if (pps_shift > PPS_INTMIN) {
+		/* halve the interval and start counting afresh */
+		pps_shift--;
+		pps_intcnt = 0;
+	} else {
+		pps_intcnt = -PPS_INTCOUNT;
+	}
+}
+
+/* Lengthen the frequency calibration interval.
+ * After PPS_INTCOUNT consecutive stable intervals it is doubled, but
+ * never above the PPS_INTMAX shift; at the maximum the counter simply
+ * saturates at PPS_INTCOUNT.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+	pps_intcnt++;
+	if (pps_intcnt < PPS_INTCOUNT)
+		return;
+
+	if (pps_shift < PPS_INTMAX) {
+		/* double the interval and start counting afresh */
+		pps_shift++;
+		pps_intcnt = 0;
+	} else {
+		pps_intcnt = PPS_INTCOUNT;
+	}
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * @freq_norm: normalized length of the calibration interval that just
+ * ended (sec is used as the divisor, nsec as the error)
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values, or 0
+ * (leaving pps_freq untouched) when the interval is discarded.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+ long delta, delta_mod;
+ s64 ftemp;
+
+ /* check if the frequency interval was too long
+ * (more than 2 << pps_shift seconds) */
+ if (freq_norm.sec > (2 << pps_shift)) {
+ time_status |= STA_PPSERROR;
+ pps_errcnt++;
+ pps_dec_freq_interval();
+ pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+ freq_norm.sec);
+ return 0;
+ }
+
+ /* here the raw frequency offset and wander (stability) is
+ * calculated. If the wander is less than the wander threshold
+ * the interval is increased; otherwise it is decreased.
+ */
+ /* the sign is flipped: a positive nsec surplus over the interval
+ * yields a negative frequency offset */
+ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+ freq_norm.sec);
+ delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+ pps_freq = ftemp;
+ if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+ pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+ time_status |= STA_PPSWANDER;
+ pps_stbcnt++;
+ pps_dec_freq_interval();
+ } else { /* good sample */
+ pps_inc_freq_interval();
+ }
+
+ /* the stability metric is calculated as the average of recent
+ * frequency changes, but is used only for performance
+ * monitoring
+ */
+ delta_mod = delta;
+ if (delta_mod < 0)
+ delta_mod = -delta_mod;
+ /* move pps_stabil a fraction (>> PPS_INTMIN) of the way towards
+ * the scaled absolute change */
+ pps_stabil += (div_s64(((s64)delta_mod) <<
+ (NTP_SCALE_SHIFT - SHIFT_USEC),
+ NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+ /* if enabled, the system clock frequency is updated */
+ if ((time_status & STA_PPSFREQ) != 0 &&
+ (time_status & STA_FREQHOLD) == 0) {
+ time_freq = pps_freq;
+ ntp_update_frequency();
+ }
+
+ return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal
+ *
+ * @error: phase error of the current pulse; its negation is the
+ * correction sample fed to the phase filter
+ */
+static void hardpps_update_phase(long error)
+{
+ long correction = -error;
+ long jitter;
+
+ /* add the sample to the median filter */
+ pps_phase_filter_add(correction);
+ /* NOTE: pps_phase_filter_get() currently returns the newest sample
+ * unchanged, so no actual median is taken here */
+ correction = pps_phase_filter_get(&jitter);
+
+ /* Nominal jitter is due to PPS signal noise. If it exceeds the
+ * threshold, the sample is discarded; otherwise, if so enabled,
+ * the time offset is updated.
+ */
+ if (jitter > (pps_jitter << PPS_POPCORN)) {
+ pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+ jitter, (pps_jitter << PPS_POPCORN));
+ time_status |= STA_PPSJITTER;
+ pps_jitcnt++;
+ } else if (time_status & STA_PPSTIME) {
+ /* correct the time using the phase offset */
+ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+ NTP_INTERVAL_FREQ);
+ /* cancel running adjtime() */
+ time_adjust = 0;
+ }
+ /* update jitter: move the running value a fraction (>> PPS_INTMIN)
+ * of the way towards this sample's jitter, even if the sample
+ * itself was discarded above */
+ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ * @phase_ts: REALTIME clock timestamp of the pulse (phase correction)
+ * @raw_ts: MONOTONIC_RAW clock timestamp of the pulse (frequency
+ * correction)
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * Takes xtime_lock for writing (with interrupts saved and disabled)
+ * around all updates of the NTP/PPS state.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ struct pps_normtime pts_norm, freq_norm;
+ unsigned long flags;
+
+ /* works on the caller's timestamp only, so done before locking */
+ pts_norm = pps_normalize_ts(*phase_ts);
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ /* clear the error bits, they will be set again if needed */
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+ /* indicate signal presence and rearm the signal watchdog */
+ time_status |= STA_PPSSIGNAL;
+ pps_valid = PPS_VALID;
+
+ /* when called for the first time,
+ * just start the frequency interval */
+ if (unlikely(pps_fbase.tv_sec == 0)) {
+ pps_fbase = *raw_ts;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ return;
+ }
+
+ /* ok, now we have a base for frequency calculation */
+ freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+ /* check that the signal is in the range
+ * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+ /* NOTE(review): the bound compared against nsec is MAXFREQ times
+ * the elapsed seconds; confirm the unit quoted above */
+ if ((freq_norm.sec == 0) ||
+ (freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+ (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+ time_status |= STA_PPSJITTER;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ pr_err("hardpps: PPSJITTER: bad pulse\n");
+ return;
+ }
+
+ /* signal is ok */
+
+ /* check if the current frequency interval is finished */
+ if (freq_norm.sec >= (1 << pps_shift)) {
+ pps_calcnt++;
+ /* restart the frequency calibration interval */
+ pps_fbase = *raw_ts;
+ /* the returned frequency change is not used here */
+ hardpps_update_freq(freq_norm);
+ }
+
+ /* the phase is corrected on every accepted pulse */
+ hardpps_update_phase(pts_norm.nsec);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+
+#endif /* CONFIG_NTP_PPS */
+
static int __init ntp_tick_adj_setup(char *str)
{
ntp_tick_adj = simple_strtol(str, NULL, 0);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..a3b5aff62606 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -600,4 +600,14 @@ int tick_broadcast_oneshot_active(void)
return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}
+/*
+ * Tell whether a broadcast device is installed and has the
+ * CLOCK_EVT_FEAT_ONESHOT feature.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+	struct clock_event_device *bdev = tick_broadcast_device.evtdev;
+
+	if (!bdev)
+		return false;
+
+	return bdev->features & CLOCK_EVT_FEAT_ONESHOT;
+}
+
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..ed228ef6f6b8 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -51,7 +51,11 @@ int tick_is_oneshot_available(void)
{
struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
- return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return 0;
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 1;
+ return tick_broadcast_oneshot_available();
}
/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..f65d3a723a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast(int cpu);
+bool tick_broadcast_oneshot_available(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline bool tick_broadcast_oneshot_available(void) { return true; }
# endif /* !BROADCAST */
#else /* !ONESHOT */
@@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
return 0;
}
static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
}
local_irq_enable();
- printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
- smp_processor_id());
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
}
/*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
}
#ifdef CONFIG_NO_HZ
- if (tick_nohz_enabled)
+ if (tick_nohz_enabled) {
ts->nohz_mode = NOHZ_MODE_HIGHRES;
+ printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
+ }
#endif
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da82003..d27c7562902c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
u32 mult;
};
-struct timekeeper timekeeper;
+static struct timekeeper timekeeper;
/**
* timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
/*
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/
-struct timespec raw_time;
+static struct timespec raw_time;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw: pointer to the timespec to be set to raw monotonic time
+ * @ts_real: pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ *
+ * Warns once if called while timekeeping is suspended.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+ unsigned long seq;
+ s64 nsecs_raw, nsecs_real;
+
+ WARN_ON_ONCE(timekeeping_suspended);
+
+ /* xtime_lock read section: both base values and both nanosecond
+ * offsets are sampled in the same pass, and the pass is repeated
+ * for as long as read_seqretry() asks for a retry */
+ do {
+ u32 arch_offset;
+
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts_raw = raw_time;
+ *ts_real = xtime;
+
+ nsecs_raw = timekeeping_get_ns_raw();
+ nsecs_real = timekeeping_get_ns();
+
+ /* If arch requires, add in gettimeoffset() */
+ arch_offset = arch_gettimeoffset();
+ nsecs_raw += arch_offset;
+ nsecs_real += arch_offset;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ /* add the sampled offsets to the base timestamps outside the loop */
+ timespec_add_ns(ts_raw, nsecs_raw);
+ timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
/**
* do_gettimeofday - Returns the time of day in a timeval
* @tv: pointer to the timeval to be set
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 32a19f9397fc..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
char symname[KSYM_NAME_LEN];
if (lookup_symbol_name((unsigned long)sym, symname) < 0)
- SEQ_printf(m, "<%p>", sym);
+ SEQ_printf(m, "<%pK>", sym);
else
SEQ_printf(m, "%s", symname);
}
@@ -112,7 +112,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %p\n", base);
+ SEQ_printf(m, " .base: %pK\n", base);
SEQ_printf(m, " .index: %d\n",
base->index);
SEQ_printf(m, " .resolution: %Lu nsecs\n",
diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca9936f2d0..d6459923d245 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*
* Synchronization rules: Callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
- * hardirq contexts. The caller must not hold locks which would prevent
+ * interrupt contexts. The caller must not hold locks which would prevent
* completion of the timer's handler. The timer's handler must not call
* add_timer_on(). Upon exit the timer is not queued and the handler is
* not running on any CPU.
@@ -969,10 +969,12 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
int del_timer_sync(struct timer_list *timer)
{
#ifdef CONFIG_LOCKDEP
- local_bh_disable();
+ unsigned long flags;
+
+ local_irq_save(flags);
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
- local_bh_enable();
+ local_irq_restore(flags);
#endif
/*
* don't use it in hardirq context, because it
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..cbafed7d4f38 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
!blk_tracer_enabled))
return;
+ /*
+ * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
+ * message to the trace.
+ */
+ if (!(bt->act_mask & BLK_TC_NOTIFY))
+ return;
+
local_irq_save(flags);
buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
va_start(args, fmt);
@@ -758,53 +765,58 @@ static void blk_add_trace_rq_complete(void *ignore,
* @q: queue the io is for
* @bio: the source bio
* @what: the action
+ * @error: error, if any
*
* Description:
* Records an action against a bio. Will log the bio offset + size.
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what)
+ u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
if (likely(!bt))
return;
+ if (!error && !bio_flagged(bio, BIO_UPTODATE))
+ error = EIO;
+
__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
- !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+ error, 0, NULL);
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio)
+ struct request_queue *q, struct bio *bio,
+ int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
struct request_queue *q,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore,
struct request_queue *q,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore,
@@ -812,7 +824,7 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -827,7 +839,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -887,7 +899,7 @@ static void blk_add_trace_split(void *ignore,
}
/**
- * blk_add_trace_remap - Add a trace for a remap operation
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
* @ignore: trace callback data parameter (not used)
* @q: queue the io is for
* @bio: the source bio
@@ -899,9 +911,9 @@ static void blk_add_trace_split(void *ignore,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(void *ignore,
- struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -1016,7 +1028,7 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap, NULL);
+ ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
@@ -1025,7 +1037,7 @@ static void blk_register_tracepoints(void)
static void blk_unregister_tracepoints(void)
{
unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
- unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
@@ -1815,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
-void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
-{
- int rw = rq->cmd_flags & 0x03;
- int bytes;
-
- if (rq->cmd_flags & REQ_DISCARD)
- rw |= REQ_DISCARD;
-
- if (rq->cmd_flags & REQ_SECURE)
- rw |= REQ_SECURE;
-
- bytes = blk_rq_bytes(rq);
-
- blk_fill_rwbs(rwbs, rw, bytes);
-}
-
#endif /* CONFIG_EVENT_TRACING */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
__this_cpu_inc(user_stack_count);
-
-
event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
sizeof(*entry), flags, pc);
if (!event)
- return;
+ goto out_drop_count;
entry = ring_buffer_event_data(event);
entry->tgid = current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
if (!filter_check_discard(call, entry, buffer, event))
ring_buffer_unlock_commit(buffer, event);
+ out_drop_count:
__this_cpu_dec(user_stack_count);
-
out:
preempt_enable();
}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
*/
/*
- * Function trace entry - function address and parent function addres:
+ * Function trace entry - function address and parent function address:
*/
FTRACE_ENTRY(function, ftrace_entry,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 35fde09b81de..5f499e0438a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
static void trace_module_add_events(struct module *mod)
{
struct ftrace_module_file_ops *file_ops = NULL;
- struct ftrace_event_call *call, *start, *end;
+ struct ftrace_event_call **call, **start, **end;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
return;
for_each_event(call, start, end) {
- __trace_add_event_call(call, mod,
+ __trace_add_event_call(*call, mod,
&file_ops->id, &file_ops->enable,
&file_ops->filter, &file_ops->format);
}
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
.priority = 0,
};
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
+extern struct ftrace_event_call *__start_ftrace_events[];
+extern struct ftrace_event_call *__stop_ftrace_events[];
static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
static __init int event_trace_init(void)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_call **call;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void)
pr_warning("tracing: Failed to allocate common fields");
for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+ __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
&ftrace_enable_fops,
&ftrace_event_filter_fops,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4b74d71705c0..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
}; \
\
-struct ftrace_event_call __used \
-__attribute__((__aligned__(4))) \
-__attribute__((section("_ftrace_events"))) event_##call = { \
+struct ftrace_event_call __used event_##call = { \
.name = #call, \
.event.type = etype, \
.class = &event_class_ftrace_##call, \
.print_fmt = print, \
}; \
+struct ftrace_event_call __used \
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
#include "trace_entries.h"
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
* Stubs:
*/
-void early_boot_irqs_off(void)
-{
-}
-
-void early_boot_irqs_on(void)
-{
-}
-
void trace_softirqs_on(unsigned long ip)
{
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..5c9fe08d2093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
-
static struct list_head *
syscall_get_enter_fields(struct ftrace_event_call *call)
{
@@ -34,50 +31,45 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
return &entry->enter_fields;
}
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
- return &syscall_exit_fields;
-}
-
struct trace_event_functions enter_syscall_print_funcs = {
- .trace = print_syscall_enter,
+ .trace = print_syscall_enter,
};
struct trace_event_functions exit_syscall_print_funcs = {
- .trace = print_syscall_exit,
+ .trace = print_syscall_exit,
};
struct ftrace_event_class event_class_syscall_enter = {
- .system = "syscalls",
- .reg = syscall_enter_register,
- .define_fields = syscall_enter_define_fields,
- .get_fields = syscall_get_enter_fields,
- .raw_init = init_syscall_trace,
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
};
struct ftrace_event_class event_class_syscall_exit = {
- .system = "syscalls",
- .reg = syscall_exit_register,
- .define_fields = syscall_exit_define_fields,
- .get_fields = syscall_get_exit_fields,
- .raw_init = init_syscall_trace,
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
+ .raw_init = init_syscall_trace,
};
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
+extern struct syscall_metadata *__start_syscalls_metadata[];
+extern struct syscall_metadata *__stop_syscalls_metadata[];
static struct syscall_metadata **syscalls_metadata;
-static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+static __init struct syscall_metadata *
+find_syscall_meta(unsigned long syscall)
{
- struct syscall_metadata *start;
- struct syscall_metadata *stop;
+ struct syscall_metadata **start;
+ struct syscall_metadata **stop;
char str[KSYM_SYMBOL_LEN];
- start = (struct syscall_metadata *)__start_syscalls_metadata;
- stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+ start = __start_syscalls_metadata;
+ stop = __stop_syscalls_metadata;
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
for ( ; start < stop; start++) {
@@ -87,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
* with "SyS" instead of "sys", leading to an unwanted
* mismatch.
*/
- if (start->name && !strcmp(start->name + 3, str + 3))
- return start;
+ if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+ return *start;
}
return NULL;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
#include <linux/sched.h>
#include <linux/jump_label.h>
-extern struct tracepoint __start___tracepoints[];
-extern struct tracepoint __stop___tracepoints[];
+extern struct tracepoint * const __start___tracepoints_ptrs[];
+extern struct tracepoint * const __stop___tracepoints_ptrs[];
/* Set to 1 to enable tracepoint debug output */
static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
*
* Updates the probe callback corresponding to a range of tracepoints.
*/
-void
-tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
{
- struct tracepoint *iter;
+ struct tracepoint * const *iter;
struct tracepoint_entry *mark_entry;
if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
mutex_lock(&tracepoints_mutex);
for (iter = begin; iter < end; iter++) {
- mark_entry = get_tracepoint(iter->name);
+ mark_entry = get_tracepoint((*iter)->name);
if (mark_entry) {
- set_tracepoint(&mark_entry, iter,
+ set_tracepoint(&mark_entry, *iter,
!!mark_entry->refcount);
} else {
- disable_tracepoint(iter);
+ disable_tracepoint(*iter);
}
}
mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
static void tracepoint_update_probes(void)
{
/* Core kernel tracepoints */
- tracepoint_update_probe_range(__start___tracepoints,
- __stop___tracepoints);
+ tracepoint_update_probe_range(__start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs);
/* tracepoints in modules. */
module_update_tracepoints();
}
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
* Will return the first tracepoint in the range if the input tracepoint is
* NULL.
*/
-int tracepoint_get_iter_range(struct tracepoint **tracepoint,
- struct tracepoint *begin, struct tracepoint *end)
+int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+ struct tracepoint * const *begin, struct tracepoint * const *end)
{
if (!*tracepoint && begin != end) {
*tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
/* Core kernel tracepoints */
if (!iter->module) {
found = tracepoint_get_iter_range(&iter->tracepoint,
- __start___tracepoints, __stop___tracepoints);
+ __start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs);
if (found)
goto end;
}
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
case MODULE_STATE_GOING:
- tracepoint_update_probe_range(mod->tracepoints,
- mod->tracepoints + mod->num_tracepoints);
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
break;
}
return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
#include <linux/highuid.h>
#include <linux/cred.h>
+static struct kmem_cache *user_ns_cachep __read_mostly;
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
struct user_struct *root_user;
int n;
- ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+ ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
return -ENOMEM;
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
/* Alloc new root user. */
root_user = alloc_uid(ns, 0);
if (!root_user) {
- kfree(ns);
+ kmem_cache_free(user_ns_cachep, ns);
return -ENOMEM;
}
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
struct user_namespace *ns =
container_of(work, struct user_namespace, destroyer);
free_uid(ns->creator);
- kfree(ns);
+ kmem_cache_free(user_ns_cachep, ns);
}
void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
/* No useful relationship so no mapping */
return overflowgid;
}
+
+static __init int user_namespaces_init(void)
+{
+ user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+ return 0;
+}
+module_init(user_namespaces_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7ebdf4cea98..18bb15776c57 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
#include <asm/irq_regs.h>
#include <linux/perf_event.h>
-int watchdog_enabled;
+int watchdog_enabled = 1;
int __read_mostly softlockup_thresh = 60;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
-static int no_watchdog;
-
-
/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
else if (!strncmp(str, "0", 1))
- no_watchdog = 1;
+ watchdog_enabled = 0;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
static int __init nowatchdog_setup(char *str)
{
- no_watchdog = 1;
+ watchdog_enabled = 0;
return 1;
}
__setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
/* deprecated */
static int __init nosoftlockup_setup(char *str)
{
- no_watchdog = 1;
+ watchdog_enabled = 0;
return 1;
}
__setup("nosoftlockup", nosoftlockup_setup);
@@ -366,8 +363,14 @@ static int watchdog_nmi_enable(int cpu)
goto out_save;
}
- printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+ else
+ printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */
@@ -432,9 +435,6 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}
- /* if any cpu succeeds, watchdog is considered enabled for the system */
- watchdog_enabled = 1;
-
return 0;
}
@@ -462,12 +462,16 @@ static void watchdog_disable(int cpu)
static void watchdog_enable_all_cpus(void)
{
int cpu;
- int result = 0;
+
+ watchdog_enabled = 0;
for_each_online_cpu(cpu)
- result += watchdog_enable(cpu);
+ if (!watchdog_enable(cpu))
+ /* if any cpu succeeds, watchdog is considered
+ enabled for the system */
+ watchdog_enabled = 1;
- if (result)
+ if (!watchdog_enabled)
printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
}
@@ -476,9 +480,6 @@ static void watchdog_disable_all_cpus(void)
{
int cpu;
- if (no_watchdog)
- return;
-
for_each_online_cpu(cpu)
watchdog_disable(cpu);
@@ -498,10 +499,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
{
proc_dointvec(table, write, buffer, length, ppos);
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
+ if (write) {
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+ }
return 0;
}
@@ -530,7 +533,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- err = watchdog_enable(hotcpu);
+ if (watchdog_enabled)
+ err = watchdog_enable(hotcpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
@@ -555,9 +559,6 @@ void __init lockup_detector_init(void)
void *cpu = (void *)(long)smp_processor_id();
int err;
- if (no_watchdog)
- return;
-
err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
WARN_ON(notifier_to_errno(err));
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec82f88a..ee6578b578ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
- MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */
+ MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2,
+ /* call for help after 10ms
+ (min two ticks) */
MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
CREATE_COOLDOWN = HZ, /* time to breath after fail */
TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
@@ -768,7 +770,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
worker->flags &= ~flags;
- /* if transitioning out of NOT_RUNNING, increment nr_running */
+ /*
+ * If transitioning out of NOT_RUNNING, increment nr_running. Note
+ * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is a mask
+ * of multiple flags, not a single flag.
+ */
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(get_gcwq_nr_running(gcwq->cpu));
@@ -1840,7 +1846,7 @@ __acquires(&gcwq->lock)
spin_unlock_irq(&gcwq->lock);
work_clear_pending(work);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
f(work);
@@ -2043,6 +2049,15 @@ repeat:
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
+
+ /*
+ * Leave this gcwq. If keep_working() is %true, notify a
+ * regular worker; otherwise, we end up with 0 concurrency
+ * and stall the execution.
+ */
+ if (keep_working(gcwq))
+ wake_up_worker(gcwq);
+
spin_unlock_irq(&gcwq->lock);
}
@@ -2384,8 +2399,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
insert_wq_barrier(cwq, barr, work, worker);
spin_unlock_irq(&gcwq->lock);
- lock_map_acquire(&cwq->wq->lockdep_map);
+ /*
+ * If @max_active is 1 or rescuer is in use, flushing another work
+ * item on the same workqueue may lead to deadlock. Make sure the
+ * flusher is not running on the same workqueue by verifying write
+ * access.
+ */
+ if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ else
+ lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_release(&cwq->wq->lockdep_map);
+
return true;
already_gone:
spin_unlock_irq(&gcwq->lock);
@@ -2942,7 +2967,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
*/
spin_lock(&workqueue_lock);
- if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+ if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
for_each_cwq_cpu(cpu, wq)
get_cwq(cpu, wq)->max_active = 0;
@@ -3054,7 +3079,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
spin_lock_irq(&gcwq->lock);
- if (!(wq->flags & WQ_FREEZEABLE) ||
+ if (!(wq->flags & WQ_FREEZABLE) ||
!(gcwq->flags & GCWQ_FREEZING))
get_cwq(gcwq->cpu, wq)->max_active = max_active;
@@ -3304,7 +3329,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
* want to get it over with ASAP - spam rescuers, wake up as
* many idlers as necessary and create new ones till the
* worklist is empty. Note that if the gcwq is frozen, there
- * may be frozen works in freezeable cwqs. Don't declare
+ * may be frozen works in freezable cwqs. Don't declare
* completion while frozen.
*/
while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3562,9 +3587,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
/**
* freeze_workqueues_begin - begin freezing workqueues
*
- * Start freezing workqueues. After this function returns, all
- * freezeable workqueues will queue new works to their frozen_works
- * list instead of gcwq->worklist.
+ * Start freezing workqueues. After this function returns, all freezable
+ * workqueues will queue new works to their frozen_works list instead of
+ * gcwq->worklist.
*
* CONTEXT:
* Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3590,7 +3615,7 @@ void freeze_workqueues_begin(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (cwq && wq->flags & WQ_FREEZEABLE)
+ if (cwq && wq->flags & WQ_FREEZABLE)
cwq->max_active = 0;
}
@@ -3601,7 +3626,7 @@ void freeze_workqueues_begin(void)
}
/**
- * freeze_workqueues_busy - are freezeable workqueues still busy?
+ * freeze_workqueues_busy - are freezable workqueues still busy?
*
* Check whether freezing is complete. This function must be called
* between freeze_workqueues_begin() and thaw_workqueues().
@@ -3610,8 +3635,8 @@ void freeze_workqueues_begin(void)
* Grabs and releases workqueue_lock.
*
* RETURNS:
- * %true if some freezeable workqueues are still busy. %false if
- * freezing is complete.
+ * %true if some freezable workqueues are still busy. %false if freezing
+ * is complete.
*/
bool freeze_workqueues_busy(void)
{
@@ -3631,7 +3656,7 @@ bool freeze_workqueues_busy(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ if (!cwq || !(wq->flags & WQ_FREEZABLE))
continue;
BUG_ON(cwq->nr_active < 0);
@@ -3676,7 +3701,7 @@ void thaw_workqueues(void)
list_for_each_entry(wq, &workqueues, list) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
- if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+ if (!cwq || !(wq->flags & WQ_FREEZABLE))
continue;
/* restore max_active and repopulate worklist */