aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit.c11
-rw-r--r--kernel/auditsc.c7
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpuset.c23
-rw-r--r--kernel/events/core.c62
-rw-r--r--kernel/exit.c248
-rw-r--r--kernel/extable.c7
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/irq/Kconfig15
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/chip.c130
-rw-r--r--kernel/irq/generic-chip.c36
-rw-r--r--kernel/irq/irqdomain.c567
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/msi.c330
-rw-r--r--kernel/kmod.c43
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/locking/mutex.c8
-rw-r--r--kernel/module.c30
-rw-r--r--kernel/panic.c13
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/pid_namespace.c28
-rw-r--r--kernel/power/Kconfig7
-rw-r--r--kernel/power/hibernate.c14
-rw-r--r--kernel/power/power.h3
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/swap.c43
-rw-r--r--kernel/printk/printk.c90
-rw-r--r--kernel/ptrace.c23
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/rcutorture.c1
-rw-r--r--kernel/rcu/tiny.c6
-rw-r--r--kernel/rcu/tree.c97
-rw-r--r--kernel/rcu/tree.h22
-rw-r--r--kernel/rcu/tree_plugin.h111
-rw-r--r--kernel/rcu/update.c89
-rw-r--r--kernel/res_counter.c211
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c255
-rw-r--r--kernel/sched/cpudeadline.h3
-rw-r--r--kernel/sched/cpupri.h3
-rw-r--r--kernel/sched/deadline.c99
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c354
-rw-r--r--kernel/sched/rt.c17
-rw-r--r--kernel/sched/sched.h43
-rw-r--r--kernel/sched/wait.c66
-rw-r--r--kernel/signal.c46
-rw-r--r--kernel/smpboot.c15
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sys.c12
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c)0
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c20
-rw-r--r--kernel/time/timekeeping.c127
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/blktrace.c148
-rw-r--r--kernel/trace/ftrace.c330
-rw-r--r--kernel/trace/ring_buffer.c75
-rw-r--r--kernel/trace/trace.c228
-rw-r--r--kernel/trace/trace.h17
-rw-r--r--kernel/trace/trace_branch.c47
-rw-r--r--kernel/trace/trace_events.c15
-rw-r--r--kernel/trace/trace_events_filter.c29
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_functions.c119
-rw-r--r--kernel/trace/trace_functions_graph.c423
-rw-r--r--kernel/trace/trace_kdb.c21
-rw-r--r--kernel/trace/trace_kprobe.c46
-rw-r--r--kernel/trace/trace_mmiotrace.c52
-rw-r--r--kernel/trace/trace_output.c446
-rw-r--r--kernel/trace/trace_output.h16
-rw-r--r--kernel/trace/trace_printk.c2
-rw-r--r--kernel/trace/trace_probe.c10
-rw-r--r--kernel/trace/trace_sched_switch.c144
-rw-r--r--kernel/trace/trace_sched_wakeup.c56
-rw-r--r--kernel/trace/trace_seq.c253
-rw-r--r--kernel/trace/trace_syscalls.c47
-rw-r--r--kernel/trace/trace_uprobe.c28
85 files changed, 3765 insertions, 2210 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 17ea6d4a9a24..a59481a3fa6c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/audit.c b/kernel/audit.c
index cebb11db4d34..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
set_freezable();
while (!kthread_should_stop()) {
struct sk_buff *skb;
- DECLARE_WAITQUEUE(wait, current);
flush_hold_queue();
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
audit_printk_skb(skb);
continue;
}
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
}
return 0;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e420a0c41b5f..c75522a83678 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1897,6 +1897,11 @@ out:
audit_copy_inode(n, dentry, inode);
}
+void __audit_file(const struct file *file)
+{
+ __audit_inode(NULL, file->f_path.dentry, 0);
+}
+
/**
* __audit_inode_child - collect inode info for created/removed objects
* @parent: inode of dentry parent
@@ -2373,7 +2378,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- dentry = dget(bprm->file->f_dentry);
+ dentry = dget(bprm->file->f_path.dentry);
get_vfs_caps_from_disk(dentry, &vcaps);
dput(dentry);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 90a3d017b90c..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -86,6 +86,16 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+static void apply_puts_pending(int max)
+{
+ int delta;
+
+ if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
+ delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
+ cpu_hotplug.refcount -= delta;
+ }
+}
+
void get_online_cpus(void)
{
might_sleep();
@@ -93,6 +103,7 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
+ apply_puts_pending(65536);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
}
@@ -105,6 +116,7 @@ bool try_get_online_cpus(void)
if (!mutex_trylock(&cpu_hotplug.lock))
return false;
cpuhp_lock_acquire_tryread();
+ apply_puts_pending(65536);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
return true;
@@ -161,12 +173,7 @@ void cpu_hotplug_begin(void)
cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
- if (atomic_read(&cpu_hotplug.puts_pending)) {
- int delta;
-
- delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
- cpu_hotplug.refcount -= delta;
- }
+ apply_puts_pending(1);
if (likely(!cpu_hotplug.refcount))
break;
__set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
goto out;
}
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) &&
+ !cpuset_cpumask_can_shrink(cur->cpus_allowed,
+ trial->cpus_allowed))
+ goto out;
+
ret = 0;
out:
rcu_read_unlock();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
goto out_unlock;
cgroup_taskset_for_each(task, tset) {
- /*
- * Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
- * affinity and isolating such threads by their set of
- * allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for
- * success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
- */
- ret = -EINVAL;
- if (task->flags & PF_NO_SETAFFINITY)
+ ret = task_can_attach(task, cs->cpus_allowed);
+ if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cd5eef1fcdd..113b837470cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
if (!f.file)
return -EBADF;
- css = css_tryget_online_from_dir(f.file->f_dentry,
+ css = css_tryget_online_from_dir(f.file->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
@@ -4460,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
}
-static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+static void perf_sample_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs)
{
if (!user_mode(regs)) {
@@ -4471,11 +4471,22 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user,
}
if (regs) {
- regs_user->regs = regs;
regs_user->abi = perf_reg_abi(current);
+ regs_user->regs = regs;
+ } else {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
}
}
+static void perf_sample_regs_intr(struct perf_regs *regs_intr,
+ struct pt_regs *regs)
+{
+ regs_intr->regs = regs;
+ regs_intr->abi = perf_reg_abi(current);
+}
+
+
/*
* Get remaining task size from user stack pointer.
*
@@ -4857,6 +4868,23 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ u64 abi = data->regs_intr.abi;
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ perf_output_sample_regs(handle,
+ data->regs_intr.regs,
+ mask);
+ }
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -4922,12 +4950,13 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ perf_sample_regs_user(&data->regs_user, regs);
+
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_user(&data->regs_user, regs);
-
if (data->regs_user.regs) {
u64 mask = event->attr.sample_regs_user;
size += hweight64(mask) * sizeof(u64);
@@ -4943,15 +4972,11 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- struct perf_regs_user *uregs = &data->regs_user;
u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
- if (!uregs->abi)
- perf_sample_regs_user(uregs, regs);
-
stack_size = perf_sample_ustack_size(stack_size, header->size,
- uregs->regs);
+ data->regs_user.regs);
/*
* If there is something to dump, add space for the dump
@@ -4964,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header,
data->stack_user_size = stack_size;
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_intr(&data->regs_intr, regs);
+
+ if (data->regs_intr.regs) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -7151,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = -EINVAL;
}
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ ret = perf_reg_validate(attr->sample_regs_intr);
out:
return ret;
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..8714e5ded8b4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk)
}
/*
- * Accumulate here the counters for all threads but the group leader
- * as they die, so they can be added into the process-wide totals
- * when those are taken. The group leader stays around as a zombie as
- * long as there are other threads. When it gets reaped, the exit.c
- * code will add its counts into these totals. We won't ever get here
- * for the group leader, since it will have been the last reference on
- * the signal_struct.
+ * Accumulate here the counters for all threads as they die. We could
+ * skip the group leader because it is the last user of signal_struct,
+ * but we want to avoid the race with thread_group_cputime() which can
+ * see the empty ->thread_head list.
*/
task_cputime(tsk, &utime, &stime);
write_seqlock(&sig->stats_lock);
@@ -462,6 +459,44 @@ static void exit_mm(struct task_struct *tsk)
clear_thread_flag(TIF_MEMDIE);
}
+static struct task_struct *find_alive_thread(struct task_struct *p)
+{
+ struct task_struct *t;
+
+ for_each_thread(p, t) {
+ if (!(t->flags & PF_EXITING))
+ return t;
+ }
+ return NULL;
+}
+
+static struct task_struct *find_child_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *reaper = pid_ns->child_reaper;
+
+ if (likely(reaper != father))
+ return reaper;
+
+ reaper = find_alive_thread(father);
+ if (reaper) {
+ pid_ns->child_reaper = reaper;
+ return reaper;
+ }
+
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?: father->exit_code);
+ }
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+
+ return father;
+}
+
/*
* When we die, we re-parent all our children, and try to:
* 1. give them to another thread in our thread group, if such a member exists
@@ -469,58 +504,36 @@ static void exit_mm(struct task_struct *tsk)
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
-static struct task_struct *find_new_reaper(struct task_struct *father)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+static struct task_struct *find_new_reaper(struct task_struct *father,
+ struct task_struct *child_reaper)
{
- struct pid_namespace *pid_ns = task_active_pid_ns(father);
- struct task_struct *thread;
+ struct task_struct *thread, *reaper;
- thread = father;
- while_each_thread(father, thread) {
- if (thread->flags & PF_EXITING)
- continue;
- if (unlikely(pid_ns->child_reaper == father))
- pid_ns->child_reaper = thread;
+ thread = find_alive_thread(father);
+ if (thread)
return thread;
- }
-
- if (unlikely(pid_ns->child_reaper == father)) {
- write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns)) {
- panic("Attempted to kill init! exitcode=0x%08x\n",
- father->signal->group_exit_code ?:
- father->exit_code);
- }
-
- zap_pid_ns_processes(pid_ns);
- write_lock_irq(&tasklist_lock);
- } else if (father->signal->has_child_subreaper) {
- struct task_struct *reaper;
+ if (father->signal->has_child_subreaper) {
/*
- * Find the first ancestor marked as child_subreaper.
- * Note that the code below checks same_thread_group(reaper,
- * pid_ns->child_reaper). This is what we need to DTRT in a
- * PID namespace. However we still need the check above, see
- * http://marc.info/?l=linux-kernel&m=131385460420380
+ * Find the first ->is_child_subreaper ancestor in our pid_ns.
+ * We start from father to ensure we can not look into another
+ * namespace, this is safe because all its threads are dead.
*/
- for (reaper = father->real_parent;
- reaper != &init_task;
+ for (reaper = father;
+ !same_thread_group(reaper, child_reaper);
reaper = reaper->real_parent) {
- if (same_thread_group(reaper, pid_ns->child_reaper))
+ /* call_usermodehelper() descendants need this check */
+ if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
- thread = reaper;
- do {
- if (!(thread->flags & PF_EXITING))
- return reaper;
- } while_each_thread(reaper, thread);
+ thread = find_alive_thread(reaper);
+ if (thread)
+ return thread;
}
}
- return pid_ns->child_reaper;
+ return child_reaper;
}
/*
@@ -529,15 +542,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
- list_move_tail(&p->sibling, &p->real_parent->children);
-
- if (p->exit_state == EXIT_DEAD)
- return;
- /*
- * If this is a threaded reparent there is no need to
- * notify anyone anything has happened.
- */
- if (same_thread_group(p->real_parent, father))
+ if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
@@ -548,49 +553,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p,
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
- list_move_tail(&p->sibling, dead);
+ list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
-static void forget_original_parent(struct task_struct *father)
+/*
+ * This does two things:
+ *
+ * A. Make init inherit all the child processes
+ * B. Check to see if any process groups have become orphaned
+ * as a result of our exiting, and if they have any stopped
+ * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void forget_original_parent(struct task_struct *father,
+ struct list_head *dead)
{
- struct task_struct *p, *n, *reaper;
- LIST_HEAD(dead_children);
+ struct task_struct *p, *t, *reaper;
- write_lock_irq(&tasklist_lock);
- /*
- * Note that exit_ptrace() and find_new_reaper() might
- * drop tasklist_lock and reacquire it.
- */
- exit_ptrace(father);
- reaper = find_new_reaper(father);
+ if (unlikely(!list_empty(&father->ptraced)))
+ exit_ptrace(father, dead);
- list_for_each_entry_safe(p, n, &father->children, sibling) {
- struct task_struct *t = p;
+ /* Can drop and reacquire tasklist_lock */
+ reaper = find_child_reaper(father);
+ if (list_empty(&father->children))
+ return;
- do {
+ reaper = find_new_reaper(father, reaper);
+ list_for_each_entry(p, &father->children, sibling) {
+ for_each_thread(p, t) {
t->real_parent = reaper;
- if (t->parent == father) {
- BUG_ON(t->ptrace);
+ BUG_ON((!t->ptrace) != (t->parent == father));
+ if (likely(!t->ptrace))
t->parent = t->real_parent;
- }
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t);
- } while_each_thread(p, t);
- reparent_leader(father, p, &dead_children);
- }
- write_unlock_irq(&tasklist_lock);
-
- BUG_ON(!list_empty(&father->children));
-
- list_for_each_entry_safe(p, n, &dead_children, sibling) {
- list_del_init(&p->sibling);
- release_task(p);
+ }
+ /*
+ * If this is a threaded reparent there is no need to
+ * notify anyone anything has happened.
+ */
+ if (!same_thread_group(reaper, father))
+ reparent_leader(father, p, dead);
}
+ list_splice_tail_init(&father->children, &reaper->children);
}
/*
@@ -600,18 +609,12 @@ static void forget_original_parent(struct task_struct *father)
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
-
- /*
- * This does two things:
- *
- * A. Make init inherit all the child processes
- * B. Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- */
- forget_original_parent(tsk);
+ struct task_struct *p, *n;
+ LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
+ forget_original_parent(tsk, &dead);
+
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -629,15 +632,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
}
tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+ if (tsk->exit_state == EXIT_DEAD)
+ list_add(&tsk->ptrace_entry, &dead);
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
- /* If the process is dead, release it - nobody will wait for it */
- if (autoreap)
- release_task(tsk);
+ list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
}
#ifdef CONFIG_DEBUG_STACK_USAGE
@@ -982,8 +988,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- unsigned long state;
- int retval, status, traced;
+ int state, retval, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct siginfo __user *infop;
@@ -997,6 +1002,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
if ((exit_code & 0x7f) == 0) {
why = CLD_EXITED;
status = exit_code >> 8;
@@ -1006,21 +1013,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
}
return wait_noreap_copyout(wo, p, pid, uid, why, status);
}
-
- traced = ptrace_reparented(p);
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
- state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
+ state = (ptrace_reparented(p) && thread_group_leader(p)) ?
+ EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
- * It can be ptraced but not reparented, check
- * thread_group_leader() to filter out sub-threads.
+ * We own this thread, nobody else can reap it.
*/
- if (likely(!traced) && thread_group_leader(p)) {
- struct signal_struct *psig;
- struct signal_struct *sig;
+ read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
+ /*
+ * Check thread_group_leader() to exclude the traced sub-threads.
+ */
+ if (state == EXIT_DEAD && thread_group_leader(p)) {
+ struct signal_struct *sig = p->signal;
+ struct signal_struct *psig = current->signal;
unsigned long maxrss;
cputime_t tgutime, tgstime;
@@ -1032,21 +1043,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
- * p->signal fields, because they are only touched by
- * __exit_signal, which runs with tasklist_lock
- * write-locked anyway, and so is excluded here. We do
- * need to protect the access to parent->signal fields,
- * as other threads in the parent group can be right
- * here reaping other children at the same time.
+ * p->signal fields because the whole thread group is dead
+ * and nobody can change them.
+ *
+ * psig->stats_lock also protects us from our sub-theads
+ * which can reap other children at the same time. Until
+ * we change k_getrusage()-like users to rely on this lock
+ * we have to take ->siglock as well.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- spin_lock_irq(&p->real_parent->sighand->siglock);
- psig = p->real_parent->signal;
- sig = p->signal;
+ spin_lock_irq(&current->sighand->siglock);
write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
@@ -1071,15 +1081,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
write_sequnlock(&psig->stats_lock);
- spin_unlock_irq(&p->real_parent->sighand->siglock);
+ spin_unlock_irq(&current->sighand->siglock);
}
- /*
- * Now we are sure this task is interesting, and no other
- * thread can reap it because we its state == DEAD/TRACE.
- */
- read_unlock(&tasklist_lock);
-
retval = wo->wo_rusage
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
@@ -1210,6 +1214,7 @@ unlock_sig:
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1277,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (!wo->wo_info) {
retval = wo->wo_rusage
diff --git a/kernel/extable.c b/kernel/extable.c
index d8a6446adbcb..c98f926277a8 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,6 +18,7 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
+#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
@@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr)
return 1;
if (is_module_text_address(addr))
return 1;
+ if (is_ftrace_trampoline(addr))
+ return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
@@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return is_module_text_address(addr);
+ if (is_module_text_address(addr))
+ return 1;
+ return is_ftrace_trampoline(addr);
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..9ca84189cfc2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
{
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
+ /*
+ * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * without an RCU grace period, see __lock_task_sighand().
+ */
kmem_cache_free(sighand_cachep, sighand);
}
}
-
/*
* Initialize POSIX timer handling for a thread group.
*/
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 225086b2652e..9a76e3beda54 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+# Support for hierarchical irq domains
+config IRQ_DOMAIN_HIERARCHY
+ bool
+ select IRQ_DOMAIN
+
+# Generic MSI interrupt support
+config GENERIC_MSI_IRQ
+ bool
+
+# Generic MSI hierarchical interrupt domain support
+config GENERIC_MSI_IRQ_DOMAIN
+ bool
+ select IRQ_DOMAIN_HIERARCHY
+ select GENERIC_MSI_IRQ
+
config HANDLE_DOMAIN_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index fff17381f0af..d12123526e2b 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_PM_SLEEP) += pm.o
+obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e5202f00cabc..6f1c7a566b95 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/irqdomain.h>
#include <trace/events/irq.h>
@@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_state_clr_disabled(desc);
desc->depth = 0;
+ irq_domain_activate_irq(&desc->irq_data);
if (desc->irq_data.chip->irq_startup) {
ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
@@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc)
desc->irq_data.chip->irq_disable(&desc->irq_data);
else
desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_domain_deactivate_irq(&desc->irq_data);
irq_state_set_masked(desc);
}
@@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
if (!handle) {
handle = handle_bad_irq;
} else {
- if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+ struct irq_data *irq_data = &desc->irq_data;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ /*
+ * With hierarchical domains we might run into a
+ * situation where the outermost chip is not yet set
+ * up, but the inner chips are there. Instead of
+ * bailing we install the handler, but obviously we
+ * cannot enable/startup the interrupt at this point.
+ */
+ while (irq_data) {
+ if (irq_data->chip != &no_irq_chip)
+ break;
+ /*
+ * Bail out if the outer chip is not set up
+ * and the interrrupt supposed to be started
+ * right away.
+ */
+ if (WARN_ON(is_chained))
+ goto out;
+ /* Try the parent */
+ irq_data = irq_data->parent_data;
+ }
+#endif
+ if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip))
goto out;
}
@@ -847,3 +873,105 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_chip_ack_parent - Acknowledge the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_ack_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_ack(data);
+}
+
+/**
+ * irq_chip_mask_parent - Mask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_mask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_mask(data);
+}
+
+/**
+ * irq_chip_unmask_parent - Unmask the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_unmask_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_unmask(data);
+}
+
+/**
+ * irq_chip_eoi_parent - Invoke EOI on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_eoi_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ data->chip->irq_eoi(data);
+}
+
+/**
+ * irq_chip_set_affinity_parent - Set affinity on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @dest: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Conditinal, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_affinity_parent(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ data = data->parent_data;
+ if (data->chip->irq_set_affinity)
+ return data->chip->irq_set_affinity(data, dest, force);
+
+ return -ENOSYS;
+}
+
+/**
+ * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
+ * @data: Pointer to interrupt specific data
+ *
+ * Iterate through the domain hierarchy of the interrupt and check
+ * whether a hw retrigger function exists. If yes, invoke it.
+ */
+int irq_chip_retrigger_hierarchy(struct irq_data *data)
+{
+ for (data = data->parent_data; data; data = data->parent_data)
+ if (data->chip && data->chip->irq_retrigger)
+ return data->chip->irq_retrigger(data);
+
+ return -ENOSYS;
+}
+#endif
+
+/**
+ * irq_chip_compose_msi_msg - Componse msi message for a irq chip
+ * @data: Pointer to interrupt specific data
+ * @msg: Pointer to the MSI message
+ *
+ * For hierarchical domains we find the first chip in the hierarchy
+ * which implements the irq_compose_msi_msg callback. For non
+ * hierarchical we use the top level chip.
+ */
+int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct irq_data *pos = NULL;
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ for (; data; data = data->parent_data)
+#endif
+ if (data->chip && data->chip->irq_compose_msi_msg)
+ pos = data;
+ if (!pos)
+ return -ENOSYS;
+
+ pos->chip->irq_compose_msi_msg(pos, msg);
+
+ return 0;
+}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
+ irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache |= mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache &= ~mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
+ irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
u32 mask = ~d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
+ irq_reg_writel(gc, mask, ct->regs.eoi);
irq_gc_unlock(gc);
}
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
return 0;
}
+static u32 irq_readl_be(void __iomem *addr)
+{
+ return ioread32be(addr);
+}
+
+static void irq_writel_be(u32 val, void __iomem *addr)
+{
+ iowrite32be(val, addr);
+}
+
static void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
ct[i].mask_cache = mskptr;
if (flags & IRQ_GC_INIT_MASK_CACHE)
- *mskptr = irq_reg_readl(gc->reg_base + mskreg);
+ *mskptr = irq_reg_readl(gc, mskreg);
}
}
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
dgc->gc[i] = gc = tmp;
irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
NULL, handler);
+
gc->domain = d;
+ if (gcflags & IRQ_GC_BE_IO) {
+ gc->reg_readl = &irq_readl_be;
+ gc->reg_writel = &irq_writel_be;
+ }
+
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6534ff6ce02e..7fac311057b8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
+static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
+ irq_hw_number_t hwirq, int node);
+static void irq_domain_check_hierarchy(struct irq_domain *domain);
+
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
@@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain;
* @hwirq_max: Maximum number of interrupts supported by controller
* @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
* direct mapping
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
* Allocates and initialize and irq_domain structure.
@@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
domain->hwirq_max = hwirq_max;
domain->revmap_size = size;
domain->revmap_direct_max_irq = direct_max;
+ irq_domain_check_hierarchy(domain);
mutex_lock(&irq_domain_mutex);
list_add(&domain->link, &irq_domain_list);
@@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove);
* @first_irq: first number of irq block assigned to the domain,
* pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
* pre-map all of the irqs in the domain to virqs starting at first_irq.
- * @ops: map/unmap domain callbacks
+ * @ops: domain callbacks
* @host_data: Controller private data pointer
*
* Allocates an irq_domain, and optionally if first_irq is positive then also
@@ -174,10 +179,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
domain = __irq_domain_add(of_node, first_hwirq + size,
first_hwirq + size, 0, ops, host_data);
- if (!domain)
- return NULL;
-
- irq_domain_associate_many(domain, first_irq, first_hwirq, size);
+ if (domain)
+ irq_domain_associate_many(domain, first_irq, first_hwirq, size);
return domain;
}
@@ -388,7 +391,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
- unsigned int hint;
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -410,12 +412,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- hint = hwirq % nr_irqs;
- if (hint == 0)
- hint++;
- virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
- if (virq <= 0)
- virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq,
+ of_node_to_nid(domain->of_node));
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -471,7 +469,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
- unsigned int virq;
+ int virq;
domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
if (!domain) {
@@ -489,10 +487,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
return 0;
}
- /* Create mapping */
- virq = irq_create_mapping(domain, hwirq);
- if (!virq)
- return virq;
+ if (irq_domain_is_hierarchy(domain)) {
+ /*
+ * If we've already configured this interrupt,
+ * don't do it again, or hell will break loose.
+ */
+ virq = irq_find_mapping(domain, hwirq);
+ if (virq)
+ return virq;
+
+ virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
+ if (virq <= 0)
+ return 0;
+ } else {
+ /* Create mapping */
+ virq = irq_create_mapping(domain, hwirq);
+ if (!virq)
+ return virq;
+ }
/* Set type if specified and different than the current one */
if (type != IRQ_TYPE_NONE &&
@@ -540,8 +552,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
return 0;
if (hwirq < domain->revmap_direct_max_irq) {
- data = irq_get_irq_data(hwirq);
- if (data && (data->domain == domain) && (data->hwirq == hwirq))
+ data = irq_domain_get_irq_data(domain, hwirq);
+ if (data && data->hwirq == hwirq)
return hwirq;
}
@@ -709,3 +721,518 @@ const struct irq_domain_ops irq_domain_simple_ops = {
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
+
+static int irq_domain_alloc_descs(int virq, unsigned int cnt,
+ irq_hw_number_t hwirq, int node)
+{
+ unsigned int hint;
+
+ if (virq >= 0) {
+ virq = irq_alloc_descs(virq, virq, cnt, node);
+ } else {
+ hint = hwirq % nr_irqs;
+ if (hint == 0)
+ hint++;
+ virq = irq_alloc_descs_from(hint, cnt, node);
+ if (virq <= 0 && hint > 1)
+ virq = irq_alloc_descs_from(1, cnt, node);
+ }
+
+ return virq;
+}
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+/**
+ * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy
+ * @parent: Parent irq domain to associate with the new domain
+ * @flags: Irq domain flags associated to the domain
+ * @size: Size of the domain. See below
+ * @node: Optional device-tree node of the interrupt controller
+ * @ops: Pointer to the interrupt domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * If @size is 0 a tree domain is created, otherwise a linear domain.
+ *
+ * If successful the parent is associated to the new domain and the
+ * domain flags are set.
+ * Returns pointer to IRQ domain, or NULL on failure.
+ */
+struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
+ unsigned int flags,
+ unsigned int size,
+ struct device_node *node,
+ const struct irq_domain_ops *ops,
+ void *host_data)
+{
+ struct irq_domain *domain;
+
+ if (size)
+ domain = irq_domain_add_linear(node, size, ops, host_data);
+ else
+ domain = irq_domain_add_tree(node, ops, host_data);
+ if (domain) {
+ domain->parent = parent;
+ domain->flags |= flags;
+ }
+
+ return domain;
+}
+
+static void irq_domain_insert_irq(int virq)
+{
+ struct irq_data *data;
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = virq;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_insert(&domain->revmap_tree, hwirq, data);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+
+ /* If not already assigned, give the domain the chip's name */
+ if (!domain->name && data->chip)
+ domain->name = data->chip->name;
+ }
+
+ irq_clear_status_flags(virq, IRQ_NOREQUEST);
+}
+
+static void irq_domain_remove_irq(int virq)
+{
+ struct irq_data *data;
+
+ irq_set_status_flags(virq, IRQ_NOREQUEST);
+ irq_set_chip_and_handler(virq, NULL, NULL);
+ synchronize_irq(virq);
+ smp_mb();
+
+ for (data = irq_get_irq_data(virq); data; data = data->parent_data) {
+ struct irq_domain *domain = data->domain;
+ irq_hw_number_t hwirq = data->hwirq;
+
+ if (hwirq < domain->revmap_size) {
+ domain->linear_revmap[hwirq] = 0;
+ } else {
+ mutex_lock(&revmap_trees_mutex);
+ radix_tree_delete(&domain->revmap_tree, hwirq);
+ mutex_unlock(&revmap_trees_mutex);
+ }
+ }
+}
+
+static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
+ struct irq_data *child)
+{
+ struct irq_data *irq_data;
+
+ irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node);
+ if (irq_data) {
+ child->parent_data = irq_data;
+ irq_data->irq = child->irq;
+ irq_data->node = child->node;
+ irq_data->domain = domain;
+ }
+
+ return irq_data;
+}
+
+static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data, *tmp;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ tmp = irq_data->parent_data;
+ irq_data->parent_data = NULL;
+ irq_data->domain = NULL;
+
+ while (tmp) {
+ irq_data = tmp;
+ tmp = tmp->parent_data;
+ kfree(irq_data);
+ }
+ }
+}
+
+static int irq_domain_alloc_irq_data(struct irq_domain *domain,
+ unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ struct irq_domain *parent;
+ int i;
+
+ /* The outermost irq_data is embedded in struct irq_desc */
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_get_irq_data(virq + i);
+ irq_data->domain = domain;
+
+ for (parent = domain->parent; parent; parent = parent->parent) {
+ irq_data = irq_domain_insert_irq_data(parent, irq_data);
+ if (!irq_data) {
+ irq_domain_free_irq_data(virq, i + 1);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data;
+
+ for (irq_data = irq_get_irq_data(virq); irq_data;
+ irq_data = irq_data->parent_data)
+ if (irq_data->domain == domain)
+ return irq_data;
+
+ return NULL;
+}
+
+/**
+ * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hwirq number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated chip data
+ */
+int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data)
+{
+ struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+ if (!irq_data)
+ return -ENOENT;
+
+ irq_data->hwirq = hwirq;
+ irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip_data = chip_data;
+
+ return 0;
+}
+
+/**
+ * irq_domain_set_info - Set the complete data for a @virq in @domain
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number
+ * @hwirq: The hardware interrupt number
+ * @chip: The associated interrupt chip
+ * @chip_data: The associated interrupt chip data
+ * @handler: The interrupt flow handler
+ * @handler_data: The interrupt flow handler data
+ * @handler_name: The interrupt handler name
+ */
+void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq, struct irq_chip *chip,
+ void *chip_data, irq_flow_handler_t handler,
+ void *handler_data, const char *handler_name)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data);
+ __irq_set_handler(virq, handler, 0, handler_name);
+ irq_set_handler_data(virq, handler_data);
+}
+
+/**
+ * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data
+ * @irq_data: The pointer to irq_data
+ */
+void irq_domain_reset_irq_data(struct irq_data *irq_data)
+{
+ irq_data->hwirq = 0;
+ irq_data->chip = &no_irq_chip;
+ irq_data->chip_data = NULL;
+}
+
+/**
+ * irq_domain_free_irqs_common - Clear irq_data and free the parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *irq_data;
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_data = irq_domain_get_irq_data(domain, virq + i);
+ if (irq_data)
+ irq_domain_reset_irq_data(irq_data);
+ }
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+/**
+ * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
+ * @domain: Interrupt domain to match
+ * @virq: IRQ number to start with
+ * @nr_irqs: The number of irqs to free
+ */
+void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ int i;
+
+ for (i = 0; i < nr_irqs; i++) {
+ irq_set_handler_data(virq + i, NULL);
+ irq_set_handler(virq + i, NULL);
+ }
+ irq_domain_free_irqs_common(domain, virq, nr_irqs);
+}
+
+static bool irq_domain_is_auto_recursive(struct irq_domain *domain)
+{
+ return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE;
+}
+
+static void irq_domain_free_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs)
+{
+ domain->ops->free(domain, irq_base, nr_irqs);
+ if (irq_domain_is_auto_recursive(domain)) {
+ BUG_ON(!domain->parent);
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+ }
+}
+
+static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain,
+ unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
+{
+ int ret = 0;
+ struct irq_domain *parent = domain->parent;
+ bool recursive = irq_domain_is_auto_recursive(domain);
+
+ BUG_ON(recursive && !parent);
+ if (recursive)
+ ret = irq_domain_alloc_irqs_recursive(parent, irq_base,
+ nr_irqs, arg);
+ if (ret >= 0)
+ ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg);
+ if (ret < 0 && recursive)
+ irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs);
+
+ return ret;
+}
+
+/**
+ * __irq_domain_alloc_irqs - Allocate IRQs from domain
+ * @domain: domain to allocate from
+ * @irq_base: allocate specified IRQ nubmer if irq_base >= 0
+ * @nr_irqs: number of IRQs to allocate
+ * @node: NUMA node id for memory allocation
+ * @arg: domain specific argument
+ * @realloc: IRQ descriptors have already been allocated if true
+ *
+ * Allocate IRQ numbers and initialized all data structures to support
+ * hierarchy IRQ domains.
+ * Parameter @realloc is mainly to support legacy IRQs.
+ * Returns error code or allocated IRQ number
+ *
+ * The whole process to setup an IRQ has been split into two steps.
+ * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ
+ * descriptor and required hardware resources. The second step,
+ * irq_domain_activate_irq(), is to program hardwares with preallocated
+ * resources. In this way, it's easier to rollback when failing to
+ * allocate resources.
+ */
+int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
+ unsigned int nr_irqs, int node, void *arg,
+ bool realloc)
+{
+ int i, ret, virq;
+
+ if (domain == NULL) {
+ domain = irq_default_domain;
+ if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
+ return -EINVAL;
+ }
+
+ if (!domain->ops->alloc) {
+ pr_debug("domain->ops->alloc() is NULL\n");
+ return -ENOSYS;
+ }
+
+ if (realloc && irq_base >= 0) {
+ virq = irq_base;
+ } else {
+ virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node);
+ if (virq < 0) {
+ pr_debug("cannot allocate IRQ(base %d, count %d)\n",
+ irq_base, nr_irqs);
+ return virq;
+ }
+ }
+
+ if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
+ pr_debug("cannot allocate memory for IRQ%d\n", virq);
+ ret = -ENOMEM;
+ goto out_free_desc;
+ }
+
+ mutex_lock(&irq_domain_mutex);
+ ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg);
+ if (ret < 0) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_insert_irq(virq + i);
+ mutex_unlock(&irq_domain_mutex);
+
+ return virq;
+
+out_free_irq_data:
+ irq_domain_free_irq_data(virq, nr_irqs);
+out_free_desc:
+ irq_free_descs(virq, nr_irqs);
+ return ret;
+}
+
+/**
+ * irq_domain_free_irqs - Free IRQ number and associated data structures
+ * @virq: base IRQ number
+ * @nr_irqs: number of IRQs to free
+ */
+void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ int i;
+
+ if (WARN(!data || !data->domain || !data->domain->ops->free,
+ "NULL pointer, cannot free irq\n"))
+ return;
+
+ mutex_lock(&irq_domain_mutex);
+ for (i = 0; i < nr_irqs; i++)
+ irq_domain_remove_irq(virq + i);
+ irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs);
+ mutex_unlock(&irq_domain_mutex);
+
+ irq_domain_free_irq_data(virq, nr_irqs);
+ irq_free_descs(virq, nr_irqs);
+}
+
+/**
+ * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to allocate
+ * @arg: Allocation data (arch/domain specific)
+ *
+ * Check whether the domain has been setup recursive. If not allocate
+ * through the parent domain.
+ */
+int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs,
+ void *arg)
+{
+ /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */
+ if (irq_domain_is_auto_recursive(domain))
+ return 0;
+
+ domain = domain->parent;
+ if (domain)
+ return irq_domain_alloc_irqs_recursive(domain, irq_base,
+ nr_irqs, arg);
+ return -ENOSYS;
+}
+
+/**
+ * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @irq_base: Base IRQ number
+ * @nr_irqs: Number of IRQs to free
+ *
+ * Check whether the domain has been setup recursive. If not free
+ * through the parent domain.
+ */
+void irq_domain_free_irqs_parent(struct irq_domain *domain,
+ unsigned int irq_base, unsigned int nr_irqs)
+{
+ /* irq_domain_free_irqs_recursive() will call parent's free */
+ if (!irq_domain_is_auto_recursive(domain) && domain->parent)
+ irq_domain_free_irqs_recursive(domain->parent, irq_base,
+ nr_irqs);
+}
+
+/**
+ * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
+ * interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * This is the second step to call domain_ops->activate to program interrupt
+ * controllers, so the interrupt could actually get delivered.
+ */
+void irq_domain_activate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (irq_data->parent_data)
+ irq_domain_activate_irq(irq_data->parent_data);
+ if (domain->ops->activate)
+ domain->ops->activate(domain, irq_data);
+ }
+}
+
+/**
+ * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to
+ * deactivate interrupt
+ * @irq_data: outermost irq_data associated with interrupt
+ *
+ * It calls domain_ops->deactivate to program interrupt controllers to disable
+ * interrupt delivery.
+ */
+void irq_domain_deactivate_irq(struct irq_data *irq_data)
+{
+ if (irq_data && irq_data->domain) {
+ struct irq_domain *domain = irq_data->domain;
+
+ if (domain->ops->deactivate)
+ domain->ops->deactivate(domain, irq_data);
+ if (irq_data->parent_data)
+ irq_domain_deactivate_irq(irq_data->parent_data);
+ }
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+ /* Hierarchy irq_domains must implement callback alloc() */
+ if (domain->ops->alloc)
+ domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;
+}
+#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */
+/**
+ * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain
+ * @domain: domain to match
+ * @virq: IRQ number to get irq_data
+ */
+struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irq_data = irq_get_irq_data(virq);
+
+ return (irq_data && irq_data->domain == domain) ? irq_data : NULL;
+}
+
+static void irq_domain_check_hierarchy(struct irq_domain *domain)
+{
+}
+#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a9104b4608b..80692373abd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -183,6 +183,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
ret = chip->irq_set_affinity(data, mask, force);
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
cpumask_copy(data->affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
@@ -600,6 +601,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
switch (ret) {
case IRQ_SET_MASK_OK:
+ case IRQ_SET_MASK_OK_DONE:
irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
irqd_set(&desc->irq_data, flags);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
new file mode 100644
index 000000000000..3e18163f336f
--- /dev/null
+++ b/kernel/irq/msi.c
@@ -0,0 +1,330 @@
+/*
+ * linux/kernel/irq/msi.c
+ *
+ * Copyright (C) 2014 Intel Corp.
+ * Author: Jiang Liu <jiang.liu@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ *
+ * This file contains common code to support Message Signalled Interrupt for
+ * PCI compatible and non PCI compatible devices.
+ */
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/msi.h>
+
+/* Temparory solution for building, will be removed later */
+#include <linux/pci.h>
+
+void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
+{
+ *msg = entry->msg;
+}
+
+void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_get_msi_desc(irq);
+
+ __get_cached_msi_msg(entry, msg);
+}
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
+
+#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static inline void irq_chip_write_msi_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ data->chip->irq_write_msi_msg(data, msg);
+}
+
+/**
+ * msi_domain_set_affinity - Generic affinity setter function for MSI domains
+ * @irq_data: The irq data associated to the interrupt
+ * @mask: The affinity mask to set
+ * @force: Flag to enforce setting (disable online checks)
+ *
+ * Intended to be used by MSI interrupt controllers which are
+ * implemented with hierarchical domains.
+ */
+int msi_domain_set_affinity(struct irq_data *irq_data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_data *parent = irq_data->parent_data;
+ struct msi_msg msg;
+ int ret;
+
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+ }
+
+ return ret;
+}
+
+static void msi_domain_activate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static void msi_domain_deactivate(struct irq_domain *domain,
+ struct irq_data *irq_data)
+{
+ struct msi_msg msg;
+
+ memset(&msg, 0, sizeof(msg));
+ irq_chip_write_msi_msg(irq_data, &msg);
+}
+
+static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
+ int i, ret;
+
+ if (irq_find_mapping(domain, hwirq) > 0)
+ return -EEXIST;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
+ if (ret < 0) {
+ if (ops->msi_free) {
+ for (i--; i > 0; i--)
+ ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void msi_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct msi_domain_info *info = domain->host_data;
+ int i;
+
+ if (info->ops->msi_free) {
+ for (i = 0; i < nr_irqs; i++)
+ info->ops->msi_free(domain, info, virq + i);
+ }
+ irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
+
+static struct irq_domain_ops msi_domain_ops = {
+ .alloc = msi_domain_alloc,
+ .free = msi_domain_free,
+ .activate = msi_domain_activate,
+ .deactivate = msi_domain_deactivate,
+};
+
+#ifdef GENERIC_MSI_DOMAIN_OPS
+static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return arg->hwirq;
+}
+
+static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
+ struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+#else
+#define msi_domain_ops_get_hwirq NULL
+#define msi_domain_ops_prepare NULL
+#define msi_domain_ops_set_desc NULL
+#endif /* !GENERIC_MSI_DOMAIN_OPS */
+
+static int msi_domain_ops_init(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip,
+ info->chip_data);
+ if (info->handler && info->handler_name) {
+ __irq_set_handler(virq, info->handler, 0, info->handler_name);
+ if (info->handler_data)
+ irq_set_handler_data(virq, info->handler_data);
+ }
+ return 0;
+}
+
+static int msi_domain_ops_check(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ return 0;
+}
+
+static struct msi_domain_ops msi_domain_ops_default = {
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+};
+
+static void msi_domain_update_dom_ops(struct msi_domain_info *info)
+{
+ struct msi_domain_ops *ops = info->ops;
+
+ if (ops == NULL) {
+ info->ops = &msi_domain_ops_default;
+ return;
+ }
+
+ if (ops->get_hwirq == NULL)
+ ops->get_hwirq = msi_domain_ops_default.get_hwirq;
+ if (ops->msi_init == NULL)
+ ops->msi_init = msi_domain_ops_default.msi_init;
+ if (ops->msi_check == NULL)
+ ops->msi_check = msi_domain_ops_default.msi_check;
+ if (ops->msi_prepare == NULL)
+ ops->msi_prepare = msi_domain_ops_default.msi_prepare;
+ if (ops->set_desc == NULL)
+ ops->set_desc = msi_domain_ops_default.set_desc;
+}
+
+static void msi_domain_update_chip_ops(struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ BUG_ON(!chip);
+ if (!chip->irq_mask)
+ chip->irq_mask = pci_msi_mask_irq;
+ if (!chip->irq_unmask)
+ chip->irq_unmask = pci_msi_unmask_irq;
+ if (!chip->irq_set_affinity)
+ chip->irq_set_affinity = msi_domain_set_affinity;
+}
+
+/**
+ * msi_create_irq_domain - Create a MSI interrupt domain
+ * @of_node: Optional device-tree node of the interrupt controller
+ * @info: MSI domain info
+ * @parent: Parent irq domain
+ */
+struct irq_domain *msi_create_irq_domain(struct device_node *node,
+ struct msi_domain_info *info,
+ struct irq_domain *parent)
+{
+ if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
+ info);
+}
+
+/**
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
+ */
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ msi_alloc_info_t arg;
+ struct msi_desc *desc;
+ int i, ret, virq = -1;
+
+ ret = ops->msi_check(domain, info, dev);
+ if (ret == 0)
+ ret = ops->msi_prepare(domain, dev, nvec, &arg);
+ if (ret)
+ return ret;
+
+ for_each_msi_entry(desc, dev) {
+ ops->set_desc(&arg, desc);
+ if (info->flags & MSI_FLAG_IDENTITY_MAP)
+ virq = (int)ops->get_hwirq(info, &arg);
+ else
+ virq = -1;
+
+ virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ dev_to_node(dev), &arg, false);
+ if (virq < 0) {
+ ret = -ENOSPC;
+ if (ops->handle_error)
+ ret = ops->handle_error(domain, desc, ret);
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, ret);
+ return ret;
+ }
+
+ for (i = 0; i < desc->nvec_used; i++)
+ irq_set_msi_desc_off(virq, i, desc);
+ }
+
+ if (ops->msi_finish)
+ ops->msi_finish(&arg, 0);
+
+ for_each_msi_entry(desc, dev) {
+ if (desc->nvec_used == 1)
+ dev_dbg(dev, "irq %d for MSI\n", virq);
+ else
+ dev_dbg(dev, "irq [%d-%d] for MSI\n",
+ virq, virq + desc->nvec_used - 1);
+ }
+
+ return 0;
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
+ * @domain: The domain to managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are free
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_desc *desc;
+
+ for_each_msi_entry(desc, dev) {
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ desc->irq = 0;
+ }
+}
+
+/**
+ * msi_get_domain_info - Get the MSI interrupt domain info for @domain
+ * @domain: The interrupt domain to retrieve data from
+ *
+ * Returns the pointer to the msi_domain_info stored in
+ * @domain->host_data.
+ */
+struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
+{
+ return (struct msi_domain_info *)domain->host_data;
+}
+
+#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 80f7a6d00519..2777f40a9c7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -47,13 +47,6 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
-/*
- * kmod_thread_locker is used for deadlock avoidance. There is no explicit
- * locking to protect this global - it is private to the singleton khelper
- * thread and should only ever be modified by that thread.
- */
-static const struct task_struct *kmod_thread_locker;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info)
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
- int wait = sub_info->wait & ~UMH_KILLABLE;
struct cred *new;
int retval;
@@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data)
out:
sub_info->retval = retval;
/* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
- if (wait != UMH_WAIT_PROC)
+ if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
return 0;
do_exit(0);
}
-static int call_helper(void *data)
-{
- /* Worker thread started blocking khelper thread. */
- kmod_thread_locker = current;
- return ____call_usermodehelper(data);
-}
-
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
- /* CLONE_VFORK: wait until the usermode helper has execve'd
- * successfully We need the data structures to stay around
- * until that is done. */
- if (wait == UMH_WAIT_PROC)
+ if (sub_info->wait & UMH_WAIT_PROC)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
- else {
- pid = kernel_thread(call_helper, sub_info,
- CLONE_VFORK | SIGCHLD);
- /* Worker thread stopped blocking khelper thread. */
- kmod_thread_locker = NULL;
- }
+ else
+ pid = kernel_thread(____call_usermodehelper, sub_info,
+ SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
@@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
goto out;
}
/*
- * Worker thread must not wait for khelper thread at below
- * wait_for_completion() if the thread was created with CLONE_VFORK
- * flag, for khelper thread is already waiting for the thread at
- * wait_for_completion() in do_fork().
- */
- if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
- retval = -EBUSY;
- goto out;
- }
-
- /*
* Set the completion pointer only if there is a waiter.
* This makes it possible to use umh_complete to free
* the data structure in case of UMH_NO_WAIT.
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3995f546d0f3..831978cebf1d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
- .flags = FTRACE_OPS_FL_SAVE_REGS,
+ .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
* reschedule now, before we try-lock the mutex. This avoids getting
* scheduled out right after we obtained the mutex.
*/
- if (need_resched())
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
schedule_preempt_disabled();
+ }
return false;
}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void)
}
/*
+ * Can't use wait_event_interruptible() because our condition
+ * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
+ */
+static int wait_finished_loading(struct module *mod)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ add_wait_queue(&module_wq, &wait);
+ for (;;) {
+ if (finished_loading(mod->name))
+ break;
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&module_wq, &wait);
+
+ return ret;
+}
+
+/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
* memory exhaustion.
@@ -3116,8 +3142,8 @@ again:
|| old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
+
+ err = wait_finished_loading(mod);
if (err)
goto out_unlocked;
goto again;
diff --git a/kernel/panic.c b/kernel/panic.c
index cf80672b7924..4d8d6f906dec 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -33,6 +33,7 @@ static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
static bool crash_kexec_post_notifiers;
+int panic_on_warn __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
if (args)
vprintk(args->fmt, args->args);
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ panic("panic_on_warn set ...\n");
+ }
+
print_modules();
dump_stack();
print_oops_end_marker();
@@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+core_param(panic_on_warn, panic_on_warn, int, 0644);
static int __init setup_crash_kexec_post_notifiers(char *s)
{
diff --git a/kernel/pid.c b/kernel/pid.c
index 9b9a26698144..82430c858d69 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -341,6 +341,8 @@ out:
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ put_pid_ns(ns);
+
out_free:
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index db95d8eb761b..bc6d6a89b6e6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -190,7 +190,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
- /* Ignore SIGCHLD causing any terminated children to autoreap */
+ /*
+ * Ignore SIGCHLD causing any terminated children to autoreap.
+ * This speeds up the namespace shutdown, plus see the comment
+ * below.
+ */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
spin_unlock_irq(&me->sighand->siglock);
@@ -223,15 +227,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
- /* Firstly reap the EXIT_ZOMBIE children we may have. */
+ /*
+ * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
+ * sys_wait4() will also block until our children traced from the
+ * parent namespace are detached and become EXIT_DEAD.
+ */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
/*
- * sys_wait4() above can't reap the TASK_DEAD children.
- * Make sure they all go away, see free_pid().
+ * sys_wait4() above can't reap the EXIT_DEAD children but we do not
+ * really care, we could reparent them to the global init. We could
+ * exit and reap ->child_reaper even if it is not the last thread in
+ * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+ * pid_ns can not go away until proc_kill_sb() drops the reference.
+ *
+ * But this ns can also have other tasks injected by setns()+fork().
+ * Again, ignoring the user visible semantics we do not really need
+ * to wait until they are all reaped, but they can be reparented to
+ * us and thus we need to ensure that pid->child_reaper stays valid
+ * until they all go away. See free_pid()->wake_up_process().
+ *
+ * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
+ * if reparented.
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index bbef57f5bdfd..6e7708c2c21f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -94,6 +94,7 @@ config PM_STD_PARTITION
config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
+ select PM_RUNTIME
config PM_SLEEP_SMP
def_bool y
@@ -131,7 +132,6 @@ config PM_WAKELOCKS_GC
config PM_RUNTIME
bool "Run-time PM core functionality"
- depends on !IA64_HP_SIM
---help---
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states at run time (or autosuspended) after a specified
@@ -298,14 +298,9 @@ config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
-config PM_GENERIC_DOMAINS_RUNTIME
- def_bool y
- depends on PM_RUNTIME && PM_GENERIC_DOMAINS
-
config PM_GENERIC_DOMAINS_OF
def_bool y
depends on PM_GENERIC_DOMAINS && OF
config CPU_PM
bool
- depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f35a3478f3c..2329daae5255 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <linux/ktime.h>
#include <trace/events/power.h>
#include "power.h"
@@ -232,20 +233,17 @@ static void platform_recover(int platform_mode)
* @nr_pages: Number of memory pages processed between @start and @stop.
* @msg: Additional diagnostic message to print.
*/
-void swsusp_show_speed(struct timeval *start, struct timeval *stop,
- unsigned nr_pages, char *msg)
+void swsusp_show_speed(ktime_t start, ktime_t stop,
+ unsigned nr_pages, char *msg)
{
+ ktime_t diff;
u64 elapsed_centisecs64;
unsigned int centisecs;
unsigned int k;
unsigned int kps;
- elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
- /*
- * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
- * it is obvious enough for what went wrong.
- */
- do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
+ diff = ktime_sub(stop, start);
+ elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 2df883a9d3cb..ce9b8328a689 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain);
struct timeval;
/* kernel/power/swsusp.c */
-extern void swsusp_show_speed(struct timeval *, struct timeval *,
- unsigned int, char *);
+extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
#ifdef CONFIG_SUSPEND
/* kernel/power/suspend.c */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 791a61892bb5..0c40c16174b4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -28,6 +28,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/compiler.h>
+#include <linux/ktime.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void)
struct zone *zone;
unsigned long saveable, size, max_size, count, highmem, pages = 0;
unsigned long alloc, save_highmem, pages_highmem, avail_normal;
- struct timeval start, stop;
+ ktime_t start, stop;
int error;
printk(KERN_INFO "PM: Preallocating image memory... ");
- do_gettimeofday(&start);
+ start = ktime_get();
error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
if (error)
@@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void)
free_unnecessary_pages();
out:
- do_gettimeofday(&stop);
+ stop = ktime_get();
printk(KERN_CONT "done (allocated %lu pages)\n", pages);
- swsusp_show_speed(&start, &stop, pages, "Allocated");
+ swsusp_show_speed(start, stop, pages, "Allocated");
return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index aaa3261dea5d..570aff817543 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,6 +30,7 @@
#include <linux/atomic.h>
#include <linux/kthread.h>
#include <linux/crc32.h>
+#include <linux/ktime.h>
#include "power.h"
@@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
nr_to_write);
@@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
while (1) {
ret = snapshot_read_next(snapshot);
if (ret <= 0)
@@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
return ret;
}
@@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
int nr_pages;
int err2;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
size_t off;
unsigned thr, run_threads, nr_threads;
unsigned char *page = NULL;
@@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for (;;) {
for (thr = 0; thr < nr_threads; thr++) {
for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
@@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
out_finish:
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret)
printk(KERN_INFO "PM: Image saving done.\n");
- swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
+ swsusp_show_speed(start, stop, nr_to_write, "Wrote");
out_clean:
if (crc) {
if (crc->thr)
@@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle,
{
unsigned int m;
int ret = 0;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
struct bio *bio;
int err2;
unsigned nr_pages;
@@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
for ( ; ; ) {
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle,
nr_pages++;
}
err2 = hib_wait_on_bio_chain(&bio);
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret)
ret = err2;
if (!ret) {
@@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle,
if (!snapshot_image_loaded(snapshot))
ret = -ENODATA;
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
return ret;
}
@@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
int ret = 0;
int eof = 0;
struct bio *bio;
- struct timeval start;
- struct timeval stop;
+ ktime_t start;
+ ktime_t stop;
unsigned nr_pages;
size_t off;
unsigned i, thr, run_threads, nr_threads;
@@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
m = 1;
nr_pages = 0;
bio = NULL;
- do_gettimeofday(&start);
+ start = ktime_get();
ret = snapshot_write_next(snapshot);
if (ret <= 0)
@@ -1343,7 +1344,7 @@ out_finish:
wait_event(crc->done, atomic_read(&crc->stop));
atomic_set(&crc->stop, 0);
}
- do_gettimeofday(&stop);
+ stop = ktime_get();
if (!ret) {
printk(KERN_INFO "PM: Image loading done.\n");
snapshot_write_finalize(snapshot);
@@ -1359,7 +1360,7 @@ out_finish:
}
}
}
- swsusp_show_speed(&start, &stop, nr_to_read, "Read");
+ swsusp_show_speed(start, stop, nr_to_read, "Read");
out_clean:
for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
@@ -1374,7 +1375,7 @@ out_clean:
kthread_stop(data[thr].thr);
vfree(data);
}
- if (page) vfree(page);
+ vfree(page);
return ret;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ced2b84b1cb7..f900dc9f6822 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -62,9 +62,6 @@ int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
-/* Deferred messaged from sched code are marked by this special level */
-#define SCHED_MESSAGE_LOGLEVEL -2
-
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -480,7 +477,7 @@ static int syslog_action_restricted(int type)
type != SYSLOG_ACTION_SIZE_BUFFER;
}
-static int check_syslog_permissions(int type, bool from_file)
+int check_syslog_permissions(int type, bool from_file)
{
/*
* If this is from /proc/kmsg and we've already opened it, then we've
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
bool clear = false;
- static int saved_console_loglevel = -1;
+ static int saved_console_loglevel = LOGLEVEL_DEFAULT;
int error;
error = check_syslog_permissions(type, from_file);
@@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
- if (saved_console_loglevel == -1)
+ if (saved_console_loglevel == LOGLEVEL_DEFAULT)
saved_console_loglevel = console_loglevel;
console_loglevel = minimum_console_loglevel;
break;
/* Enable logging to console */
case SYSLOG_ACTION_CONSOLE_ON:
- if (saved_console_loglevel != -1) {
+ if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
console_loglevel = saved_console_loglevel;
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
}
break;
/* Set level of messages printed to console */
@@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
len = minimum_console_loglevel;
console_loglevel = len;
/* Implicitly re-enable logging to console */
- saved_console_loglevel = -1;
+ saved_console_loglevel = LOGLEVEL_DEFAULT;
error = 0;
break;
/* Number of chars in the log buffer */
@@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level,
int printed_len = 0;
bool in_sched = false;
/* cpu currently holding logbuf_lock in this function */
- static volatile unsigned int logbuf_cpu = UINT_MAX;
+ static unsigned int logbuf_cpu = UINT_MAX;
- if (level == SCHED_MESSAGE_LOGLEVEL) {
- level = -1;
+ if (level == LOGLEVEL_SCHED) {
+ level = LOGLEVEL_DEFAULT;
in_sched = true;
}
@@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level,
const char *end_of_header = printk_skip_level(text);
switch (kern_level) {
case '0' ... '7':
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
+ /* fallthrough */
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
}
@@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
}
- if (level == -1)
+ if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
if (dict)
@@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit);
asmlinkage int vprintk(const char *fmt, va_list args)
{
- return vprintk_emit(0, -1, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL(vprintk);
@@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level,
}
EXPORT_SYMBOL(printk_emit);
+int vprintk_default(const char *fmt, va_list args)
+{
+ int r;
+
+#ifdef CONFIG_KGDB_KDB
+ if (unlikely(kdb_trap_printk)) {
+ r = vkdb_printf(fmt, args);
+ return r;
+ }
+#endif
+ r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(vprintk_default);
+
+/*
+ * This allows printk to be diverted to another function per cpu.
+ * This is useful for calling printk functions from within NMI
+ * without worrying about race conditions that can lock up the
+ * box.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -1830,19 +1852,15 @@ EXPORT_SYMBOL(printk_emit);
*/
asmlinkage __visible int printk(const char *fmt, ...)
{
+ printk_func_t vprintk_func;
va_list args;
int r;
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(kdb_trap_printk)) {
- va_start(args, fmt);
- r = vkdb_printf(fmt, args);
- va_end(args);
- return r;
- }
-#endif
va_start(args, fmt);
- r = vprintk_emit(0, -1, NULL, 0, fmt, args);
+ preempt_disable();
+ vprintk_func = this_cpu_read(printk_func);
+ r = vprintk_func(fmt, args);
+ preempt_enable();
va_end(args);
return r;
@@ -1876,28 +1894,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
bool syslog, char *buf, size_t size) { return 0; }
static size_t cont_print_text(char *text, size_t size) { return 0; }
+/* Still needs to be defined for users */
+DEFINE_PER_CPU(printk_func_t, printk_func);
+
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
struct console *early_console;
-void early_vprintk(const char *fmt, va_list ap)
-{
- if (early_console) {
- char buf[512];
- int n = vscnprintf(buf, sizeof(buf), fmt, ap);
-
- early_console->write(early_console, buf, n);
- }
-}
-
asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
+ char buf[512];
+ int n;
+
+ if (!early_console)
+ return;
va_start(ap, fmt);
- early_vprintk(fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
va_end(ap);
+
+ early_console->write(early_console, buf, n);
}
#endif
@@ -2634,7 +2652,7 @@ int printk_deferred(const char *fmt, ...)
preempt_disable();
va_start(args, fmt);
- r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
va_end(args);
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 54e75226c2c4..1eb9d90c3af9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
/*
* Detach all tasks we were using ptrace on. Called with tasklist held
- * for writing, and returns with it held too. But note it can release
- * and reacquire the lock.
+ * for writing.
*/
-void exit_ptrace(struct task_struct *tracer)
- __releases(&tasklist_lock)
- __acquires(&tasklist_lock)
+void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
struct task_struct *p, *n;
- LIST_HEAD(ptrace_dead);
-
- if (likely(list_empty(&tracer->ptraced)))
- return;
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (unlikely(p->ptrace & PT_EXITKILL))
send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
if (__ptrace_detach(tracer, p))
- list_add(&p->ptrace_entry, &ptrace_dead);
- }
-
- write_unlock_irq(&tasklist_lock);
- BUG_ON(!list_empty(&tracer->ptraced));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
- list_del_init(&p->ptrace_entry);
- release_task(p);
+ list_add(&p->ptrace_entry, dead);
}
-
- write_lock_irq(&tasklist_lock);
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
obj-y += update.o srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+void rcu_early_boot_tests(void);
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 240fa9094f83..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg)
cur_ops->cb_barrier();
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
+ vfree(rhp);
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c0623fc47125..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -247,7 +247,7 @@ void rcu_bh_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
if (user || rcu_is_cpu_rrupt_from_idle())
@@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_init(void)
+void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+ rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9815447d22e0..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}; \
-DEFINE_PER_CPU(struct rcu_data, sname##_data)
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled preemption.
*/
-void rcu_note_context_switch(int cpu)
+void rcu_note_context_switch(void)
{
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
- rcu_preempt_note_context_switch(cpu);
+ rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
@@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
-static int rcu_pending(int cpu);
+static int rcu_pending(void);
/*
* Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
* we really have entered idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
- bool user)
+static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
@@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
- rcu_prepare_for_idle(smp_processor_id());
+ rcu_prepare_for_idle();
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
@@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user)
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(rdtp, oldval, user);
+ rcu_eqs_enter_common(oldval, user);
} else {
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
@@ -589,7 +576,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -639,8 +626,8 @@ void rcu_irq_exit(void)
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_enter_common(rdtp, oldval, true);
- rcu_sysidle_enter(rdtp, 1);
+ rcu_eqs_enter_common(oldval, true);
+ rcu_sysidle_enter(1);
local_irq_restore(flags);
}
@@ -651,16 +638,17 @@ void rcu_irq_exit(void)
* we really have exited idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
- int user)
+static void rcu_eqs_exit_common(long long oldval, int user)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
rcu_dynticks_task_exit();
smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle(smp_processor_id());
+ rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
@@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user)
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(rdtp, oldval, user);
+ rcu_eqs_exit_common(oldval, user);
}
}
@@ -712,7 +700,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -763,8 +751,8 @@ void rcu_irq_enter(void)
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_exit_common(rdtp, oldval, true);
- rcu_sysidle_exit(rdtp, 1);
+ rcu_eqs_exit_common(oldval, true);
+ rcu_sysidle_exit(1);
local_irq_restore(flags);
}
@@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
@@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user)
rcu_bh_qs();
}
- rcu_preempt_check_callbacks(cpu);
- if (rcu_pending(cpu))
+ rcu_preempt_check_callbacks();
+ if (rcu_pending())
invoke_rcu_core();
if (user)
rcu_note_voluntary_context_switch(current);
@@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
+ cpumask_var_t cm;
+ bool cma = false;
+ int cpu;
long firstsnap, s, snap;
int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
@@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
+ if (cma) {
+ cpumask_copy(cm, cpu_online_mask);
+ cpumask_clear_cpu(raw_smp_processor_id(), cm);
+ for_each_cpu(cpu, cm) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ cpumask_clear_cpu(cpu, cm);
+ }
+ if (cpumask_weight(cm) == 0)
+ goto all_cpus_idle;
+ }
+
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cpu_online_mask,
+ while (try_stop_cpus(cma ? cm : cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
@@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
+ free_cpumask_var(cm);
return;
}
@@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
@@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
+ free_cpumask_var(cm);
return;
}
@@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void)
/* CPU hotplug operation in flight, use normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
snap = atomic_long_read(&rsp->expedited_start);
@@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void)
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
+all_cpus_idle:
+ free_cpumask_var(cm);
+
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
-static int rcu_pending(int cpu)
+static int rcu_pending(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
return 1;
return 0;
}
@@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!rdp->nxtlist)
continue;
hc = true;
@@ -3485,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp)
+ for_each_rcu_flavor(rsp) {
rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
break;
default:
break;
@@ -3766,6 +3781,8 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+
+ rcu_early_boot_tests();
}
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bbdc45d8d74f..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
- /* complete (only for TREE_PREEMPT_RCU). */
+ /* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
+static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
+#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -579,8 +579,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
#endif /* #ifdef CONFIG_RCU_BOOST */
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(int cpu);
-static void rcu_prepare_for_idle(int cpu);
+static void rcu_cleanup_after_idle(void);
+static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
@@ -606,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_enter(int irq);
+static void rcu_sysidle_exit(int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1d7f27bd38f..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
#include <linux/smpboot.h>
#include "../time/tick-internal.h"
-#define RCU_KTHREAD_PRIO 1
-
#ifdef CONFIG_RCU_BOOST
+
#include "../locking/rtmutex_common.h"
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else
-#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
-#endif
+
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
pr_info("\tRCU torture testing starts during boot.\n");
#endif
-#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
- pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
-#endif
#if defined(CONFIG_RCU_CPU_STALL_INFO)
pr_info("\tAdditional per-CPU info printed with stalls.\n");
#endif
@@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#endif
}
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -156,7 +166,7 @@ static void rcu_preempt_qs(void)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
}
}
-#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
-
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period on the specified rcu_node structure.
@@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
#ifdef CONFIG_RCU_CPU_STALL_INFO
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
*
* Caller must disable hard irqs.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
struct task_struct *t = current;
@@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- per_cpu(rcu_preempt_data, cpu).qs_pending &&
- !per_cpu(rcu_preempt_data, cpu).passed_quiesce)
+ __this_cpu_read(rcu_preempt_data.qs_pending) &&
+ !__this_cpu_read(rcu_preempt_data.passed_quiesce))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -919,7 +919,7 @@ void exit_rcu(void)
__rcu_read_unlock();
}
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *rcu_state_p = &rcu_sched_state;
@@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
}
@@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
}
@@ -1070,7 +1070,7 @@ void exit_rcu(void)
{
}
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
@@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
smp_mb__after_unlock_lock();
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_BOOST_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
@@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
@@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
* any flavor of RCU.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu, NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
* after it.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
}
@@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
* Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
* is nothing.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
}
@@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* The caller must have disabled interrupts.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *dj)
+int rcu_needs_cpu(unsigned long *dj)
{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
*dj = ULONG_MAX;
return 0;
}
@@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*
* The caller must have disabled interrupts.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
@@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu, NULL))
+ if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/*
@@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
@@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -2573,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
rdp->nocb_leader = rdp_spawn;
if (rdp_last && rdp != rdp_spawn)
rdp_last->nocb_next_follower = rdp;
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
+ if (rdp == rdp_spawn) {
+ rdp = rdp->nocb_next_follower;
+ } else {
+ rdp_last = rdp;
+ rdp = rdp->nocb_next_follower;
+ rdp_last->nocb_next_follower = NULL;
+ }
} while (rdp);
rdp_spawn->nocb_next_follower = rdp_old_leader;
}
@@ -2761,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
unsigned long j;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
@@ -2832,8 +2837,10 @@ void rcu_sysidle_force_exit(void)
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
@@ -3127,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
}
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ef8ba58694e..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_head *next;
LIST_HEAD(rcu_tasks_holdouts);
- /* FIXME: Add housekeeping affinity. */
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current);
/*
* Each pass through the following loop makes one check for
@@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void)
}
#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Early boot self test parameters, one for each flavor
+ */
+static bool rcu_self_test;
+static bool rcu_self_test_bh;
+static bool rcu_self_test_sched;
+
+module_param(rcu_self_test, bool, 0444);
+module_param(rcu_self_test_bh, bool, 0444);
+module_param(rcu_self_test_sched, bool, 0444);
+
+static int rcu_self_test_counter;
+
+static void test_callback(struct rcu_head *r)
+{
+ rcu_self_test_counter++;
+ pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
+}
+
+static void early_boot_test_call_rcu(void)
+{
+ static struct rcu_head head;
+
+ call_rcu(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_bh(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_bh(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_sched(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_sched(&head, test_callback);
+}
+
+void rcu_early_boot_tests(void)
+{
+ pr_info("Running RCU self tests\n");
+
+ if (rcu_self_test)
+ early_boot_test_call_rcu();
+ if (rcu_self_test_bh)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
+}
+
+static int rcu_verify_early_boot_tests(void)
+{
+ int ret = 0;
+ int early_boot_test_counter = 0;
+
+ if (rcu_self_test) {
+ early_boot_test_counter++;
+ rcu_barrier();
+ }
+ if (rcu_self_test_bh) {
+ early_boot_test_counter++;
+ rcu_barrier_bh();
+ }
+ if (rcu_self_test_sched) {
+ early_boot_test_counter++;
+ rcu_barrier_sched();
+ }
+
+ if (rcu_self_test_counter != early_boot_test_counter) {
+ WARN_ON(1);
+ ret = -1;
+ }
+
+ return ret;
+}
+late_initcall(rcu_verify_early_boot_tests);
+#else
+void rcu_early_boot_tests(void) {}
+#endif /* CONFIG_PROVE_RCU */
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
deleted file mode 100644
index e791130f85a7..000000000000
--- a/kernel/res_counter.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * resource cgroups
- *
- * Copyright 2007 OpenVZ SWsoft Inc
- *
- * Author: Pavel Emelianov <xemul@openvz.org>
- *
- */
-
-#include <linux/types.h>
-#include <linux/parser.h>
-#include <linux/fs.h>
-#include <linux/res_counter.h>
-#include <linux/uaccess.h>
-#include <linux/mm.h>
-
-void res_counter_init(struct res_counter *counter, struct res_counter *parent)
-{
- spin_lock_init(&counter->lock);
- counter->limit = RES_COUNTER_MAX;
- counter->soft_limit = RES_COUNTER_MAX;
- counter->parent = parent;
-}
-
-static u64 res_counter_uncharge_locked(struct res_counter *counter,
- unsigned long val)
-{
- if (WARN_ON(counter->usage < val))
- val = counter->usage;
-
- counter->usage -= val;
- return counter->usage;
-}
-
-static int res_counter_charge_locked(struct res_counter *counter,
- unsigned long val, bool force)
-{
- int ret = 0;
-
- if (counter->usage + val > counter->limit) {
- counter->failcnt++;
- ret = -ENOMEM;
- if (!force)
- return ret;
- }
-
- counter->usage += val;
- if (counter->usage > counter->max_usage)
- counter->max_usage = counter->usage;
- return ret;
-}
-
-static int __res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at, bool force)
-{
- int ret, r;
- unsigned long flags;
- struct res_counter *c, *u;
-
- r = ret = 0;
- *limit_fail_at = NULL;
- local_irq_save(flags);
- for (c = counter; c != NULL; c = c->parent) {
- spin_lock(&c->lock);
- r = res_counter_charge_locked(c, val, force);
- spin_unlock(&c->lock);
- if (r < 0 && !ret) {
- ret = r;
- *limit_fail_at = c;
- if (!force)
- break;
- }
- }
-
- if (ret < 0 && !force) {
- for (u = counter; u != c; u = u->parent) {
- spin_lock(&u->lock);
- res_counter_uncharge_locked(u, val);
- spin_unlock(&u->lock);
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
-int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, false);
-}
-
-int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at)
-{
- return __res_counter_charge(counter, val, limit_fail_at, true);
-}
-
-u64 res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
-{
- unsigned long flags;
- struct res_counter *c;
- u64 ret = 0;
-
- local_irq_save(flags);
- for (c = counter; c != top; c = c->parent) {
- u64 r;
- spin_lock(&c->lock);
- r = res_counter_uncharge_locked(c, val);
- if (c == counter)
- ret = r;
- spin_unlock(&c->lock);
- }
- local_irq_restore(flags);
- return ret;
-}
-
-u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
-{
- return res_counter_uncharge_until(counter, NULL, val);
-}
-
-static inline unsigned long long *
-res_counter_member(struct res_counter *counter, int member)
-{
- switch (member) {
- case RES_USAGE:
- return &counter->usage;
- case RES_MAX_USAGE:
- return &counter->max_usage;
- case RES_LIMIT:
- return &counter->limit;
- case RES_FAILCNT:
- return &counter->failcnt;
- case RES_SOFT_LIMIT:
- return &counter->soft_limit;
- };
-
- BUG();
- return NULL;
-}
-
-ssize_t res_counter_read(struct res_counter *counter, int member,
- const char __user *userbuf, size_t nbytes, loff_t *pos,
- int (*read_strategy)(unsigned long long val, char *st_buf))
-{
- unsigned long long *val;
- char buf[64], *s;
-
- s = buf;
- val = res_counter_member(counter, member);
- if (read_strategy)
- s += read_strategy(*val, s);
- else
- s += sprintf(s, "%llu\n", *val);
- return simple_read_from_buffer((void __user *)userbuf, nbytes,
- pos, buf, s - buf);
-}
-
-#if BITS_PER_LONG == 32
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&counter->lock, flags);
- ret = *res_counter_member(counter, member);
- spin_unlock_irqrestore(&counter->lock, flags);
-
- return ret;
-}
-#else
-u64 res_counter_read_u64(struct res_counter *counter, int member)
-{
- return *res_counter_member(counter, member);
-}
-#endif
-
-int res_counter_memparse_write_strategy(const char *buf,
- unsigned long long *resp)
-{
- char *end;
- unsigned long long res;
-
- /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
- if (*buf == '-') {
- int rc = kstrtoull(buf + 1, 10, &res);
-
- if (rc)
- return rc;
- if (res != 1)
- return -EINVAL;
- *resp = RES_COUNTER_MAX;
- return 0;
- }
-
- res = memparse(buf, &end);
- if (*end != '\0')
- return -EINVAL;
-
- if (PAGE_ALIGN(res) >= res)
- res = PAGE_ALIGN(res);
- else
- res = RES_COUNTER_MAX;
-
- *resp = res;
-
- return 0;
-}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
+ * for IO (which traditionally means blkio only).
*/
void __sched wait_for_completion_io(struct completion *x)
{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 24beb9bb4c3e..b5797b78add6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possble rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !p->on_rq);
#ifdef CONFIG_LOCKDEP
/*
@@ -1407,7 +1411,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!is_idle_task(rq->curr))
- return;
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
/* Else cpu is not in idle, do nothing here */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+
+out:
+ rcu_read_unlock();
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
+ p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
- INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
}
#endif
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* We must be sure that accepting a new task (or allowing changing the
* parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
/**
* finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
*/
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
+ struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
tick_nohz_task_switch(current);
+ return rq;
}
#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ struct rq *rq;
- /*
- * FIXME: do we need to worry about rq being invalidated by the
- * task_switch?
- */
+ /* finish_task_switch() drops rq->lock and enables preemtion */
+ preempt_disable();
+ rq = finish_task_switch(prev);
post_schedule(rq);
+ preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
*/
-static inline void
+static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
-
barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+
+ return finish_task_switch(prev);
}
/*
@@ -2773,7 +2760,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
prev = rq->curr;
schedule_debug(prev);
@@ -2826,15 +2813,8 @@ need_resched:
rq->curr = next;
++*switch_count;
- context_switch(rq, prev, next); /* unlocks the rq */
- /*
- * The context switch have flipped the stack from under us
- * and restored the local variables which were saved when
- * this task called schedule() in the past. prev == current
- * is still correct, but it can be moved to another cpu/rq.
- */
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
} else
raw_spin_unlock_irq(&rq->lock);
@@ -2874,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
* or we have been woken up remotely but the IPI has not yet arrived,
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
*/
- user_exit();
+ enum ctx_state prev_state = exception_enter();
schedule();
- user_enter();
+ exception_exit(prev_state);
}
#endif
@@ -4543,8 +4527,10 @@ void sched_show_task(struct task_struct *p)
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
#endif
+ ppid = 0;
rcu_read_lock();
- ppid = task_pid_nr(rcu_dereference(p->real_parent));
+ if (pid_alive(p))
+ ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
task_pid_nr(p), ppid,
@@ -4649,6 +4635,81 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
#ifdef CONFIG_SMP
/*
* move_queued_task - move a queued task to new rq.
@@ -6099,7 +6160,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
@@ -6271,7 +6334,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
}
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
{
int i;
@@ -6286,6 +6349,56 @@ static bool find_numa_distance(int distance)
return false;
}
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
@@ -6422,6 +6535,9 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
}
static void sched_domains_numa_masks_set(int cpu)
@@ -7174,6 +7290,25 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ if (WARN_ONCE(current->state != TASK_RUNNING,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change))
+ __set_current_state(TASK_RUNNING);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7205,7 +7340,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
#endif
dump_stack();
}
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 28fa9d9e9201..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -633,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
__dequeue_task_dl(rq, curr, 0);
if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -933,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (sd_flag != SD_BALANCE_WAKE)
goto out;
rq = cpu_rq(cpu);
@@ -1018,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
hrtick_start(rq, p->dl.runtime);
}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+}
#endif
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1071,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
/* Running task will never be pushed. */
dequeue_pushable_dl_task(rq, p);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
-#endif
set_post_schedule(rq);
@@ -1093,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
start_hrtick_dl(rq, p);
-#endif
}
static void task_fork_dl(struct task_struct *p)
@@ -1333,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ int ret = 0;
if (!rq->dl.overloaded)
return 0;
@@ -1378,7 +1374,6 @@ retry:
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
*/
- dequeue_pushable_dl_task(rq, next_task);
goto out;
}
@@ -1394,6 +1389,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ ret = 1;
resched_curr(later_rq);
@@ -1402,7 +1398,7 @@ retry:
out:
put_task_struct(next_task);
- return 1;
+ return ret;
}
static void push_dl_tasks(struct rq *rq)
@@ -1508,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
- dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
}
@@ -1517,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq;
+ struct root_domain *src_rd;
int weight;
BUG_ON(!dl_task(p));
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
/*
* Update only if the task is actually running (i.e.,
* it is on the rq AND it is not throttled).
@@ -1537,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
- rq = task_rq(p);
-
/*
* The process used to be able to migrate OR it can now migrate
*/
@@ -1586,22 +1603,48 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * A caller of us must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ cancel_dl_timer(rq, p);
__dl_clear_params(p);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
@@ -1622,7 +1665,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
@@ -1704,3 +1748,12 @@ const struct sched_class dl_sched_class = {
.update_curr = update_curr_dl,
};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults_memory)
- nr_faults = p->numa_faults_memory[2*node + i];
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ef2b104b254c..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -873,7 +873,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;
struct rcu_head rcu;
nodemask_t active_nodes;
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
/*
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1089,6 +1174,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}
if (max_faults) {
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;
@@ -1799,27 +1996,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}
@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
return;
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;
- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
@@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else {
+ } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
if (sd_flag & SD_BALANCE_WAKE)
want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
@@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
+ sds->local_stat.group_has_free_capacity) {
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ sgs->group_type = group_classify(sg, sgs);
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 20bca398084a..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (p->nr_cpus_allowed == 1)
- goto out;
-
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1351,16 +1348,22 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->nr_cpus_allowed == 1)
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
-
/*
* There appears to be other cpus that can accept
* current and none to run 'p', so lets reschedule
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2df8ef067cc5..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
u64 bw, total_bw;
};
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
extern struct mutex sched_domains_mutex;
#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serliazed by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1504,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/kthread.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
}
EXPORT_SYMBOL(autoremove_wake_function);
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply write barrier and the users expects write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..16a305295256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
local_irq_restore(*flags);
break;
}
-
+ /*
+ * This sighand can be already freed and even reused, but
+ * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * initializes ->siglock: this slab can't go away, it has
+ * the same object type, ->siglock can't be reinitialized.
+ *
+ * We need to ensure that tsk->sighand is still the same
+ * after we take the lock, we can race with de_thread() or
+ * __exit_signal(). In the latter case the next iteration
+ * must see ->sighand == NULL.
+ */
spin_lock(&sighand->siglock);
if (likely(sighand == tsk->sighand)) {
rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
int error = -ESRCH;
struct task_struct *p;
- rcu_read_lock();
-retry:
- p = pid_task(pid, PIDTYPE_PID);
- if (p) {
- error = group_send_sig_info(sig, info, p);
- if (unlikely(error == -ESRCH))
- /*
- * The task was unhashed in between, try again.
- * If it is dead, pid_task() will return NULL,
- * if we race with de_thread() it will find the
- * new leader.
- */
- goto retry;
- }
- rcu_read_unlock();
+ for (;;) {
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+ rcu_read_unlock();
+ if (likely(!p || error != -ESRCH))
+ return error;
- return error;
+ /*
+ * The task was unhashed in between, try again. If it
+ * is dead, pid_task() will return NULL, if we race with
+ * de_thread() it will find the new leader.
+ */
+ }
}
int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
@@ -2748,6 +2756,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
+#ifdef SEGV_BNDERR
+ err |= __put_user(from->si_lower, &to->si_lower);
+ err |= __put_user(from->si_upper, &to->si_upper);
+#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
set_current_state(TASK_INTERRUPTIBLE);
preempt_disable();
if (kthread_should_stop()) {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->cleanup)
ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
/* Check for state change setup */
switch (td->status) {
case HP_THREAD_NONE:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->setup)
ht->setup(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
+
case HP_THREAD_PARKED:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->unpark)
ht->unpark(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
}
if (!ht->thread_should_run(td->cpu)) {
- preempt_enable();
+ preempt_enable_no_resched();
schedule();
} else {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0699add19164..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
local_irq_enable();
cond_resched();
return;
diff --git a/kernel/sys.c b/kernel/sys.c
index 1eaa2f0b0246..a8c9f5a7dda6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -91,6 +91,12 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
+#ifndef MPX_ENABLE_MANAGEMENT
+# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL)
+#endif
+#ifndef MPX_DISABLE_MANAGEMENT
+# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -2203,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
me->mm->def_flags &= ~VM_NOHUGEPAGE;
up_write(&me->mm->mmap_sem);
break;
+ case PR_MPX_ENABLE_MANAGEMENT:
+ error = MPX_ENABLE_MANAGEMENT(me);
+ break;
+ case PR_MPX_DISABLE_MANAGEMENT:
+ error = MPX_DISABLE_MANAGEMENT(me);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 15f2511a1b7c..7c54ff79afd7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1104,6 +1104,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+ {
+ .procname = "panic_on_warn",
+ .data = &panic_on_warn,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 9a4f750a2963..7e7746a42a62 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
+ { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" },
{}
};
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b312fcc73024..670fff88a961 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
stats = nla_data(na);
memset(stats, 0, sizeof(*stats));
- rc = cgroupstats_build(stats, f.file->f_dentry);
+ rc = cgroupstats_build(stats, f.file->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
goto err;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
-obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
+obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
$(obj)/time.o: $(obj)/timeconst.h
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b5741fc4110..1f4356037a7d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
last_jiffies = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ if (rcu_needs_cpu(&rcu_delta_jiffies) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..65015ff2f07c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+/*
+ * mktime64 - Converts date to seconds.
+ * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
*
@@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc);
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines where long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
*/
-unsigned long
-mktime(const unsigned int year0, const unsigned int mon0,
- const unsigned int day, const unsigned int hour,
- const unsigned int min, const unsigned int sec)
+time64_t mktime64(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
{
unsigned int mon = mon0, year = year0;
@@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
year -= 1;
}
- return ((((unsigned long)
+ return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
)*24 + hour /* now have hours */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
-
-EXPORT_SYMBOL(mktime);
+EXPORT_SYMBOL(mktime64);
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..6a931852082f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -417,7 +417,8 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
- s64 nsec;
+ u64 seconds;
+ u32 nsec;
/*
* The xtime based monotonic readout is:
@@ -426,13 +427,22 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
* nsec = base_mono + now();
* ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
*/
- nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
- nsec *= NSEC_PER_SEC;
- nsec += tk->wall_to_monotonic.tv_nsec;
- tk->tkr.base_mono = ns_to_ktime(nsec);
+ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
+ nsec = (u32) tk->wall_to_monotonic.tv_nsec;
+ tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
/* Update the monotonic raw base */
tk->base_raw = timespec64_to_ktime(tk->raw_time);
+
+ /*
+ * The sum of the nanoseconds portions of xtime and
+ * wall_to_monotonic can be greater/equal one second. Take
+ * this into account before updating tk->ktime_sec.
+ */
+ nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+ if (nsec >= NSEC_PER_SEC)
+ seconds++;
+ tk->ktime_sec = seconds;
}
/* must hold timekeeper_lock */
@@ -519,9 +529,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
/**
* getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts: pointer to the timespec to be set
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the time of day in a timespec (WARN if suspended).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
void getnstimeofday64(struct timespec64 *ts)
{
@@ -623,7 +633,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
+ * in normalized timespec64 format in the variable pointed to by @ts.
*/
void ktime_get_ts64(struct timespec64 *ts)
{
@@ -648,6 +658,54 @@ void ktime_get_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);
+/**
+ * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
+ *
+ * Returns the seconds portion of CLOCK_MONOTONIC with a single non
+ * serialized read. tk->ktime_sec is of type 'unsigned long' so this
+ * works on both 32 and 64 bit systems. On 32 bit systems the readout
+ * covers ~136 years of uptime which should be enough to prevent
+ * premature wrap arounds.
+ */
+time64_t ktime_get_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ WARN_ON(timekeeping_suspended);
+ return tk->ktime_sec;
+}
+EXPORT_SYMBOL_GPL(ktime_get_seconds);
+
+/**
+ * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
+ *
+ * Returns the wall clock seconds since 1970. This replaces the
+ * get_seconds() interface which is not y2038 safe on 32bit systems.
+ *
+ * For 64bit systems the fast access to tk->xtime_sec is preserved. On
+ * 32bit systems the access must be protected with the sequence
+ * counter to provide "atomic" access to the 64bit tk->xtime_sec
+ * value.
+ */
+time64_t ktime_get_real_seconds(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ time64_t seconds;
+ unsigned int seq;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ return tk->xtime_sec;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ seconds = tk->xtime_sec;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return seconds;
+}
+EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
+
#ifdef CONFIG_NTP_PPS
/**
@@ -703,18 +761,18 @@ void do_gettimeofday(struct timeval *tv)
EXPORT_SYMBOL(do_gettimeofday);
/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
+ * do_settimeofday64 - Sets the time of day.
+ * @ts: pointer to the timespec64 variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(const struct timespec *tv)
+int do_settimeofday64(const struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts_delta, xt, tmp;
+ struct timespec64 ts_delta, xt;
unsigned long flags;
- if (!timespec_valid_strict(tv))
+ if (!timespec64_valid_strict(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +781,12 @@ int do_settimeofday(const struct timespec *tv)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
+ ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
- tmp = timespec_to_timespec64(*tv);
- tk_set_xtime(tk, &tmp);
+ tk_set_xtime(tk, ts);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -741,7 +798,7 @@ int do_settimeofday(const struct timespec *tv)
return 0;
}
-EXPORT_SYMBOL(do_settimeofday);
+EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -895,12 +952,12 @@ int timekeeping_notify(struct clocksource *clock)
}
/**
- * getrawmonotonic - Returns the raw monotonic time in a timespec
- * @ts: pointer to the timespec to be set
+ * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic(struct timespec *ts)
+void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts64;
@@ -915,9 +972,10 @@ void getrawmonotonic(struct timespec *ts)
} while (read_seqcount_retry(&tk_core.seq, seq));
timespec64_add_ns(&ts64, nsecs);
- *ts = timespec64_to_timespec(ts64);
+ *ts = ts64;
}
-EXPORT_SYMBOL(getrawmonotonic);
+EXPORT_SYMBOL(getrawmonotonic64);
+
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1126,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
/**
- * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
- * @delta: pointer to a timespec delta value
+ * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
+ * @delta: pointer to a timespec64 delta value
*
* This hook is for architectures that cannot support read_persistent_clock
* because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1135,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
*/
-void timekeeping_inject_sleeptime(struct timespec *delta)
+void timekeeping_inject_sleeptime64(struct timespec64 *delta)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 tmp;
unsigned long flags;
/*
@@ -1095,8 +1152,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
timekeeping_forward_now(tk);
- tmp = timespec_to_timespec64(*delta);
- __timekeeping_inject_sleeptime(tk, &tmp);
+ __timekeeping_inject_sleeptime(tk, delta);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1332,6 +1388,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
+ if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+ /* NTP adjustment caused clocksource mult overflow */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
tk->tkr.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr.xtime_nsec -= offset;
@@ -1397,7 +1459,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
if (unlikely(tk->tkr.clock->maxadj &&
- (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ (abs(tk->tkr.mult - tk->tkr.clock->mult)
+ > tk->tkr.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
tk->tkr.clock->name, (long)tk->tkr.mult,
@@ -1646,7 +1709,7 @@ struct timespec current_kernel_time(void)
}
EXPORT_SYMBOL(current_kernel_time);
-struct timespec get_monotonic_coarse(void)
+struct timespec64 get_monotonic_coarse64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
@@ -1662,7 +1725,7 @@ struct timespec get_monotonic_coarse(void)
set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
- return timespec64_to_timespec(now);
+ return now;
}
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3260ffdb368f..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now)
void update_process_times(int user_tick)
{
struct task_struct *p = current;
- int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(cpu, user_tick);
+ rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c1bd4ada2a04..11b9cb36092b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent,
r->sector_from = be64_to_cpu(sector_from);
}
-typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act);
-static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
+static void blk_log_action_classic(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
unsigned long long ts = iter->ts;
@@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq,
- "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
- MAJOR(t->device), MINOR(t->device), iter->cpu,
- secs, nsec_rem, iter->ent->pid, act, rwbs);
+ trace_seq_printf(&iter->seq,
+ "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), iter->cpu,
+ secs, nsec_rem, iter->ent->pid, act, rwbs);
}
-static int blk_log_action(struct trace_iterator *iter, const char *act)
+static void blk_log_action(struct trace_iterator *iter, const char *act)
{
char rwbs[RWBS_LEN];
const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
- return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
- MAJOR(t->device), MINOR(t->device), act, rwbs);
+ trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
+ MAJOR(t->device), MINOR(t->device), act, rwbs);
}
-static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
{
const unsigned char *pdu_buf;
int pdu_len;
- int i, end, ret;
+ int i, end;
pdu_buf = pdu_start(ent);
pdu_len = te_blk_io_trace(ent)->pdu_len;
if (!pdu_len)
- return 1;
+ return;
/* find the last zero that needs to be printed */
for (end = pdu_len - 1; end >= 0; end--)
@@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
break;
end++;
- if (!trace_seq_putc(s, '('))
- return 0;
+ trace_seq_putc(s, '(');
for (i = 0; i < pdu_len; i++) {
- ret = trace_seq_printf(s, "%s%02x",
- i == 0 ? "" : " ", pdu_buf[i]);
- if (!ret)
- return ret;
+ trace_seq_printf(s, "%s%02x",
+ i == 0 ? "" : " ", pdu_buf[i]);
/*
* stop when the rest is just zeroes and indicate so
* with a ".." appended
*/
- if (i == end && end != pdu_len - 1)
- return trace_seq_puts(s, " ..) ");
+ if (i == end && end != pdu_len - 1) {
+ trace_seq_puts(s, " ..) ");
+ return;
+ }
}
- return trace_seq_puts(s, ") ");
+ trace_seq_puts(s, ") ");
}
-static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = trace_seq_printf(s, "%u ", t_bytes(ent));
- if (!ret)
- return 0;
- ret = blk_log_dump_pdu(s, ent);
- if (!ret)
- return 0;
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "%u ", t_bytes(ent));
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%s]\n", cmd);
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%s]\n",
+ trace_seq_printf(s, "%llu + %u [%s]\n",
t_sector(ent), t_sec(ent), cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ else
+ trace_seq_printf(s, "[%s]\n", cmd);
}
}
-static int blk_log_with_error(struct trace_seq *s,
+static void blk_log_with_error(struct trace_seq *s,
const struct trace_entry *ent)
{
if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
- int ret;
-
- ret = blk_log_dump_pdu(s, ent);
- if (ret)
- return trace_seq_printf(s, "[%d]\n", t_error(ent));
- return 0;
+ blk_log_dump_pdu(s, ent);
+ trace_seq_printf(s, "[%d]\n", t_error(ent));
} else {
if (t_sec(ent))
- return trace_seq_printf(s, "%llu + %u [%d]\n",
- t_sector(ent),
- t_sec(ent), t_error(ent));
- return trace_seq_printf(s, "%llu [%d]\n",
- t_sector(ent), t_error(ent));
+ trace_seq_printf(s, "%llu + %u [%d]\n",
+ t_sector(ent),
+ t_sec(ent), t_error(ent));
+ else
+ trace_seq_printf(s, "%llu [%d]\n",
+ t_sector(ent), t_error(ent));
}
}
-static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
{
struct blk_io_trace_remap r = { .device_from = 0, };
get_pdu_remap(ent, &r);
- return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
- t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+ t_sector(ent), t_sec(ent),
+ MAJOR(r.device_from), MINOR(r.device_from),
+ (unsigned long long)r.sector_from);
}
-static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s]\n", cmd);
+ trace_seq_printf(s, "[%s]\n", cmd);
}
-static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
+ trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
}
-static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
{
char cmd[TASK_COMM_LEN];
trace_find_cmdline(ent->pid, cmd);
- return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
- get_pdu_int(ent), cmd);
+ trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+ get_pdu_int(ent), cmd);
}
-static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
{
- int ret;
const struct blk_io_trace *t = te_blk_io_trace(ent);
- ret = trace_seq_putmem(s, t + 1, t->pdu_len);
- if (ret)
- return trace_seq_putc(s, '\n');
- return ret;
+ trace_seq_putmem(s, t + 1, t->pdu_len);
+ trace_seq_putc(s, '\n');
}
/*
@@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr)
static const struct {
const char *act[2];
- int (*print)(struct trace_seq *s, const struct trace_entry *ent);
+ void (*print)(struct trace_seq *s, const struct trace_entry *ent);
} what2act[] = {
[__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
[__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
@@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
- int ret;
bool long_act;
blk_log_action_t *log_action;
@@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
- ret = log_action(iter, long_act ? "message" : "m");
- if (ret)
- ret = blk_log_msg(s, iter->ent);
- goto out;
+ log_action(iter, long_act ? "message" : "m");
+ blk_log_msg(s, iter->ent);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
- ret = trace_seq_printf(s, "Unknown action %x\n", what);
+ trace_seq_printf(s, "Unknown action %x\n", what);
else {
- ret = log_action(iter, what2act[what].act[long_act]);
- if (ret)
- ret = what2act[what].print(s, iter->ent);
+ log_action(iter, what2act[what].act[long_act]);
+ what2act[what].print(s, iter->ent);
}
-out:
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
@@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
return print_one_line(iter, false);
}
-static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
.time = iter->ts,
};
- if (!trace_seq_putmem(s, &old, offset))
- return 0;
- return trace_seq_putmem(s, &t->sector,
- sizeof(old) - offset + t->pdu_len);
+ trace_seq_putmem(s, &old, offset);
+ trace_seq_putmem(s, &t->sector,
+ sizeof(old) - offset + t->pdu_len);
}
static enum print_line_t
blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- return blk_trace_synthesize_old_trace(iter) ?
- TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ blk_trace_synthesize_old_trace(iter);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31c90fec4158..929a733d302e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
return ret;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops);
+
static int __register_ftrace_function(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_DELETED)
@@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if (control_ops_alloc(ops))
return -ENOMEM;
add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
+ /* The control_ops needs the trampoline update */
+ ops = &control_ops;
} else
add_ftrace_ops(&ftrace_ops_list, ops);
+ ftrace_update_trampoline(ops);
+
if (ftrace_enabled)
update_ftrace_function();
@@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2)
static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " Function "
- "Hit Time Avg s^2\n"
- " -------- "
- "--- ---- --- ---\n");
+ seq_puts(m, " Function "
+ "Hit Time Avg s^2\n"
+ " -------- "
+ "--- ---- --- ---\n");
#else
- seq_printf(m, " Function Hit\n"
- " -------- ---\n");
+ seq_puts(m, " Function Hit\n"
+ " -------- ---\n");
#endif
return 0;
}
@@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v)
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- seq_printf(m, " ");
+ seq_puts(m, " ");
avg = rec->time;
do_div(avg, rec->counter);
@@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = {
FTRACE_OPS_FL_INITIALIZED,
};
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+ struct ftrace_ops *op;
+ bool ret = false;
+
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they are freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /*
+ * This is to check for dynamically allocated trampolines.
+ * Trampolines that are in kernel text will have
+ * core_kernel_text() return true.
+ */
+ if (op->trampoline && op->trampoline_size)
+ if (addr >= op->trampoline &&
+ addr < op->trampoline + op->trampoline_size) {
+ ret = true;
+ goto out;
+ }
+ } while_for_each_ftrace_op(op);
+
+ out:
+ preempt_enable_notrace();
+
+ return ret;
+}
+
struct ftrace_page {
struct ftrace_page *next;
struct dyn_ftrace *records;
@@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
static void
ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash);
+
static int
ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash **dst, struct ftrace_hash *src)
@@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
+ int ret;
int i;
+ /* Reject setting notrace hash on IPMODIFY ftrace_ops */
+ if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable)
+ return -EINVAL;
+
/*
* If the new source is empty, just free dst and assign it
* the empty_hash.
@@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
update:
+ /* Make sure this can be applied if it is IPMODIFY ftrace_ops */
+ if (enable) {
+ /* IPMODIFY should be updated only when filter_hash updating */
+ ret = ftrace_hash_ipmodify_update(ops, new_hash);
+ if (ret < 0) {
+ free_ftrace_hash(new_hash);
+ return ret;
+ }
+ }
+
/*
* Remove the current set, update the hash and add
* them back.
@@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
ftrace_hash_rec_update_modify(ops, filter_hash, 1);
}
+/*
+ * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
+ * or no-needed to update, -EBUSY if it detects a conflict of the flag
+ * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs.
+ * Note that old_hash and new_hash has below meanings
+ * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
+ * - If the hash is EMPTY_HASH, it hits nothing
+ * - Anything else hits the recs which match the hash entries.
+ */
+static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec, *end = NULL;
+ int in_old, in_new;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ /*
+ * Since the IPMODIFY is a very address sensitive action, we do not
+ * allow ftrace_ops to set all functions to new hash.
+ */
+ if (!new_hash || !old_hash)
+ return -EINVAL;
+
+ /* Update rec->flags */
+ do_for_each_ftrace_rec(pg, rec) {
+ /* We need to update only differences of filter_hash */
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new) {
+ /* New entries must ensure no others are using it */
+ if (rec->flags & FTRACE_FL_IPMODIFY)
+ goto rollback;
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } else /* Removed entry */
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+ return 0;
+
+rollback:
+ end = rec;
+
+ /* Roll back what we did above */
+ do_for_each_ftrace_rec(pg, rec) {
+ if (rec == end)
+ goto err_out;
+
+ in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
+ in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
+ if (in_old == in_new)
+ continue;
+
+ if (in_new)
+ rec->flags &= ~FTRACE_FL_IPMODIFY;
+ else
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ } while_for_each_ftrace_rec();
+
+err_out:
+ return -EBUSY;
+}
+
+static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash);
+}
+
+/* Disabling always succeeds */
+static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops)
+{
+ struct ftrace_hash *hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(hash))
+ hash = NULL;
+
+ __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH);
+}
+
+static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
+ struct ftrace_hash *new_hash)
+{
+ struct ftrace_hash *old_hash = ops->func_hash->filter_hash;
+
+ if (ftrace_hash_empty(old_hash))
+ old_hash = NULL;
+
+ if (ftrace_hash_empty(new_hash))
+ new_hash = NULL;
+
+ return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash);
+}
+
static void print_ip_ins(const char *fmt, unsigned char *p)
{
int i;
@@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
+
/**
* ftrace_bug - report and shutdown function tracer
* @failed: The failed type (EFAULT, EINVAL, EPERM)
- * @ip: The address that failed
+ * @rec: The record that failed
*
* The arch code that enables or disables the function tracing
* can call ftrace_bug() when it has detected a problem in
@@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
* EINVAL - if what is read at @ip is not what was expected
* EPERM - if the problem happens on writting to the @ip address
*/
-void ftrace_bug(int failed, unsigned long ip)
+void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
+ unsigned long ip = rec ? rec->ip : 0;
+
switch (failed) {
case -EFAULT:
FTRACE_WARN_ON_ONCE(1);
@@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace failed to modify ");
print_ip_sym(ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
- printk(KERN_CONT "\n");
+ pr_cont("\n");
break;
case -EPERM:
FTRACE_WARN_ON_ONCE(1);
@@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip)
pr_info("ftrace faulted on unknown error ");
print_ip_sym(ip);
}
+ if (rec) {
+ struct ftrace_ops *ops = NULL;
+
+ pr_info("ftrace record flags: %lx\n", rec->flags);
+ pr_cont(" (%ld)%s", ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
+ pr_cont("\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ pr_cont("\ttramp: ERROR!");
+
+ }
+ ip = ftrace_get_addr_curr(rec);
+ pr_cont(" expected tramp: %lx\n", ip);
+ }
}
static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
@@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable)
do_for_each_ftrace_rec(pg, rec) {
failed = __ftrace_replace_code(rec, enable);
if (failed) {
- ftrace_bug(failed, rec->ip);
+ ftrace_bug(failed, rec);
/* Stop processing */
return;
}
@@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
- unsigned long ip;
int ret;
- ip = rec->ip;
-
if (unlikely(ftrace_disabled))
return 0;
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
- ftrace_bug(ret, ip);
+ ftrace_bug(ret, rec);
return 0;
}
return 1;
@@ -2320,6 +2509,10 @@ static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
+void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+}
+
static void control_ops_free(struct ftrace_ops *ops)
{
free_percpu(ops->disabled);
@@ -2369,6 +2562,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
*/
ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
+ ret = ftrace_hash_ipmodify_enable(ops);
+ if (ret < 0) {
+ /* Rollback registration process */
+ __unregister_ftrace_function(ops);
+ ftrace_start_up--;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ return ret;
+ }
+
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
@@ -2397,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
+ /* Disabling ipmodify never fails */
+ ftrace_hash_ipmodify_disable(ops);
ftrace_hash_rec_disable(ops, 1);
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
@@ -2471,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
schedule_on_each_cpu(ftrace_sync);
+ arch_ftrace_trampoline_free(ops);
+
if (ops->flags & FTRACE_OPS_FL_CONTROL)
control_ops_free(ops);
}
@@ -2623,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
- ftrace_bug(failed, p->ip);
+ ftrace_bug(failed, p);
}
}
}
@@ -2948,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&ftrace_lock);
}
+void * __weak
+arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return NULL;
+}
+
+static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ void *ptr;
+
+ ptr = arch_ftrace_trampoline_func(ops, rec);
+ if (ptr)
+ seq_printf(m, " ->%pS", ptr);
+}
+
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_iterator *iter = m->private;
@@ -2958,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
else
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2971,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED) {
- seq_printf(m, " (%ld)%s",
+ struct ftrace_ops *ops = NULL;
+
+ seq_printf(m, " (%ld)%s%s",
ftrace_rec_count(rec),
- rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ rec->flags & FTRACE_FL_REGS ? " R" : " ",
+ rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ");
if (rec->flags & FTRACE_FL_TRAMP_EN) {
- struct ftrace_ops *ops;
-
ops = ftrace_find_tramp_ops_any(rec);
if (ops)
seq_printf(m, "\ttramp: %pS",
(void *)ops->trampoline);
else
- seq_printf(m, "\ttramp: ERROR!");
+ seq_puts(m, "\ttramp: ERROR!");
+
}
+ add_trampoline_func(m, ops, rec);
}
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -3020,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
if (iter) {
iter->pg = ftrace_pages_start;
@@ -3975,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
+static unsigned long save_global_trampoline;
+static unsigned long save_global_flags;
+
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -4183,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v)
struct ftrace_graph_data *fgd = m->private;
if (fgd->table == ftrace_graph_funcs)
- seq_printf(m, "#### all functions enabled ####\n");
+ seq_puts(m, "#### all functions enabled ####\n");
else
- seq_printf(m, "#### no functions disabled ####\n");
+ seq_puts(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4696,6 +4921,32 @@ void __init ftrace_init(void)
ftrace_disabled = 1;
}
+/* Do nothing if arch does not support this */
+void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+
+/*
+ * Currently there's no safe way to free a trampoline when the kernel
+ * is configured with PREEMPT. That is because a task could be preempted
+ * when it jumped to the trampoline, it may be preempted for a long time
+ * depending on the system load, and currently there's no way to know
+ * when it will be off the trampoline. If the trampoline is freed
+ * too early, when the task runs again, it will be executing on freed
+ * memory and crash.
+ */
+#ifdef CONFIG_PREEMPT
+ /* Currently, only non dynamic ops can have a trampoline */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ return;
+#endif
+
+ arch_ftrace_update_trampoline(ops);
+}
+
#else
static struct ftrace_ops global_ops = {
@@ -4738,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 1;
}
+static void ftrace_update_trampoline(struct ftrace_ops *ops)
+{
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE */
__init void ftrace_init_global_array_ops(struct trace_array *tr)
@@ -5075,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v)
const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
if (v == (void *)1) {
- seq_printf(m, "no pid\n");
+ seq_puts(m, "no pid\n");
return 0;
}
if (fpid->pid == ftrace_swapper_pid)
- seq_printf(m, "swapper tasks\n");
+ seq_puts(m, "swapper tasks\n");
else
seq_printf(m, "%u\n", pid_vnr(fpid->pid));
@@ -5293,6 +5548,7 @@ static struct ftrace_ops graph_ops = {
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
+ /* trampoline_size is only needed for dynamically allocated tramps */
#endif
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
@@ -5522,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
update_function_graph_func();
ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
-
out:
mutex_unlock(&ftrace_lock);
return ret;
@@ -5543,6 +5798,17 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * Function graph does not allocate the trampoline, but
+ * other global_ops do. We need to reset the ALLOC_TRAMP flag
+ * if one was used.
+ */
+ global_ops.trampoline = save_global_trampoline;
+ if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
+ global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
+#endif
+
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a56e07c8d15b..7a4104cb95cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work);
*/
int ring_buffer_print_entry_header(struct trace_seq *s)
{
- int ret;
-
- ret = trace_seq_puts(s, "# compressed entry header\n");
- ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
- ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
- ret = trace_seq_puts(s, "\tarray : 32 bits\n");
- ret = trace_seq_putc(s, '\n');
- ret = trace_seq_printf(s, "\tpadding : type == %d\n",
- RINGBUF_TYPE_PADDING);
- ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
- RINGBUF_TYPE_TIME_EXTEND);
- ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
- RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+ trace_seq_puts(s, "# compressed entry header\n");
+ trace_seq_puts(s, "\ttype_len : 5 bits\n");
+ trace_seq_puts(s, "\ttime_delta : 27 bits\n");
+ trace_seq_puts(s, "\tarray : 32 bits\n");
+ trace_seq_putc(s, '\n');
+ trace_seq_printf(s, "\tpadding : type == %d\n",
+ RINGBUF_TYPE_PADDING);
+ trace_seq_printf(s, "\ttime_extend : type == %d\n",
+ RINGBUF_TYPE_TIME_EXTEND);
+ trace_seq_printf(s, "\tdata max type_len == %d\n",
+ RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
/*
@@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta)
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
- int ret;
-
- ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- ret = trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
- return ret;
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
+
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)BUF_PAGE_SIZE,
+ (unsigned int)is_signed_type(char));
+
+ return !trace_seq_has_overflowed(s);
}
struct rb_irq_work {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 92f4a6cee172..1af4f8f2ab5d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -155,10 +155,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
static int __init stop_trace_on_warning(char *str)
{
- __disable_trace_on_warning = 1;
+ if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
+ __disable_trace_on_warning = 1;
return 1;
}
-__setup("traceoff_on_warning=", stop_trace_on_warning);
+__setup("traceoff_on_warning", stop_trace_on_warning);
static int __init boot_alloc_snapshot(char *str)
{
@@ -938,19 +939,20 @@ out:
return ret;
}
+/* TODO add a seq_buf_to_buffer() */
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
- if (s->len <= s->readpos)
+ if (trace_seq_used(s) <= s->seq.readpos)
return -EBUSY;
- len = s->len - s->readpos;
+ len = trace_seq_used(s) - s->seq.readpos;
if (cnt > len)
cnt = len;
- memcpy(buf, s->buffer + s->readpos, cnt);
+ memcpy(buf, s->buffer + s->seq.readpos, cnt);
- s->readpos += cnt;
+ s->seq.readpos += cnt;
return cnt;
}
@@ -2158,9 +2160,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
goto out;
}
- len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
- if (len > TRACE_BUF_SIZE)
- goto out;
+ len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
@@ -2171,8 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
entry = ring_buffer_event_data(event);
entry->ip = ip;
- memcpy(&entry->buf, tbuffer, len);
- entry->buf[len] = '\0';
+ memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(buffer, flags, 6, pc);
@@ -2509,14 +2508,14 @@ get_total_entries(struct trace_buffer *buf,
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n");
- seq_puts(m, "# / _-----=> irqs-off \n");
- seq_puts(m, "# | / _----=> need-resched \n");
- seq_puts(m, "# || / _---=> hardirq/softirq \n");
- seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| / delay \n");
- seq_puts(m, "# cmd pid ||||| time | caller \n");
- seq_puts(m, "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
}
static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -2533,20 +2532,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | | |\n");
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
+ "# | | | | |\n");
}
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n");
- seq_puts(m, "# / _----=> need-resched\n");
- seq_puts(m, "# | / _---=> hardirq/softirq\n");
- seq_puts(m, "# || / _--=> preempt-depth\n");
- seq_puts(m, "# ||| / delay\n");
- seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
- seq_puts(m, "# | | | |||| | |\n");
+ seq_puts(m, "# _-----=> irqs-off\n"
+ "# / _----=> need-resched\n"
+ "# | / _---=> hardirq/softirq\n"
+ "# || / _--=> preempt-depth\n"
+ "# ||| / delay\n"
+ "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
+ "# | | | |||| | |\n");
}
void
@@ -2649,24 +2648,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- if (!trace_print_lat_context(iter))
- goto partial;
- } else {
- if (!trace_print_context(iter))
- goto partial;
- }
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ trace_print_lat_context(iter);
+ else
+ trace_print_context(iter);
}
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
if (event)
return event->funcs->trace(iter, sym_flags, event);
- if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "Unknown type %d\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -2677,22 +2673,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- if (!trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, iter->ts))
- goto partial;
- }
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ trace_seq_printf(s, "%d %d %llu ",
+ entry->pid, iter->cpu, iter->ts);
+
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
event = ftrace_find_event(entry->type);
if (event)
return event->funcs->raw(iter, 0, event);
- if (!trace_seq_printf(s, "%d ?\n", entry->type))
- goto partial;
+ trace_seq_printf(s, "%d ?\n", entry->type);
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -2705,9 +2699,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
- SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+ SEQ_PUT_HEX_FIELD(s, entry->pid);
+ SEQ_PUT_HEX_FIELD(s, iter->cpu);
+ SEQ_PUT_HEX_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2717,9 +2713,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
return ret;
}
- SEQ_PUT_FIELD_RET(s, newline);
+ SEQ_PUT_FIELD(s, newline);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
@@ -2731,9 +2727,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
entry = iter->ent;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
- SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
- SEQ_PUT_FIELD_RET(s, iter->ts);
+ SEQ_PUT_FIELD(s, entry->pid);
+ SEQ_PUT_FIELD(s, iter->cpu);
+ SEQ_PUT_FIELD(s, iter->ts);
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
}
event = ftrace_find_event(entry->type);
@@ -2779,10 +2777,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
- if (iter->lost_events &&
- !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events))
- return TRACE_TYPE_PARTIAL_LINE;
+ if (iter->lost_events) {
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
@@ -2860,44 +2860,44 @@ static void test_ftrace_alive(struct seq_file *m)
{
if (!ftrace_is_dead())
return;
- seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
+ seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"
+ "# MAY BE MISSING FUNCTION EVENTS\n");
}
#ifdef CONFIG_TRACER_MAX_TRACE
static void show_snapshot_main_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer.\n");
- seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"
+ "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer.\n"
+ "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void show_snapshot_percpu_help(struct seq_file *m)
{
- seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+ seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
- seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+ seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"
+ "# Takes a snapshot of the main buffer for this cpu.\n");
#else
- seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
- seq_printf(m, "# Must use main snapshot file to allocate.\n");
+ seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n"
+ "# Must use main snapshot file to allocate.\n");
#endif
- seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
- seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
- seq_printf(m, "# is not a '0' or '1')\n");
+ seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"
+ "# (Doesn't have to be '2' works with any number that\n"
+ "# is not a '0' or '1')\n");
}
static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
{
if (iter->tr->allocated_snapshot)
- seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is allocated *\n#\n");
else
- seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+ seq_puts(m, "#\n# * Snapshot is freed *\n#\n");
- seq_printf(m, "# Snapshot commands:\n");
+ seq_puts(m, "# Snapshot commands:\n");
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
show_snapshot_main_help(m);
else
@@ -3251,7 +3251,7 @@ static int t_show(struct seq_file *m, void *v)
if (!t)
return 0;
- seq_printf(m, "%s", t->name);
+ seq_puts(m, t->name);
if (t->next)
seq_putc(m, ' ');
else
@@ -4314,6 +4314,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
goto out;
}
+ trace_seq_init(&iter->seq);
+
/*
* We make a copy of the current tracer to avoid concurrent
* changes on it while we are reading.
@@ -4507,18 +4509,18 @@ waitagain:
trace_access_lock(iter->cpu_file);
while (trace_find_next_entry_inc(iter) != NULL) {
enum print_line_t ret;
- int len = iter->seq.len;
+ int save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
if (ret == TRACE_TYPE_PARTIAL_LINE) {
/* don't print partial lines */
- iter->seq.len = len;
+ iter->seq.seq.len = save_len;
break;
}
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(iter);
- if (iter->seq.len >= cnt)
+ if (trace_seq_used(&iter->seq) >= cnt)
break;
/*
@@ -4534,7 +4536,7 @@ waitagain:
/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (iter->seq.readpos >= iter->seq.len)
+ if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq))
trace_seq_init(&iter->seq);
/*
@@ -4568,20 +4570,33 @@ static size_t
tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
{
size_t count;
+ int save_len;
int ret;
/* Seq buffer is page-sized, exactly what we need. */
for (;;) {
- count = iter->seq.len;
+ save_len = iter->seq.seq.len;
ret = print_trace_line(iter);
- count = iter->seq.len - count;
- if (rem < count) {
- rem = 0;
- iter->seq.len -= count;
+
+ if (trace_seq_has_overflowed(&iter->seq)) {
+ iter->seq.seq.len = save_len;
break;
}
+
+ /*
+ * This should not be hit, because it should only
+ * be set if the iter->seq overflowed. But check it
+ * anyway to be safe.
+ */
if (ret == TRACE_TYPE_PARTIAL_LINE) {
- iter->seq.len -= count;
+ iter->seq.seq.len = save_len;
+ break;
+ }
+
+ count = trace_seq_used(&iter->seq) - save_len;
+ if (rem < count) {
+ rem = 0;
+ iter->seq.seq.len = save_len;
break;
}
@@ -4662,13 +4677,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
page_address(spd.pages[i]),
- iter->seq.len);
+ trace_seq_used(&iter->seq));
if (ret < 0) {
__free_page(spd.pages[i]);
break;
}
spd.partial[i].offset = 0;
- spd.partial[i].len = iter->seq.len;
+ spd.partial[i].len = trace_seq_used(&iter->seq);
trace_seq_init(&iter->seq);
}
@@ -5668,7 +5683,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "read events: %ld\n", cnt);
- count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+ count = simple_read_from_buffer(ubuf, count, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -5749,10 +5765,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
seq_printf(m, "%ps:", (void *)ip);
- seq_printf(m, "snapshot");
+ seq_puts(m, "snapshot");
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
@@ -6417,7 +6433,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6444,7 +6460,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry)
int ret;
/* Paranoid: Make sure the parent is the "instances" directory */
- parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
if (WARN_ON_ONCE(parent != trace_instance_dir))
return -ENOENT;
@@ -6631,11 +6647,19 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= TRACE_MAX_PRINT)
- s->len = TRACE_MAX_PRINT;
+ if (s->seq.len >= TRACE_MAX_PRINT)
+ s->seq.len = TRACE_MAX_PRINT;
+
+ /*
+ * More paranoid code. Although the buffer size is set to
+ * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just
+ * an extra layer of protection.
+ */
+ if (WARN_ON_ONCE(s->seq.len >= s->seq.size))
+ s->seq.len = s->seq.size - 1;
/* should be zero ended, but we are paranoid. */
- s->buffer[s->len] = 0;
+ s->buffer[s->seq.len] = 0;
printk(KERN_TRACE "%s", s->buffer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391fb1d3b..3255dfb054a0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -14,6 +14,7 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
#include <linux/compiler.h>
+#include <linux/trace_seq.h>
#ifdef CONFIG_FTRACE_SYSCALLS
#include <asm/unistd.h> /* For NR_SYSCALLS */
@@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
void tracing_iter_reset(struct trace_iterator *iter, int cpu);
-void tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc);
-
-void tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *cur,
- unsigned long flags, int pc);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
-void tracing_sched_switch_assign_trace(struct trace_array *tr);
-void tracing_stop_sched_switch_record(void);
-void tracing_start_sched_switch_record(void);
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
+extern char trace_find_mark(unsigned long long duration);
+
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -737,7 +728,7 @@ extern unsigned long trace_flags;
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
-extern enum print_line_t
+extern void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
extern void graph_trace_open(struct trace_iterator *iter);
extern void graph_trace_close(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 697fb9bac8f0..7d6e2afde669 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
- field->correct ? " ok " : " MISS ",
- field->func,
- field->file,
- field->line))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
+ field->correct ? " ok " : " MISS ",
+ field->func,
+ field->file,
+ field->line);
+
+ return trace_handle_return(&iter->seq);
}
static void branch_print_header(struct seq_file *s)
{
seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
- " FUNC:FILE:LINE\n");
- seq_puts(s, "# | | | | | "
- " |\n");
+ " FUNC:FILE:LINE\n"
+ "# | | | | | "
+ " |\n");
}
static struct trace_event_functions trace_branch_funcs = {
@@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[];
static int annotated_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " correct incorrect %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " correct incorrect % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
@@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
- seq_printf(m, " X ");
+ seq_puts(m, " X ");
else
seq_printf(m, "%3ld ", percent);
seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
@@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[];
static int all_branch_stat_headers(struct seq_file *m)
{
- seq_printf(m, " miss hit %% ");
- seq_printf(m, " Function "
- " File Line\n"
- " ------- --------- - "
- " -------- "
- " ---- ----\n");
+ seq_puts(m, " miss hit % "
+ " Function "
+ " File Line\n"
+ " ------- --------- - "
+ " -------- "
+ " ---- ----\n");
return 0;
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0cc51edde3a8..d0e4f92b5eb6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -461,7 +461,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
if (dir) {
spin_lock(&dir->d_lock); /* probably unneeded */
- list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ list_for_each_entry(child, &dir->d_subdirs, d_child) {
if (child->d_inode) /* probably unneeded */
child->d_inode->i_private = NULL;
}
@@ -918,7 +918,7 @@ static int f_show(struct seq_file *m, void *v)
case FORMAT_HEADER:
seq_printf(m, "name: %s\n", ftrace_event_name(call));
seq_printf(m, "ID: %d\n", call->event.type);
- seq_printf(m, "format:\n");
+ seq_puts(m, "format:\n");
return 0;
case FORMAT_FIELD_SEPERATOR:
@@ -1044,7 +1044,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_unlock(&event_mutex);
if (file)
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1210,7 +1211,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
print_subsystem_event_filter(system, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1265,7 +1267,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
func(s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
kfree(s);
@@ -1988,7 +1991,7 @@ event_enable_print(struct seq_file *m, unsigned long ip,
ftrace_event_name(data->file->event_call));
if (data->count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", data->count);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7a8c1528e141..ced69da0ff55 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -45,6 +45,7 @@ enum filter_op_ids
OP_GT,
OP_GE,
OP_BAND,
+ OP_NOT,
OP_NONE,
OP_OPEN_PAREN,
};
@@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = {
{ OP_GT, ">", 5 },
{ OP_GE, ">=", 5 },
{ OP_BAND, "&", 6 },
+ { OP_NOT, "!", 6 },
{ OP_NONE, "OP_NONE", 0 },
{ OP_OPEN_PAREN, "(", 0 },
};
@@ -85,6 +87,7 @@ enum {
FILT_ERR_MISSING_FIELD,
FILT_ERR_INVALID_FILTER,
FILT_ERR_IP_FIELD_ONLY,
+ FILT_ERR_ILLEGAL_NOT_OP,
};
static char *err_text[] = {
@@ -101,6 +104,7 @@ static char *err_text[] = {
"Missing field name and/or value",
"Meaningless filter expression",
"Only 'ip' field is supported for function trace",
+ "Illegal use of '!'",
};
struct opstack_op {
@@ -139,6 +143,7 @@ struct pred_stack {
int index;
};
+/* If not of not match is equal to not of not, then it is a match */
#define DEFINE_COMPARISON_PRED(type) \
static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
@@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
break; \
} \
\
- return match; \
+ return !!match == !pred->not; \
}
#define DEFINE_EQUALITY_PRED(size) \
@@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds,
if (!WARN_ON_ONCE(!pred->fn))
match = pred->fn(pred, rec);
if (!!match == type)
- return match;
+ break;
}
- return match;
+ /* If not of not match is equal to not of not, then it is a match */
+ return !!match == !op->not;
}
struct filter_match_preds_data {
@@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter,
* then this op can be folded.
*/
if (left->index & FILTER_PRED_FOLD &&
- (left->op == dest->op ||
+ ((left->op == dest->op && !left->not) ||
left->left == FILTER_PRED_INVALID) &&
right->index & FILTER_PRED_FOLD &&
- (right->op == dest->op ||
+ ((right->op == dest->op && !right->not) ||
right->left == FILTER_PRED_INVALID))
dest->index |= FILTER_PRED_FOLD;
@@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps,
}
if (pred->op == OP_NE)
- pred->not = 1;
+ pred->not ^= 1;
pred->fn = fn;
return 0;
@@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call,
continue;
}
+ if (elt->op == OP_NOT) {
+ if (!n_preds || operand1 || operand2) {
+ parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
+ err = -EINVAL;
+ goto fail;
+ }
+ if (!dry_run)
+ filter->preds[n_preds - 1].not ^= 1;
+ continue;
+ }
+
if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
err = -ENOSPC;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4747b476a030..8712df9decb4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m,
{
long count = (long)data;
- seq_printf(m, "%s", name);
+ seq_puts(m, name);
if (count == -1)
seq_puts(m, ":unlimited");
@@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m,
if (filter_str)
seq_printf(m, " if %s\n", filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
if (data->filter_str)
seq_printf(m, " if %s\n", data->filter_str);
else
- seq_puts(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 57f0ec962d2c..fcd41a166405 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static int update_count(void **data)
+static void update_traceon_count(void **data, bool on)
{
- unsigned long *count = (long *)data;
+ long *count = (long *)data;
+ long old_count = *count;
- if (!*count)
- return 0;
+ /*
+ * Tracing gets disabled (or enabled) once per count.
+ * This function can be called at the same time on multiple CPUs.
+ * It is fine if both disable (or enable) tracing, as disabling
+ * (or enabling) the second time doesn't do anything as the
+ * state of the tracer is already disabled (or enabled).
+ * What needs to be synchronized in this case is that the count
+ * only gets decremented once, even if the tracer is disabled
+ * (or enabled) twice, as the second one is really a nop.
+ *
+ * The memory barriers guarantee that we only decrement the
+ * counter once. First the count is read to a local variable
+ * and a read barrier is used to make sure that it is loaded
+ * before checking if the tracer is in the state we want.
+ * If the tracer is not in the state we want, then the count
+ * is guaranteed to be the old count.
+ *
+ * Next the tracer is set to the state we want (disabled or enabled)
+ * then a write memory barrier is used to make sure that
+ * the new state is visible before changing the counter by
+ * one minus the old counter. This guarantees that another CPU
+ * executing this code will see the new state before seeing
+ * the new counter value, and would not do anything if the new
+ * counter is seen.
+ *
+ * Note, there is no synchronization between this and a user
+ * setting the tracing_on file. But we currently don't care
+ * about that.
+ */
+ if (!old_count)
+ return;
- if (*count != -1)
- (*count)--;
+ /* Make sure we see count before checking tracing state */
+ smp_rmb();
- return 1;
+ if (on == !!tracing_is_on())
+ return;
+
+ if (on)
+ tracing_on();
+ else
+ tracing_off();
+
+ /* unlimited? */
+ if (old_count == -1)
+ return;
+
+ /* Make sure tracing state is visible before updating count */
+ smp_wmb();
+
+ *count = old_count - 1;
}
static void
ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_on();
+ update_traceon_count(data, 1);
}
static void
ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
-
- if (update_count(data))
- tracing_off();
+ update_traceon_count(data, 0);
}
static void
@@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
static void
ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- if (!tracing_is_on())
- return;
+ long *count = (long *)data;
+ long old_count;
+ long new_count;
- if (update_count(data))
- trace_dump_stack(STACK_SKIP);
+ /*
+ * Stack traces should only execute the number of times the
+ * user specified in the counter.
+ */
+ do {
+
+ if (!tracing_is_on())
+ return;
+
+ old_count = *count;
+
+ if (!old_count)
+ return;
+
+ /* unlimited? */
+ if (old_count == -1) {
+ trace_dump_stack(STACK_SKIP);
+ return;
+ }
+
+ new_count = old_count - 1;
+ new_count = cmpxchg(count, old_count, new_count);
+ if (new_count == old_count)
+ trace_dump_stack(STACK_SKIP);
+
+ } while (new_count != old_count);
+}
+
+static int update_count(void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return 0;
+
+ if (*count != -1)
+ (*count)--;
+
+ return 1;
}
static void
@@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m,
seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
- seq_printf(m, ":unlimited\n");
+ seq_puts(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld\n", count);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f0a0c982cde3..ba476009e5de 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -107,7 +107,7 @@ enum {
FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,
};
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags);
@@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr)
static int max_bytes_for_cpu;
-static enum print_line_t
-print_graph_cpu(struct trace_seq *s, int cpu)
+static void print_graph_cpu(struct trace_seq *s, int cpu)
{
- int ret;
-
/*
* Start with a space character - to make it stand out
* to the right a bit when trace output is pasted into
* email:
*/
- ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
}
#define TRACE_GRAPH_PROCINFO_LENGTH 14
-static enum print_line_t
-print_graph_proc(struct trace_seq *s, pid_t pid)
+static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
- int ret;
int len;
int i;
@@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
/* First spaces to align center */
- for (i = 0; i < spaces / 2; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < spaces / 2; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s-%s", comm, pid_str);
/* Last spaces to align center */
- for (i = 0; i < spaces - (spaces / 2); i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = 0; i < spaces - (spaces / 2); i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
-print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- if (!trace_seq_putc(s, ' '))
- return 0;
-
- return trace_print_lat_fmt(s, entry);
+ trace_seq_putc(s, ' ');
+ trace_print_lat_fmt(s, entry);
}
/* If the pid changed since the last trace, output this event */
-static enum print_line_t
+static void
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
- int ret;
if (!data)
- return TRACE_TYPE_HANDLED;
+ return;
last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
if (*last_pid == pid)
- return TRACE_TYPE_HANDLED;
+ return;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
- return TRACE_TYPE_HANDLED;
+ return;
/*
* Context-switch trace line:
@@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
------------------------------------------
*/
- ret = trace_seq_puts(s,
- " ------------------------------------------\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, prev_pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " => ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s,
- "\n ------------------------------------------\n\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " ------------------------------------------\n");
+ print_graph_cpu(s, cpu);
+ print_graph_proc(s, prev_pid);
+ trace_seq_puts(s, " => ");
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, "\n ------------------------------------------\n\n");
}
static struct ftrace_graph_ret_entry *
@@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter,
return next;
}
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
+static void print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
- return trace_seq_printf(s, "%5lu.%06lu | ",
- (unsigned long)t, usecs_rem);
+ trace_seq_printf(s, "%5lu.%06lu | ",
+ (unsigned long)t, usecs_rem);
}
-static enum print_line_t
+static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
- int ret;
struct trace_seq *s = &iter->seq;
+ struct trace_entry *ent = iter->ent;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
- return TRACE_TYPE_UNHANDLED;
+ return;
if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, pid);
+ trace_seq_puts(s, " | ");
}
+
+ /* Latency format */
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
}
/* No overhead */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
- ret = trace_seq_puts(s, "==========>");
+ trace_seq_puts(s, "==========>");
else
- ret = trace_seq_puts(s, "<==========");
-
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "<==========");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ trace_seq_putc(s, '\n');
}
-enum print_line_t
+void
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
{
unsigned long nsecs_rem = do_div(duration, 1000);
/* log10(ULONG_MAX) + '\0' */
- char msecs_str[21];
+ char usecs_str[21];
char nsecs_str[5];
- int ret, len;
+ int len;
int i;
- sprintf(msecs_str, "%lu", (unsigned long) duration);
+ sprintf(usecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
- ret = trace_seq_printf(s, "%s", msecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s", usecs_str);
- len = strlen(msecs_str);
+ len = strlen(usecs_str);
/* Print nsecs (we don't want to exceed 7 numbers) */
if (len < 7) {
size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len);
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
- ret = trace_seq_printf(s, ".%s", nsecs_str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, ".%s", nsecs_str);
len += strlen(nsecs_str);
}
- ret = trace_seq_puts(s, " us ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
- return TRACE_TYPE_HANDLED;
+ for (i = len; i < 7; i++)
+ trace_seq_putc(s, ' ');
}
-static enum print_line_t
+static void
print_graph_duration(unsigned long long duration, struct trace_seq *s,
u32 flags)
{
- int ret = -1;
-
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return TRACE_TYPE_HANDLED;
+ return;
/* No real adata, just filling the column with spaces */
switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) {
case FLAGS_FILL_FULL:
- ret = trace_seq_puts(s, " | ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " | ");
+ return;
case FLAGS_FILL_START:
- ret = trace_seq_puts(s, " ");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " ");
+ return;
case FLAGS_FILL_END:
- ret = trace_seq_puts(s, " |");
- return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, " |");
+ return;
}
/* Signal a overhead of time execution to the output */
- if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- ret = trace_seq_puts(s, "! ");
- /* Duration exceeded 10 msecs */
- else if (duration > 10000ULL)
- ret = trace_seq_puts(s, "+ ");
- }
-
- /*
- * The -1 means we either did not exceed the duration tresholds
- * or we dont want to print out the overhead. Either way we need
- * to fill out the space.
- */
- if (ret == -1)
- ret = trace_seq_puts(s, " ");
-
- /* Catching here any failure happenned above */
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_print_graph_duration(duration, s);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
-
- ret = trace_seq_puts(s, "| ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD)
+ trace_seq_printf(s, "%c ", trace_find_mark(duration));
+ else
+ trace_seq_puts(s, " ");
- return TRACE_TYPE_HANDLED;
+ trace_print_graph_duration(duration, s);
+ trace_seq_puts(s, "| ");
}
/* Case of a leaf function on its call entry */
@@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- int ret;
int i;
graph_ret = &ret_entry->ret;
@@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
- ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%ps();\n", (void *)call->func);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
- int ret;
int i;
if (data) {
@@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Function */
- for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
+
+ trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
- if (!ret)
+ if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
/*
@@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter,
return TRACE_TYPE_NO_CONSUME;
}
-static enum print_line_t
+static void
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
- int ret;
/* Pid */
- if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ verif_pid(s, ent->pid, cpu, data);
- if (type) {
+ if (type)
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
- return 0;
+ return;
/* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
+ print_graph_abs_time(iter->ts, s);
/* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_CPU)
+ print_graph_cpu(s, cpu);
/* Proc */
if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, ent->pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_puts(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_proc(s, ent->pid);
+ trace_seq_puts(s, " | ");
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
- ret = print_graph_lat_fmt(s, ent);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ print_graph_lat_fmt(s, ent);
- return 0;
+ return;
}
/*
@@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
@@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
- int ret;
int i;
if (check_irq_return(iter, flags, trace->depth))
@@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
}
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- ret = print_graph_duration(duration, s, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_duration(duration, s, flags);
/* Closing brace */
- for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/*
* If the return function does not have a matching entry,
@@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
* belongs to, write out the function name. Always do
* that if the funcgraph-tail option is enabled.
*/
- if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
- ret = trace_seq_puts(s, "}\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- } else {
- ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL))
+ trace_seq_puts(s, "}\n");
+ else
+ trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
/* Overrun */
- if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
- ret = trace_seq_printf(s, " (Overruns: %lu)\n",
- trace->overrun);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN)
+ trace_seq_printf(s, " (Overruns: %lu)\n",
+ trace->overrun);
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
- cpu, pid, flags);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (data)
depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0, flags))
- return TRACE_TYPE_PARTIAL_LINE;
+ print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
- if (ret != TRACE_TYPE_HANDLED)
- return ret;
+ print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
- for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
- ret = trace_seq_putc(s, ' ');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++)
+ trace_seq_putc(s, ' ');
/* The comment */
- ret = trace_seq_puts(s, "/* ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, "/* ");
switch (iter->ent->type) {
case TRACE_BPRINT:
@@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
return ret;
}
+ if (trace_seq_has_overflowed(s))
+ goto out;
+
/* Strip ending newline */
- if (s->buffer[s->len - 1] == '\n') {
- s->buffer[s->len - 1] = '\0';
- s->len--;
+ if (s->buffer[s->seq.len - 1] == '\n') {
+ s->buffer[s->seq.len - 1] = '\0';
+ s->seq.len--;
}
- ret = trace_seq_puts(s, " */\n");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " */\n");
+ out:
+ return trace_handle_return(s);
}
@@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
print_lat_header(s, flags);
/* 1st line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " TIME ");
+ seq_puts(s, " TIME ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " CPU");
+ seq_puts(s, " CPU");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " TASK/PID ");
+ seq_puts(s, " TASK/PID ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " DURATION ");
- seq_printf(s, " FUNCTION CALLS\n");
+ seq_puts(s, " DURATION ");
+ seq_puts(s, " FUNCTION CALLS\n");
/* 2nd line */
- seq_printf(s, "#");
+ seq_putc(s, '#');
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_CPU)
- seq_printf(s, " | ");
+ seq_puts(s, " | ");
if (flags & TRACE_GRAPH_PRINT_PROC)
- seq_printf(s, " | | ");
+ seq_puts(s, " | | ");
if (lat)
- seq_printf(s, "||||");
+ seq_puts(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
- seq_printf(s, " | | ");
- seq_printf(s, " | | | |\n");
+ seq_puts(s, " | | ");
+ seq_puts(s, " | | | |\n");
}
static void print_graph_headers(struct seq_file *s)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index bd90e1b06088..b0b1c44e923a 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
{
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
+ iter.buffer_iter = buffer_iter;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
- if (!trace_empty(&iter))
- trace_find_next_entry_inc(&iter);
- while (!trace_empty(&iter)) {
+
+ while (trace_find_next_entry_inc(&iter)) {
if (!cnt)
kdb_printf("---------------------------------\n");
cnt++;
- if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines)
+ if (!skip_lines) {
print_trace_line(&iter);
- if (!skip_lines)
trace_printk_seq(&iter.seq);
- else
+ } else {
skip_lines--;
+ }
+
if (KDB_FLAG(CMD_INTERRUPT))
goto out;
}
@@ -86,9 +88,12 @@ out:
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- for_each_tracing_cpu(cpu)
- if (iter.buffer_iter[cpu])
+ for_each_tracing_cpu(cpu) {
+ if (iter.buffer_iter[cpu]) {
ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ iter.buffer_iter[cpu] = NULL;
+ }
+ }
}
/*
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 282f6e4e5539..5edb518be345 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
struct trace_kprobe *tk = v;
int i;
- seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p');
+ seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p');
seq_printf(m, ":%s/%s", tk->tp.call.class->system,
ftrace_event_name(&tk->tp.call));
@@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tk->tp.nr_args; i++)
seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags,
field = (struct kprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
-
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ goto out;
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
+ out:
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,
field = (struct kretprobe_trace_entry_head *)iter->ent;
tp = container_of(event, struct trace_probe, call.event);
- if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)))
- goto partial;
+ trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call));
if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, " <- "))
- goto partial;
+ trace_seq_puts(s, " <- ");
if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ trace_seq_putc(s, ')');
data = (u8 *)&field[1];
for (i = 0; i < tp->nr_args; i++)
if (!tp->args[i].type->print(s, tp->args[i].name,
data + tp->args[i].offset, field))
- goto partial;
+ goto out;
- if (!trace_seq_puts(s, "\n"))
- goto partial;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0abd9b863474..7a9ba62e9fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr)
mmio_reset_data(tr);
}
-static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
+static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
{
- int ret = 0;
int i;
resource_size_t start, end;
const struct pci_driver *drv = pci_dev_driver(dev);
- /* XXX: incomplete checks for trace_seq_printf() return value */
- ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
- dev->bus->number, dev->devfn,
- dev->vendor, dev->device, dev->irq);
+ trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
+ dev->bus->number, dev->devfn,
+ dev->vendor, dev->device, dev->irq);
/*
* XXX: is pci_resource_to_user() appropriate, since we are
* supposed to interpret the __ioremap() phys_addr argument based on
@@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
*/
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
(unsigned long long)(start |
(dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
}
for (i = 0; i < 7; i++) {
pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
- ret += trace_seq_printf(s, " %llx",
+ trace_seq_printf(s, " %llx",
dev->resource[i].start < dev->resource[i].end ?
(unsigned long long)(end - start) + 1 : 0);
}
if (drv)
- ret += trace_seq_printf(s, " %s\n", drv->name);
+ trace_seq_printf(s, " %s\n", drv->name);
else
- ret += trace_seq_puts(s, " \n");
- return ret;
+ trace_seq_puts(s, " \n");
}
static void destroy_header_iter(struct header_iter *hiter)
@@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret = 1;
trace_assign_type(field, entry);
rw = &field->rw;
switch (rw->opcode) {
case MMIO_READ:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
@@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
(rw->value >> 0) & 0xff, rw->pc, 0);
break;
default:
- ret = trace_seq_puts(s, "rw what?\n");
+ trace_seq_puts(s, "rw what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_map(struct trace_iterator *iter)
@@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
trace_assign_type(field, entry);
m = &field->map;
switch (m->opcode) {
case MMIO_PROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
- ret = trace_seq_printf(s,
+ trace_seq_printf(s,
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
- ret = trace_seq_puts(s, "map what?\n");
+ trace_seq_puts(s, "map what?\n");
break;
}
- if (ret)
- return TRACE_TYPE_HANDLED;
- return TRACE_TYPE_PARTIAL_LINE;
+
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
@@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
- int ret;
/* The trailing newline must be in the message. */
- ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t mmio_print_line(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c6977d5a9b12..b77b9a697619 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bputs_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->str);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct bprint_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_bprintf(s, field->fmt, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
@@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct print_entry *field;
- int ret;
trace_assign_type(field, entry);
- ret = trace_seq_puts(s, field->buf);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_puts(s, field->buf);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
const char *
@@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
-
+
trace_seq_putc(p, 0);
return ret;
@@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct trace_seq *p = &iter->tmp_seq;
struct trace_entry *entry;
- int ret;
event = container_of(trace_event, struct ftrace_event_call, event);
entry = iter->ent;
@@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter,
}
trace_seq_init(p);
- ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", ftrace_event_name(event));
- return 0;
+ return trace_handle_return(s);
}
EXPORT_SYMBOL(ftrace_raw_output_prep);
@@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name,
char *fmt, va_list ap)
{
struct trace_seq *s = &iter->seq;
- int ret;
-
- ret = trace_seq_printf(s, "%s: ", name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
- ret = trace_seq_vprintf(s, fmt, ap);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: ", name);
+ trace_seq_vprintf(s, fmt, ap);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...)
@@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name)
}
#endif /* CONFIG_KRETPROBES */
-static int
+static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
@@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
-static int
+static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
@@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
sprint_symbol(str, address);
name = kretprobed(str);
- return trace_seq_printf(s, fmt, name);
+ trace_seq_printf(s, fmt, name);
#endif
- return 1;
}
#ifndef CONFIG_64BIT
@@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (file) {
ret = trace_seq_path(s, &file->f_path);
if (ret)
- ret = trace_seq_printf(s, "[+0x%lx]",
- ip - vmstart);
+ trace_seq_printf(s, "[+0x%lx]",
+ ip - vmstart);
}
up_read(&mm->mmap_sem);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+ return !trace_seq_has_overflowed(s);
}
int
@@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
unsigned long sym_flags)
{
struct mm_struct *mm = NULL;
- int ret = 1;
unsigned int i;
if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
@@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
unsigned long ip = entry->caller[i];
- if (ip == ULONG_MAX || !ret)
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
break;
- if (ret)
- ret = trace_seq_puts(s, " => ");
+
+ trace_seq_puts(s, " => ");
+
if (!ip) {
- if (ret)
- ret = trace_seq_puts(s, "??");
- if (ret)
- ret = trace_seq_putc(s, '\n');
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
continue;
}
- if (!ret)
- break;
- if (ret)
- ret = seq_print_user_ip(s, mm, ip, sym_flags);
- ret = trace_seq_putc(s, '\n');
+
+ seq_print_user_ip(s, mm, ip, sym_flags);
+ trace_seq_putc(s, '\n');
}
if (mm)
mmput(mm);
- return ret;
+
+ return !trace_seq_has_overflowed(s);
}
int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
- int ret;
-
- if (!ip)
- return trace_seq_putc(s, '0');
+ if (!ip) {
+ trace_seq_putc(s, '0');
+ goto out;
+ }
if (sym_flags & TRACE_ITER_SYM_OFFSET)
- ret = seq_print_sym_offset(s, "%s", ip);
+ seq_print_sym_offset(s, "%s", ip);
else
- ret = seq_print_sym_short(s, "%s", ip);
-
- if (!ret)
- return 0;
+ seq_print_sym_short(s, "%s", ip);
if (sym_flags & TRACE_ITER_SYM_ADDR)
- ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
- return ret;
+ trace_seq_printf(s, " <" IP_FMT ">", ip);
+
+ out:
+ return !trace_seq_has_overflowed(s);
}
/**
@@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
- int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
softirq ? 's' :
'.';
- if (!trace_seq_printf(s, "%c%c%c",
- irqs_off, need_resched, hardsoft_irq))
- return 0;
+ trace_seq_printf(s, "%c%c%c",
+ irqs_off, need_resched, hardsoft_irq);
if (entry->preempt_count)
- ret = trace_seq_printf(s, "%x", entry->preempt_count);
+ trace_seq_printf(s, "%x", entry->preempt_count);
else
- ret = trace_seq_putc(s, '.');
+ trace_seq_putc(s, '.');
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static int
@@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
- comm, entry->pid, cpu))
- return 0;
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
}
-static unsigned long preempt_mark_thresh_us = 100;
+#undef MARK
+#define MARK(v, s) {.val = v, .sym = s}
+/* trace overhead mark */
+static const struct trace_mark {
+ unsigned long long val; /* unit: nsec */
+ char sym;
+} mark[] = {
+ MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(1000000ULL , '#'), /* 1000 usecs */
+ MARK(100000ULL , '!'), /* 100 usecs */
+ MARK(10000ULL , '+'), /* 10 usecs */
+};
+#undef MARK
+
+char trace_find_mark(unsigned long long d)
+{
+ int i;
+ int size = ARRAY_SIZE(mark);
+
+ for (i = 0; i < size; i++) {
+ if (d >= mark[i].val)
+ break;
+ }
+
+ return (i == size) ? ' ' : mark[i].sym;
+}
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
@@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
unsigned long rel_msec = (unsigned long)rel_ts;
- return trace_seq_printf(
- s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
- ns2usecs(iter->ts),
- abs_msec, abs_usec,
- rel_msec, rel_usec);
+ trace_seq_printf(
+ s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
+ ns2usecs(iter->ts),
+ abs_msec, abs_usec,
+ rel_msec, rel_usec);
+
} else if (verbose && !in_ns) {
- return trace_seq_printf(
- s, "[%016llx] %lld (+%lld): ",
- iter->ts, abs_ts, rel_ts);
+ trace_seq_printf(
+ s, "[%016llx] %lld (+%lld): ",
+ iter->ts, abs_ts, rel_ts);
+
} else if (!verbose && in_ns) {
- return trace_seq_printf(
- s, " %4lldus%c: ",
- abs_ts,
- rel_ts > preempt_mark_thresh_us ? '!' :
- rel_ts > 1 ? '+' : ' ');
+ trace_seq_printf(
+ s, " %4lldus%c: ",
+ abs_ts,
+ trace_find_mark(rel_ts * NSEC_PER_USEC));
+
} else { /* !verbose && !in_ns */
- return trace_seq_printf(s, " %4lld: ", abs_ts);
+ trace_seq_printf(s, " %4lld: ", abs_ts);
}
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_context(struct trace_iterator *iter)
@@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
- int ret;
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (!ret)
- return 0;
- if (trace_flags & TRACE_ITER_IRQ_INFO) {
- ret = trace_print_lat_fmt(s, entry);
- if (!ret)
- return 0;
- }
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
t = ns2usecs(iter->ts);
usec_rem = do_div(t, USEC_PER_SEC);
secs = (unsigned long)t;
- return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
+ trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
} else
- return trace_seq_printf(s, " %12llu: ", iter->ts);
+ trace_seq_printf(s, " %12llu: ", iter->ts);
+
+ return !trace_seq_has_overflowed(s);
}
int trace_print_lat_context(struct trace_iterator *iter)
{
u64 next_ts;
- int ret;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
@@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- ret = trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
- comm, entry->pid, iter->cpu, entry->flags,
- entry->preempt_count, iter->idx);
+ trace_seq_printf(
+ s, "%16s %5d %3d %d %08x %08lx ",
+ comm, entry->pid, iter->cpu, entry->flags,
+ entry->preempt_count, iter->idx);
} else {
- ret = lat_print_generic(s, entry, iter->cpu);
+ lat_print_generic(s, entry, iter->cpu);
}
- if (ret)
- ret = lat_print_timestamp(iter, next_ts);
+ lat_print_timestamp(iter, next_ts);
- return ret;
+ return !trace_seq_has_overflowed(s);
}
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event)
goto out;
} else {
-
+
event->type = next_event_type++;
list = &ftrace_event_list;
}
@@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
- if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
/* TRACE_FN */
@@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
- if (!trace_seq_puts(s, " <-"))
- goto partial;
- if (!seq_print_ip_sym(s,
- field->parent_ip,
- flags))
- goto partial;
+ trace_seq_puts(s, " <-");
+ seq_print_ip_sym(s, field->parent_ip, flags);
}
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
@@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
- field->ip,
- field->parent_ip))
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(&iter->seq, "%lx %lx\n",
+ field->ip,
+ field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
@@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_HEX_FIELD_RET(s, field->ip);
- SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_HEX_FIELD(s, field->ip);
+ SEQ_PUT_HEX_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
@@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->ip);
- SEQ_PUT_FIELD_RET(s, field->parent_ip);
+ SEQ_PUT_FIELD(s, field->ip);
+ SEQ_PUT_FIELD(s, field->parent_ip);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_fn_funcs = {
@@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
T = task_state_char(field->next_state);
S = task_state_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
- if (!trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
- field->prev_pid,
- field->prev_prio,
- S, delim,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T, comm))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq,
+ " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ field->prev_pid,
+ field->prev_prio,
+ S, delim,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T, comm);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
@@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
if (!S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
- field->prev_pid,
- field->prev_prio,
- S,
- field->next_cpu,
- field->next_pid,
- field->next_prio,
- T))
- return TRACE_TYPE_PARTIAL_LINE;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
+ field->prev_pid,
+ field->prev_prio,
+ S,
+ field->next_cpu,
+ field->next_pid,
+ field->next_prio,
+ T);
+
+ return trace_handle_return(&iter->seq);
}
static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
@@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
S = task_state_char(field->prev_state);
T = task_state_char(field->next_state);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_HEX_FIELD_RET(s, S);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
- SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
- SEQ_PUT_HEX_FIELD_RET(s, T);
+ SEQ_PUT_HEX_FIELD(s, field->prev_pid);
+ SEQ_PUT_HEX_FIELD(s, field->prev_prio);
+ SEQ_PUT_HEX_FIELD(s, S);
+ SEQ_PUT_HEX_FIELD(s, field->next_cpu);
+ SEQ_PUT_HEX_FIELD(s, field->next_pid);
+ SEQ_PUT_HEX_FIELD(s, field->next_prio);
+ SEQ_PUT_HEX_FIELD(s, T);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
@@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- SEQ_PUT_FIELD_RET(s, field->prev_pid);
- SEQ_PUT_FIELD_RET(s, field->prev_prio);
- SEQ_PUT_FIELD_RET(s, field->prev_state);
- SEQ_PUT_FIELD_RET(s, field->next_pid);
- SEQ_PUT_FIELD_RET(s, field->next_prio);
- SEQ_PUT_FIELD_RET(s, field->next_state);
+ SEQ_PUT_FIELD(s, field->prev_pid);
+ SEQ_PUT_FIELD(s, field->prev_prio);
+ SEQ_PUT_FIELD(s, field->prev_state);
+ SEQ_PUT_FIELD(s, field->next_cpu);
+ SEQ_PUT_FIELD(s, field->next_pid);
+ SEQ_PUT_FIELD(s, field->next_prio);
+ SEQ_PUT_FIELD(s, field->next_state);
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_ctx_funcs = {
@@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
- if (!trace_seq_puts(s, "<stack trace>\n"))
- goto partial;
+ trace_seq_puts(s, "<stack trace>\n");
for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
- if (!trace_seq_puts(s, " => "))
- goto partial;
- if (!seq_print_ip_sym(s, *p, flags))
- goto partial;
- if (!trace_seq_putc(s, '\n'))
- goto partial;
- }
+ if (trace_seq_has_overflowed(s))
+ break;
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, " => ");
+ seq_print_ip_sym(s, *p, flags);
+ trace_seq_putc(s, '\n');
+ }
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_stack_funcs = {
@@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!trace_seq_puts(s, "<user stack trace>\n"))
- goto partial;
-
- if (!seq_print_userip_objs(field, s, flags))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_puts(s, "<user stack trace>\n");
+ seq_print_userip_objs(field, s, flags);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_user_stack_funcs = {
@@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_puts(s, field->str);
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_puts(s, field->str))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_puts(s, field->str);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bputs_funcs = {
@@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, entry);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_puts(s, ": "))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_puts(s, ": ");
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
@@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(s, ": %lx : ", field->ip))
- goto partial;
-
- if (!trace_seq_bprintf(s, field->fmt, field->buf))
- goto partial;
+ trace_seq_printf(s, ": %lx : ", field->ip);
+ trace_seq_bprintf(s, field->fmt, field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static struct trace_event_functions trace_bprint_funcs = {
@@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- if (!seq_print_ip_sym(s, field->ip, flags))
- goto partial;
-
- if (!trace_seq_printf(s, ": %s", field->buf))
- goto partial;
+ seq_print_ip_sym(s, field->ip, flags);
+ trace_seq_printf(s, ": %s", field->buf);
- return TRACE_TYPE_HANDLED;
-
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(s);
}
static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
@@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
- goto partial;
-
- return TRACE_TYPE_HANDLED;
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
- partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ return trace_handle_return(&iter->seq);
}
static struct trace_event_functions trace_print_funcs = {
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 80b25b585a70..8ef2c40efb3c 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define SEQ_PUT_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x) \
-do { \
- if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
- return TRACE_TYPE_PARTIAL_LINE; \
-} while (0)
+#define SEQ_PUT_FIELD(s, x) \
+ trace_seq_putmem(s, &(x), sizeof(x))
+
+#define SEQ_PUT_HEX_FIELD(s, x) \
+ trace_seq_putmem_hex(s, &(x), sizeof(x))
#endif
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2900817ba65c..c4e70b6bd7fa 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v)
seq_puts(m, "\\t");
break;
case '\\':
- seq_puts(m, "\\");
+ seq_putc(m, '\\');
break;
case '"':
seq_puts(m, "\\\"");
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index d4b9fc22cd27..b983b2fd2ca1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -40,7 +40,8 @@ const char *reserved_field_names[] = {
int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
void *data, void *ent) \
{ \
- return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
+ return !trace_seq_has_overflowed(s); \
} \
const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
@@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
int len = *(u32 *)data >> 16;
if (!len)
- return trace_seq_printf(s, " %s=(fault)", name);
+ trace_seq_printf(s, " %s=(fault)", name);
else
- return trace_seq_printf(s, " %s=\"%s\"", name,
- (const char *)get_loc_data(data, ent));
+ trace_seq_printf(s, " %s=\"%s\"", name,
+ (const char *)get_loc_data(data, ent));
+ return !trace_seq_has_overflowed(s);
}
NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3f34dc9b40f3..2e293beb186e 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -14,122 +14,26 @@
#include "trace.h"
-static struct trace_array *ctx_trace;
-static int __read_mostly tracer_enabled;
static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
-static int sched_stopped;
-
-
-void
-tracing_sched_switch_trace(struct trace_array *tr,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = prev->pid;
- entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
- entry->next_pid = next->pid;
- entry->next_prio = next->prio;
- entry->next_state = next->state;
- entry->next_cpu = task_cpu(next);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
static void
probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu;
- int pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(prev);
tracing_record_cmdline(next);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
-
- local_irq_restore(flags);
-}
-
-void
-tracing_sched_wakeup_trace(struct trace_array *tr,
- struct task_struct *wakee,
- struct task_struct *curr,
- unsigned long flags, int pc)
-{
- struct ftrace_event_call *call = &event_wakeup;
- struct ring_buffer_event *event;
- struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
-
- event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
- sizeof(*entry), flags, pc);
- if (!event)
- return;
- entry = ring_buffer_event_data(event);
- entry->prev_pid = curr->pid;
- entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
- entry->next_pid = wakee->pid;
- entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
- entry->next_cpu = task_cpu(wakee);
-
- if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
- struct trace_array_cpu *data;
- unsigned long flags;
- int cpu, pc;
-
if (unlikely(!sched_ref))
return;
tracing_record_cmdline(current);
-
- if (!tracer_enabled || sched_stopped)
- return;
-
- pc = preempt_count();
- local_irq_save(flags);
- cpu = raw_smp_processor_id();
- data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
-
- if (likely(!atomic_read(&data->disabled)))
- tracing_sched_wakeup_trace(ctx_trace, wakee, current,
- flags, pc);
-
- local_irq_restore(flags);
}
static int tracing_sched_register(void)
@@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void)
{
tracing_stop_sched_switch();
}
-
-/**
- * tracing_start_sched_switch_record - start tracing context switches
- *
- * Turns on context switch tracing for a tracer.
- */
-void tracing_start_sched_switch_record(void)
-{
- if (unlikely(!ctx_trace)) {
- WARN_ON(1);
- return;
- }
-
- tracing_start_sched_switch();
-
- mutex_lock(&sched_register_mutex);
- tracer_enabled++;
- mutex_unlock(&sched_register_mutex);
-}
-
-/**
- * tracing_stop_sched_switch_record - start tracing context switches
- *
- * Turns off context switch tracing for a tracer.
- */
-void tracing_stop_sched_switch_record(void)
-{
- mutex_lock(&sched_register_mutex);
- tracer_enabled--;
- WARN_ON(tracer_enabled < 0);
- mutex_unlock(&sched_register_mutex);
-
- tracing_stop_sched_switch();
-}
-
-/**
- * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
- * @tr: trace array pointer to assign
- *
- * Some tracers might want to record the context switches in their
- * trace. This function lets those tracers assign the trace array
- * to use.
- */
-void tracing_sched_switch_assign_trace(struct trace_array *tr)
-{
- ctx_trace = tr;
-}
-
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 19bd8928ce94..8fb84b362816 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
wakeup_current_cpu = cpu;
}
+static void
+tracing_sched_switch_trace(struct trace_array *tr,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_context_switch;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = prev->pid;
+ entry->prev_prio = prev->prio;
+ entry->prev_state = prev->state;
+ entry->next_pid = next->pid;
+ entry->next_prio = next->prio;
+ entry->next_state = next->state;
+ entry->next_cpu = task_cpu(next);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
+static void
+tracing_sched_wakeup_trace(struct trace_array *tr,
+ struct task_struct *wakee,
+ struct task_struct *curr,
+ unsigned long flags, int pc)
+{
+ struct ftrace_event_call *call = &event_wakeup;
+ struct ring_buffer_event *event;
+ struct ctx_switch_entry *entry;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
+ sizeof(*entry), flags, pc);
+ if (!event)
+ return;
+ entry = ring_buffer_event_data(event);
+ entry->prev_pid = curr->pid;
+ entry->prev_prio = curr->prio;
+ entry->prev_state = curr->state;
+ entry->next_pid = wakee->pid;
+ entry->next_prio = wakee->prio;
+ entry->next_state = wakee->state;
+ entry->next_cpu = task_cpu(wakee);
+
+ if (!call_filter_check_discard(call, entry, buffer, event))
+ trace_buffer_unlock_commit(buffer, event, flags, pc);
+}
+
static void notrace
probe_wakeup_sched_switch(void *ignore,
struct task_struct *prev, struct task_struct *next)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 1f24ed99dca2..f8b45d8792f9 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -27,10 +27,19 @@
#include <linux/trace_seq.h>
/* How much buffer is left on the trace_seq? */
-#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq)
/* How much buffer is written? */
-#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq)
+
+/*
+ * trace_seq should work with being initialized with 0s.
+ */
+static inline void __trace_seq_init(struct trace_seq *s)
+{
+ if (unlikely(!s->seq.size))
+ trace_seq_init(s);
+}
/**
* trace_print_seq - move the contents of trace_seq into a seq_file
@@ -43,10 +52,11 @@
*/
int trace_print_seq(struct seq_file *m, struct trace_seq *s)
{
- unsigned int len = TRACE_SEQ_BUF_USED(s);
int ret;
- ret = seq_write(m, s->buffer, len);
+ __trace_seq_init(s);
+
+ ret = seq_buf_print_seq(m, &s->seq);
/*
* Only reset this buffer if we successfully wrote to the
@@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
* trace_seq_printf() is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+void trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ unsigned int save_len = s->seq.len;
va_list ap;
- int ret;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ seq_buf_vprintf(&s->seq, fmt, ap);
va_end(ap);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_printf);
@@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf);
* @nmaskbits: The number of bits that are valid in @maskp
*
* Writes a ASCII representation of a bitmask string into @s.
- *
- * Returns 1 if we successfully written all the contents to
- * the buffer.
- * Returns 0 if we the length to write is bigger than the
- * reserved buffer space. In this case, nothing gets written.
*/
-int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
int nmaskbits)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
+ __trace_seq_init(s);
- return 1;
+ seq_buf_bitmask(&s->seq, maskp, nmaskbits);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ }
}
EXPORT_SYMBOL_GPL(trace_seq_bitmask);
@@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_printf is used to store strings into a special
* buffer (@s). Then the output may be either used by
* the sequencer or pulled into another buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+ __trace_seq_init(s);
+
+ seq_buf_vprintf(&s->seq, fmt, args);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_vprintf);
@@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf);
*
* This function will take the format and the binary array and finish
* the conversion into the ASCII string within the buffer.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
{
- unsigned int len = TRACE_SEQ_BUF_LEFT(s);
- int ret;
+ unsigned int save_len = s->seq.len;
- if (s->full || !len)
- return 0;
+ if (s->full)
+ return;
+
+ __trace_seq_init(s);
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+ seq_buf_bprintf(&s->seq, fmt, binary);
/* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
s->full = 1;
- return 0;
+ return;
}
-
- s->len += ret;
-
- return len;
}
EXPORT_SYMBOL_GPL(trace_seq_bprintf);
@@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf);
* copy to user routines. This function records a simple string
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_puts(struct trace_seq *s, const char *str)
+void trace_seq_puts(struct trace_seq *s, const char *str)
{
unsigned int len = strlen(str);
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, str, len);
}
EXPORT_SYMBOL_GPL(trace_seq_puts);
@@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts);
* copy to user routines. This function records a simple charater
* into a special buffer (@s) for later retrieval by a sequencer
* or other mechanism.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
+void trace_seq_putc(struct trace_seq *s, unsigned char c)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
- return 0;
+ return;
}
- s->buffer[s->len++] = c;
-
- return 1;
+ seq_buf_putc(&s->seq, c);
}
EXPORT_SYMBOL_GPL(trace_seq_putc);
@@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc);
* There may be cases where raw memory needs to be written into the
* buffer and a strcpy() would not work. Using this function allows
* for such cases.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
{
if (s->full)
- return 0;
+ return;
+
+ __trace_seq_init(s);
if (len > TRACE_SEQ_BUF_LEFT(s)) {
s->full = 1;
- return 0;
+ return;
}
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
+ seq_buf_putmem(&s->seq, mem, len);
}
EXPORT_SYMBOL_GPL(trace_seq_putmem);
-#define MAX_MEMHEX_BYTES 8U
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
/**
* trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
* @s: trace sequence descriptor
@@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem);
* This is similar to trace_seq_putmem() except instead of just copying the
* raw memory into the buffer it writes its ASCII representation of it
* in hex characters.
- *
- * Returns how much it wrote to the buffer.
*/
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+void trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
unsigned int len)
{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- unsigned int start_len;
- int i, j;
- int cnt = 0;
+ unsigned int save_len = s->seq.len;
if (s->full)
- return 0;
+ return;
- while (len) {
- start_len = min(len, HEX_CHARS - 1);
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < start_len; i++) {
-#else
- for (i = start_len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- if (WARN_ON_ONCE(j == 0 || j/2 > len))
- break;
-
- /* j increments twice per loop */
- len -= j / 2;
- hex[j++] = ' ';
-
- cnt += trace_seq_putmem(s, hex, j);
+ __trace_seq_init(s);
+
+ /* Each byte is represented by two chars */
+ if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return;
+ }
+
+ /* The added spaces can still cause an overflow */
+ seq_buf_putmem_hex(&s->seq, mem, len);
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return;
}
- return cnt;
}
EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
@@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
*/
int trace_seq_path(struct trace_seq *s, const struct path *path)
{
- unsigned char *p;
+ unsigned int save_len = s->seq.len;
if (s->full)
return 0;
+ __trace_seq_init(s);
+
if (TRACE_SEQ_BUF_LEFT(s) < 1) {
s->full = 1;
return 0;
}
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
+ seq_buf_path(&s->seq, path, "\n");
+
+ if (unlikely(seq_buf_has_overflowed(&s->seq))) {
+ s->seq.len = save_len;
+ s->full = 1;
+ return 0;
}
- s->full = 1;
- return 0;
+ return 1;
}
EXPORT_SYMBOL_GPL(trace_seq_path);
@@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path);
*/
int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
+ __trace_seq_init(s);
+ return seq_buf_to_user(&s->seq, ubuf, cnt);
}
EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 29228c4d5696..dfe00a4f3f3e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
struct syscall_metadata *entry;
- int i, ret, syscall;
+ int i, syscall;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
}
- ret = trace_seq_printf(s, "%s(", entry->name);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s(", entry->name);
for (i = 0; i < entry->nb_args; i++) {
+
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE) {
- ret = trace_seq_printf(s, "%s ", entry->types[i]);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", entry->types[i]);
+
/* parameter values */
- ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
- trace->args[i],
- i == entry->nb_args - 1 ? "" : ", ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+ trace->args[i],
+ i == entry->nb_args - 1 ? "" : ", ");
}
- ret = trace_seq_putc(s, ')');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
+ trace_seq_putc(s, ')');
end:
- ret = trace_seq_putc(s, '\n');
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ return trace_handle_return(s);
}
static enum print_line_t
@@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
struct syscall_trace_exit *trace;
int syscall;
struct syscall_metadata *entry;
- int ret;
trace = (typeof(trace))ent;
syscall = trace->nr;
@@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
if (!entry) {
trace_seq_putc(s, '\n');
- return TRACE_TYPE_HANDLED;
+ goto out;
}
if (entry->exit_event->event.type != ent->type) {
@@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
return TRACE_TYPE_UNHANDLED;
}
- ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+ trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
trace->ret);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- return TRACE_TYPE_HANDLED;
+ out:
+ return trace_handle_return(s);
}
extern char *__bad_type_size(void);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 33ff6a24b802..8520acc34b18 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -552,8 +552,7 @@ error:
return ret;
fail_address_parse:
- if (inode)
- iput(inode);
+ iput(inode);
pr_info("Failed to parse address or file.\n");
@@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
- seq_printf(m, "\n");
+ seq_putc(m, '\n');
return 0;
}
@@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
tu = container_of(event, struct trace_uprobe, tp.call.event);
if (is_ret_probe(tu)) {
- if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[1], entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[1], entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, true);
} else {
- if (!trace_seq_printf(s, "%s: (0x%lx)",
- ftrace_event_name(&tu->tp.call),
- entry->vaddr[0]))
- goto partial;
+ trace_seq_printf(s, "%s: (0x%lx)",
+ ftrace_event_name(&tu->tp.call),
+ entry->vaddr[0]);
data = DATAOF_TRACE_ENTRY(entry, false);
}
@@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
struct probe_arg *parg = &tu->tp.args[i];
if (!parg->type->print(s, parg->name, data + parg->offset, entry))
- goto partial;
+ goto out;
}
- if (trace_seq_puts(s, "\n"))
- return TRACE_TYPE_HANDLED;
+ trace_seq_putc(s, '\n');
-partial:
- return TRACE_TYPE_PARTIAL_LINE;
+ out:
+ return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,