From 412d0bb553c0227191f1bfd06100f561600bff22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 01:43:25 +0100 Subject: tracing/function-graph-tracer: strip ending newlines on comments Impact: tracer output improvement Ending newlines are appended automatically on comments by the function graph tracer because the newline needs to be placed after the "*/" comment characters. So if the user puts an ending newline, we want to strip it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fcae97a..bc7d90850be5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -592,6 +592,12 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (ent->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); + /* Strip ending newline */ + if (s->buffer[s->len - 1] == '\n') { + s->buffer[s->len - 1] = '\0'; + s->len--; + } + ret = trace_seq_printf(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From c47956d9ae3341d2d1998bff26620fa3338c01e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:11 -0500 Subject: ftrace: remove obsolete print continue functionality Impact: cleanup, remove obsolete code Now that the ring buffer used by ftrace allows for variable length entries, we do not need the 'cont' feature of the buffer. This code makes other parts of ftrace more complex and by removing this it simplifies the ftrace code. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 61 ------------------------------------ kernel/trace/trace.h | 7 ----- kernel/trace/trace_functions_graph.c | 3 -- kernel/trace/trace_mmiotrace.c | 3 -- 4 files changed, 74 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4bb3800318b..fca0233f1d73 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1765,43 +1765,6 @@ static int task_state_char(unsigned long state) return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; } -/* - * The message is supposed to contain an ending newline. - * If the printing stops prematurely, try to add a newline of our own. 
- */ -void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) -{ - struct trace_entry *ent; - struct trace_field_cont *cont; - bool ok = true; - - ent = peek_next_entry(iter, iter->cpu, NULL); - if (!ent || ent->type != TRACE_CONT) { - trace_seq_putc(s, '\n'); - return; - } - - do { - cont = (struct trace_field_cont *)ent; - if (ok) - ok = (trace_seq_printf(s, "%s", cont->buf) > 0); - - ftrace_disable_cpu(); - - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - else - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - - ftrace_enable_cpu(); - - ent = peek_next_entry(iter, iter->cpu, NULL); - } while (ent && ent->type == TRACE_CONT); - - if (!ok) - trace_seq_putc(s, '\n'); -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1834,9 +1797,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) int S, T; int i; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - test_cpu_buff_start(iter); next_entry = find_next_entry(iter, NULL, &next_ts); @@ -1922,8 +1882,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } case TRACE_BRANCH: { @@ -1968,9 +1926,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - test_cpu_buff_start(iter); comm = trace_find_cmdline(iter->ent->pid); @@ -2076,8 +2031,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } case TRACE_GRAPH_RET: { @@ -2124,9 +2077,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - ret = trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (!ret) @@ -2187,8 +2137,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "# %lx %s", field->ip, field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } } @@ -2217,9 +2165,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); SEQ_PUT_HEX_FIELD_RET(s, iter->ts); @@ -2283,9 +2228,6 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } @@ -2296,9 +2238,6 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - SEQ_PUT_FIELD_RET(s, entry->pid); SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..3a357382cce6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,7 +16,6 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, - TRACE_CONT, TRACE_STACK, TRACE_PRINT, TRACE_SPECIAL, @@ -178,7 +177,6 @@ struct trace_power { * 
NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -186,7 +184,6 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_CONT = 0x20, }; #define TRACE_BUF_SIZE 1024 @@ -262,7 +259,6 @@ extern void __ftrace_bad_type(void); do { \ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ - IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ @@ -489,9 +485,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); -extern void trace_seq_print_cont(struct trace_seq *s, - struct trace_iterator *iter); - extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index bc7d90850be5..f261966e5b6c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -589,9 +589,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (ent->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - /* Strip ending newline */ if (s->buffer[s->len - 1] == '\n') { s->buffer[s->len - 1] = '\0'; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fffcb069f1dc..83f20ae6bd68 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -262,9 +262,6 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } -- cgit v1.2.3-59-g8ed1b From f0868d1e23a8efec33beb3aa688aab7fdb1ae093 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:12 -0500 Subject: ftrace: set up trace event hash infrastructure Impact: simplify/generalize/refactor trace.c The trace.c file is becoming more difficult to maintain due to the growing number of events. There is several formats that an event may be printed. This patch sets up the infrastructure of an event hash to allow for events to register how they should be printed. 
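
As an illustration of the API this patch introduces (a sketch, not part of the patch itself): a tracer registers a struct trace_event whose callbacks render one entry per output format. The struct trace_event layout and the register_ftrace_event() semantics are taken from trace_output.h/trace_output.c below; my_event, my_trace_output and the init function are hypothetical names.

#include <linux/kernel.h>
#include <linux/init.h>
#include "trace.h"
#include "trace_output.h"

/* Hypothetical printer: one line of human-readable output per entry */
static int my_trace_output(struct trace_seq *s, struct trace_entry *entry,
			   int flags)
{
	if (!trace_seq_printf(s, "my event (type %d)\n", entry->type))
		return TRACE_TYPE_PARTIAL_LINE;	/* seq buffer is full */
	return 0;
}

static struct trace_event my_event = {
	.type		= 0,	/* 0: let register_ftrace_event() pick a type */
	.trace		= my_trace_output,
	.latency_trace	= my_trace_output,
	/* .raw/.hex/.binary left NULL in this sketch */
};

__init static int my_event_init(void)
{
	/* register_ftrace_event() returns the event type, or 0 on error */
	if (!register_ftrace_event(&my_event))
		printk(KERN_WARNING "could not register my_event\n");
	return 0;
}
device_initcall(my_event_init);
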
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 275 +------------------------- kernel/trace/trace.h | 8 +- kernel/trace/trace_boot.c | 1 + kernel/trace/trace_functions_graph.c | 1 + kernel/trace/trace_hw_branches.c | 1 + kernel/trace/trace_mmiotrace.c | 1 + kernel/trace/trace_output.c | 365 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 43 +++++ kernel/trace/trace_power.c | 1 + 10 files changed, 416 insertions(+), 281 deletions(-) create mode 100644 kernel/trace/trace_output.c create mode 100644 kernel/trace/trace_output.h (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..549f93c9b393 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fca0233f1d73..90ce0c1d437b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -38,6 +38,7 @@ #include #include "trace.h" +#include "trace_output.h" #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) @@ -330,132 +331,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (!len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) - return 0; - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -static int -trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -static int -trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->len >= (PAGE_SIZE - 1)) - return 0; - - s->buffer[s->len++] = c; - - return 1; -} - -static int -trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) -{ - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - -static int -trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; - int i, j; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -static int -trace_seq_path(struct trace_seq *s, struct path *path) -{ - unsigned char *p; - - if (s->len >= (PAGE_SIZE - 1)) - return 0; - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - return 0; -} - static void trace_seq_reset(struct trace_seq *s) { @@ -1473,154 +1348,6 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) -{ - static const char tramp_name[] = "kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) - return "[unknown/kretprobe'd]"; - return name; -} -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ - -static int -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -static int -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - sprint_symbol(str, address); - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - -int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) -{ - int ret; - - if (!ip) - return trace_seq_printf(s, "0"); - - if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); - else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; - - if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) -{ - struct file *file = NULL; - unsigned long vmstart = 0; - int ret = 1; - - if (mm) { - const struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ip); - if (vma) { - file = vma->vm_file; - vmstart = vma->vm_start; - } - if (file) { - ret = 
trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); - } - up_read(&mm->mmap_sem); - } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - int ret = 1; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || !ret) - break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); - if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - continue; - } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - } - - if (mm) - mmput(mm); - return ret; -} - static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3a357382cce6..6bd71fa1e1c7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,7 +30,7 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_POWER, - __TRACE_LAST_TYPE + __TRACE_LAST_TYPE, }; /* @@ -484,12 +484,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); -extern int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, - unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); extern long ns2usecs(cycle_t nsec); extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde28482..cb2ff3e297b1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -11,6 +11,7 @@ #include #include "trace.h" +#include "trace_output.h" static struct trace_array *boot_trace; static bool pre_initcalls_finished; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f261966e5b6c..f8ac5417afc8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -12,6 +12,7 @@ #include #include "trace.h" +#include "trace_output.h" #define TRACE_GRAPH_INDENT 2 diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20a49a9..879752b006b3 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -14,6 +14,7 @@ #include #include "trace.h" +#include "trace_output.h" #define SIZEOF_BTS (1 << 13) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 83f20ae6bd68..fcec59ff2355 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,6 +11,7 @@ #include #include "trace.h" +#include "trace_output.h" struct header_iter { struct pci_dev *dev; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 000000000000..1f3f80002b5e --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,365 @@ +/* + * trace_output.c + * + * Copyright (C) 2008 Red 
Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +static DEFINE_MUTEX(trace_event_mutex); +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +static int next_event_type = __TRACE_LAST_TYPE + 1; + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + int i, j; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +int trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + +#ifdef CONFIG_KRETPROBES +static inline const char *kretprobed(const char *name) +{ + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; +} +#else +static inline const char *kretprobed(const char *name) +{ + return name; +} +#endif /* CONFIG_KRETPROBES */ + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + 
return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + sprint_symbol(str, address); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i && ret) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + } + + if (mm) + mmput(mm); + return ret; +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + struct hlist_node *n; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +/** + * register_ftrace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. 
+ */ +int register_ftrace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + mutex_lock(&trace_event_mutex); + + if (!event->type) + event->type = next_event_type++; + else if (event->type > __TRACE_LAST_TYPE) { + printk(KERN_WARNING "Need to add type to trace.h\n"); + WARN_ON(1); + } + + if (ftrace_find_event(event->type)) + goto out; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head_rcu(&event->node, &event_hash[key]); + + ret = event->type; + out: + mutex_unlock(&trace_event_mutex); + + return ret; +} + +/** + * unregister_ftrace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_ftrace_event(struct trace_event *event) +{ + mutex_lock(&trace_event_mutex); + hlist_del(&event->node); + mutex_unlock(&trace_event_mutex); + + return 0; +} diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 000000000000..1fcc76e1378e --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,43 @@ +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include "trace.h" + +typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry, + int flags); + +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func latency_trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +int trace_seq_puts(struct trace_seq *s, const char *str); +int trace_seq_putc(struct trace_seq *s, unsigned char c); +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +int trace_seq_path(struct trace_seq *s, struct path *path); +int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +struct trace_event *ftrace_find_event(int type); +int register_ftrace_event(struct trace_event *event); +int unregister_ftrace_event(struct trace_event *event); + +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +#endif + diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a352f62..b9b13c39b4bb 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -16,6 +16,7 @@ #include #include "trace.h" +#include "trace_output.h" static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; -- cgit v1.2.3-59-g8ed1b From f633cef0200bbaec539e2dbb0bc4bed7f022f98b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:13 -0500 Subject: ftrace: change trace.c to use registered events Impact: rework trace.c to use new event register API Almost every ftrace event has to implement its output display in trace.c through a different function. Some events did not handle all the formats (trace, latency-trace, raw, hex, binary), and this method does not scale well. This patch converts the format functions to use the event API to find the event and and print its format. Currently, we have a print function for trace, latency_trace, raw, hex and binary. A trace_nop_print is available if the event wants to avoid output on a particular format. 
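
For example (an illustrative sketch, not taken from the patch), a tracer that only wants the two human-readable formats can point the remaining callbacks at trace_nop_print, just as the branch event below does; my_print and my_output_event are hypothetical names, and type 0 means register_ftrace_event() assigns a dynamic type:

#include "trace.h"
#include "trace_output.h"

static int my_print(struct trace_seq *s, struct trace_entry *entry, int flags)
{
	if (!trace_seq_printf(s, "my entry\n"))
		return TRACE_TYPE_PARTIAL_LINE;
	return 0;
}

static struct trace_event my_output_event = {
	.type		= 0,			/* dynamically assigned */
	.trace		= my_print,
	.latency_trace	= my_print,
	.raw		= trace_nop_print,	/* skip raw output */
	.hex		= trace_nop_print,	/* skip hex output */
	.binary		= trace_nop_print,	/* skip binary output */
};
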
Perhaps other tracers could use this in the future (like mmiotrace and function_graph). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 402 ++++---------------------------------- kernel/trace/trace_branch.c | 53 +++++ kernel/trace/trace_output.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 16 ++ 4 files changed, 574 insertions(+), 364 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90ce0c1d437b..3f0317586cfd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1483,15 +1483,6 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, trace_seq_puts(s, " : "); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1515,14 +1506,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *next_entry; + struct trace_event *event; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; unsigned long abs_usecs; unsigned long rel_usecs; u64 next_ts; char *comm; - int S, T; - int i; + int ret; test_cpu_buff_start(iter); @@ -1547,94 +1538,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) lat_print_generic(s, entry, cpu); lat_print_timestamp(s, abs_usecs, rel_usecs); } - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_puts(s, " ("); - seq_print_ip_sym(s, field->parent_ip, sym_flags); - trace_seq_puts(s, ")\n"); - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm); - break; - } - case TRACE_SPECIAL: { - struct special_entry *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) - trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, field->caller[i], sym_flags); - } - trace_seq_puts(s, "\n"); - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - break; - } - case TRACE_BRANCH: { - struct trace_branch *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? 
" ok " : " MISS ", - field->func, - field->file, - field->line); - break; + event = ftrace_find_event(entry->type); + if (event && event->latency_trace) { + ret = event->latency_trace(s, entry, sym_flags); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_USER_STACK: { - struct userstack_entry *field; - trace_assign_type(field, entry); - - seq_print_userip_objs(field, s, sym_flags); - trace_seq_putc(s, '\n'); - break; - } - default: - trace_seq_printf(s, "Unknown type %d\n", entry->type); - } + trace_seq_printf(s, "Unknown type %d\n", entry->type); return TRACE_TYPE_HANDLED; } @@ -1643,13 +1556,12 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; + struct trace_event *event; unsigned long usec_rem; unsigned long long t; unsigned long secs; char *comm; int ret; - int S, T; - int i; entry = iter->ent; @@ -1671,127 +1583,17 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = seq_print_ip_sym(s, field->ip, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - field->parent_ip) { - ret = trace_seq_printf(s, " <-"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_ip_sym(s, - field->parent_ip, - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", - field->prev_pid, - field->prev_prio, - S, - entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_SPECIAL: { - struct special_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) { - ret = trace_seq_puts(s, " <= "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = seq_print_ip_sym(s, field->caller[i], - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_puts(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - break; - } - case TRACE_GRAPH_RET: { - return print_graph_function(iter); - } - case TRACE_GRAPH_ENT: { - return print_graph_function(iter); - } - case TRACE_BRANCH: { - struct trace_branch *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? 
" ok " : " MISS ", - field->func, - field->file, - field->line); - break; + event = ftrace_find_event(entry->type); + if (event && event->trace) { + ret = event->trace(s, entry, sym_flags); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_USER_STACK: { - struct userstack_entry *field; - - trace_assign_type(field, entry); + ret = trace_seq_printf(s, "Unknown type %d\n", entry->type); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_userip_objs(field, s, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - } return TRACE_TYPE_HANDLED; } @@ -1799,8 +1601,8 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_event *event; int ret; - int S, T; entry = iter->ent; @@ -1809,86 +1611,26 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%x %x\n", - field->ip, - field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? '+' : - task_state_char(field->prev_state); - ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; + event = ftrace_find_event(entry->type); + if (event && event->raw) { + ret = event->raw(s, entry, 0); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%d ?\n", entry->type); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - trace_seq_printf(s, "# %lx %s", field->ip, field->buf); - break; - } - } return TRACE_TYPE_HANDLED; } -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - int S, T; + struct trace_event *event; entry = iter->ent; @@ -1896,47 +1638,10 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); SEQ_PUT_HEX_FIELD_RET(s, iter->ts); - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? 
'+' : - task_state_char(field->prev_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->hex) + event->hex(s, entry, 0); - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - break; - } - } SEQ_PUT_FIELD_RET(s, newline); return TRACE_TYPE_HANDLED; @@ -1962,6 +1667,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_event *event; entry = iter->ent; @@ -1969,43 +1675,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->binary) + event->binary(s, entry, 0); - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); - break; - } - case TRACE_CTX: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - break; - } - } - return 1; + return TRACE_TYPE_HANDLED; } static int trace_empty(struct trace_iterator *iter) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6c00feb3bac7..c15222a01073 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -14,7 +14,9 @@ #include #include #include + #include "trace.h" +#include "trace_output.h" #ifdef CONFIG_BRANCH_TRACER @@ -142,6 +144,49 @@ static void branch_trace_reset(struct trace_array *tr) stop_branch_trace(tr); } +static int +trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct trace_branch *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? 
" ok " : " MISS ", + field->func, + field->file, + field->line)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .trace = trace_branch_print, + .latency_trace = trace_branch_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + struct tracer branch_trace __read_mostly = { .name = "branch", @@ -154,6 +199,14 @@ struct tracer branch_trace __read_mostly = __init static int init_branch_trace(void) { + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register branch events\n"); + return 1; + } + return register_tracer(&branch_trace); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1f3f80002b5e..df0c25cbed30 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,6 +286,15 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -363,3 +372,461 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } + +/* + * Standard events + */ + +int +trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return 0; +} + +/* TRACE_FN */ +static int +trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + if (!trace_seq_puts(s, " (")) + goto partial; + if (!seq_print_ip_sym(s, field->parent_ip, flags)) + goto partial; + if (!trace_seq_puts(s, ")\n")) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { + if (!trace_seq_printf(s, " <-")) + goto partial; + if (!seq_print_ip_sym(s, + field->parent_ip, + flags)) + goto partial; + } + if (!trace_seq_printf(s, "\n")) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "%x %x\n", + field->ip, + field->parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + + return 0; +} + +static int +trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); + + return 0; +} + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .trace = trace_fn_trace, + .latency_trace = trace_fn_latency, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = 
trace_fn_bin, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static int +trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, + char *delim) +{ + struct ctx_switch_entry *field; + char *comm; + int S, T; + + trace_assign_type(field, entry); + + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); + comm = trace_find_cmdline(field->next_pid); + if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_print(s, entry, flags, "==>"); +} + +static int +trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_print(s, entry, flags, " +"); +} + +static int +trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, + char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, entry); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_raw(s, entry, flags, 0); +} + +static int +trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_raw(s, entry, flags, '+'); +} + + +static int +trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, + char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, entry); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); + SEQ_PUT_HEX_FIELD_RET(s, T); + + return 0; +} + +static int +trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_hex(s, entry, flags, 0); +} + +static int +trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_hex(s, entry, flags, '+'); +} + +static int +trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); + + return 0; +} + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .trace = trace_ctx_print, + .latency_trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .trace = trace_wake_print, + .latency_trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +/* TRACE_SPECIAL */ +static int +trace_special_print(struct trace_seq *s, struct trace_entry 
*entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "# %ld %ld %ld\n", + field->arg1, + field->arg2, + field->arg3)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); + + return 0; +} + +static int +trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); + + return 0; +} + +static struct trace_event trace_special_event = { + .type = TRACE_SPECIAL, + .trace = trace_special_print, + .latency_trace = trace_special_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_STACK */ + +static int +trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct stack_entry *field; + int i; + + trace_assign_type(field, entry); + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + if (trace_seq_puts(s, " <= ")) + goto partial; + + if (seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; + } + if (trace_seq_puts(s, "\n")) + goto partial; + } + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .trace = trace_stack_print, + .latency_trace = trace_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_USER_STACK */ +static int +trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, + int flags) +{ + struct userstack_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_userip_objs(field, s, flags)) + goto partial; + + if (trace_seq_putc(s, '\n')) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .trace = trace_user_stack_print, + .latency_trace = trace_user_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_PRINT */ +static int +trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .latency_trace = trace_print_print, + .raw = trace_print_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + &trace_special_event, + &trace_stack_event, + 
&trace_user_stack_event, + &trace_print_event, + NULL +}; + +__init static int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + + ret = register_ftrace_event(event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + event->type); + WARN_ON_ONCE(1); + } + } + + return 0; +} +device_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 1fcc76e1378e..ecab4ea4a4fd 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -36,8 +36,24 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); +int +trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + #endif -- cgit v1.2.3-59-g8ed1b From dbd0b4b33074aa6b7832a9d9a5bd985eca5c1aa2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Dec 2008 20:44:51 -0800 Subject: tracing/ftrace: provide the base infrastructure for histogram tracing Impact: extend the tracing API The goal of this patch is to normalize and make more easy the implementation of statistical (histogram) tracing. It implements a trace_stat file into the /debugfs/tracing directory where one can print a one-shot output of statistics/histogram entries. A tracer has to provide two basic iterator callbacks: stat_start() => the first entry stat_next(prev, idx) => the next one. Note that it is adapted for arrays or hash tables or lists.... since it provides a pointer to the previous entry and the current index of the iterator. These two callbacks are called to get a snapshot of the statistics at each opening of the trace_stat file because. The values are so updated between two "cat trace_stat". And the tracer is free to lock its datas during the iteration to keep consistent values. Since it is almost always interesting to sort statisticals values to address the problems by priority, this infrastructure provides a "sorting" of the stat entries too if desired. A tracer has just to provide a stat_cmp callback to compare two entries and the stat tracing infrastructure will build a sorted list of the given entries. A last callback, called stat_headers, can be implemented by a tracer to output headers on its trace. If one of these callbacks is changed on runtime, it just have to signal it to the stat tracing API by calling the init_tracer_stat() helper. Changes in V2: - Fix a memory leak if the user opens multiple times the trace_stat file without closing it. Now we always free our list before rebuilding it. 
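
To make the callback contract concrete, here is a minimal sketch (hypothetical names, not part of the patch) of a tracer exposing a small static table through this API. The callback signatures match the struct tracer additions below: stat_start() returns the first entry, stat_next(prev, idx) is called with idx = 1, 2, ... until it returns NULL, and a positive stat_cmp(p1, p2) places p1 ahead of p2 in the sorted output.

#include <linux/kernel.h>
#include <linux/seq_file.h>
#include "trace.h"

struct my_stat {
	const char	*name;
	unsigned long	hits;
};

static struct my_stat my_stats[] = {
	{ "foo", 42 },
	{ "bar",  7 },
};

static void *my_stat_start(void)
{
	return &my_stats[0];
}

static void *my_stat_next(void *prev, int idx)
{
	if (idx >= ARRAY_SIZE(my_stats))
		return NULL;		/* end of the iteration */
	return &my_stats[idx];
}

static int my_stat_cmp(void *p1, void *p2)
{
	struct my_stat *a = p1, *b = p2;

	/* sort by hit count, highest first */
	if (a->hits > b->hits)
		return 1;
	if (a->hits < b->hits)
		return -1;
	return 0;
}

static int my_stat_headers(struct seq_file *s)
{
	return seq_printf(s, "name       hits\n");
}

static int my_stat_show(struct seq_file *s, void *p)
{
	struct my_stat *stat = p;

	return seq_printf(s, "%-10s %lu\n", stat->name, stat->hits);
}

/*
 * These would be assigned to the stat_* members of an existing
 * struct tracer, followed by a call to init_tracer_stat() on it.
 */
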
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 3 +- kernel/trace/trace.h | 17 ++++ kernel/trace/trace_stat.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_stat.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 549f93c9b393..31cd5fbc0eed 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3f0317586cfd..b789c010512c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2354,6 +2354,7 @@ static int tracing_set_tracer(char *buf) if (ret) goto out; } + init_tracer_stat(t); trace_branch_enable(tr); out: @@ -3206,7 +3207,7 @@ __init static int tracer_alloc_buffers(void) #else current_trace = &nop_trace; #endif - + init_tracer_stat(current_trace); /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bd71fa1e1c7..05fa804d1c16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,6 +336,21 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + + /* + * If you change one of the following on tracing runtime, recall + * init_tracer_stat() + */ + + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for sorting (optional) for stats */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); }; struct trace_seq { @@ -421,6 +436,8 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +void init_tracer_stat(struct tracer *trace); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 000000000000..6f194a33a64a --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,251 @@ +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008 Frederic Weisbecker + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt + * + */ + + +#include +#include +#include +#include "trace.h" + + +/* List of stat entries from a tracer */ +struct trace_stat_list { + struct list_head list; + void *stat; +}; + +static struct trace_stat_list stat_list; + +/* + * This is a copy of the current tracer to avoid racy + * and dangerous output while the current tracer is + * switched. + */ +static struct tracer current_tracer; + +/* + * Protect both the current tracer and the global + * stat list. 
+ */ +static DEFINE_MUTEX(stat_list_mutex); + + +static void reset_stat_list(void) +{ + struct trace_stat_list *node; + struct list_head *next; + + if (list_empty(&stat_list.list)) + return; + + node = list_entry(stat_list.list.next, struct trace_stat_list, list); + next = node->list.next; + + while (&node->list != next) { + kfree(node); + node = list_entry(next, struct trace_stat_list, list); + } + kfree(node); + + INIT_LIST_HEAD(&stat_list.list); +} + +void init_tracer_stat(struct tracer *trace) +{ + mutex_lock(&stat_list_mutex); + current_tracer = *trace; + mutex_unlock(&stat_list_mutex); +} + +/* + * For tracers that don't provide a stat_cmp callback. + * This one will force an immediate insertion on tail of + * the list. + */ +static int dummy_cmp(void *p1, void *p2) +{ + return 1; +} + +/* + * Initialize the stat list at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(void) +{ + struct trace_stat_list *iter_entry, *new_entry; + void *prev_stat; + int ret = 0; + int i; + + mutex_lock(&stat_list_mutex); + reset_stat_list(); + + if (!current_tracer.stat_start || !current_tracer.stat_next || + !current_tracer.stat_show) + goto exit; + + if (!current_tracer.stat_cmp) + current_tracer.stat_cmp = dummy_cmp; + + /* + * The first entry. Actually this is the second, but the first + * one (the stat_list head) is pointless. + */ + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit; + } + + INIT_LIST_HEAD(&new_entry->list); + list_add(&new_entry->list, &stat_list.list); + new_entry->stat = current_tracer.stat_start(); + + prev_stat = new_entry->stat; + + /* + * Iterate over the tracer stat entries and store them in a sorted + * list. 
+ */ + for (i = 1; ; i++) { + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit_free_list; + } + + INIT_LIST_HEAD(&new_entry->list); + new_entry->stat = current_tracer.stat_next(prev_stat, i); + + /* End of insertion */ + if (!new_entry->stat) + break; + + list_for_each_entry(iter_entry, &stat_list.list, list) { + /* Insertion with a descendent sorting */ + if (current_tracer.stat_cmp(new_entry->stat, + iter_entry->stat) > 0) { + + list_add_tail(&new_entry->list, + &iter_entry->list); + break; + + /* The current smaller value */ + } else if (list_is_last(&iter_entry->list, + &stat_list.list)) { + list_add(&new_entry->list, &iter_entry->list); + break; + } + } + + prev_stat = new_entry->stat; + } +exit: + mutex_unlock(&stat_list_mutex); + return ret; + +exit_free_list: + reset_stat_list(); + mutex_unlock(&stat_list_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct trace_stat_list *l = (struct trace_stat_list *)s->private; + + /* Prevent from tracer switch or stat_list modification */ + mutex_lock(&stat_list_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (!*pos && current_tracer.stat_headers) + current_tracer.stat_headers(s); + + return seq_list_start(&l->list, *pos); +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct trace_stat_list *l = (struct trace_stat_list *)s->private; + + return seq_list_next(p, &l->list, pos); +} + +static void stat_seq_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&stat_list_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + return current_tracer.stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &trace_stat_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = &stat_list; + ret = stat_seq_init(); + } + + return ret; +} + + +/* + * Avoid consuming memory with our now useless list. + */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + mutex_lock(&stat_list_mutex); + reset_stat_list(); + mutex_unlock(&stat_list_mutex); + return 0; +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int __init tracing_stat_init(void) +{ + struct dentry *d_tracing; + struct dentry *entry; + + INIT_LIST_HEAD(&stat_list.list); + d_tracing = tracing_init_dentry(); + + entry = debugfs_create_file("trace_stat", 0444, d_tracing, + NULL, + &tracing_stat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_stat' entry\n"); + return 0; +} +fs_initcall(tracing_stat_init); -- cgit v1.2.3-59-g8ed1b From e302cf3f961ceb54c1dd0aff7ba8531df83be07a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Dec 2008 23:25:38 +0100 Subject: tracing/branch-tracer: adapt to the stat tracing API Impact: refactor the branch tracer This patch adapts the branch tracer to the tracing API. This is a proof of concept because the branch tracer implements two "stat tracing" that were split in two files. So I added an option to the branch tracer: stat_all_branch. 
If it is set, then trace_stat will output all of the branches entries stats. Otherwise, it will print the annotated branches. Its is a kind of quick trick, waiting for a better solution. By default, the annotated branches stat are sorted by incorrect branch prediction percentage. Ie: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 1 100 native_smp_prepare_cpus smpboot.c 1228 0 1 100 hpet_rtc_timer_reinit hpet.c 1057 0 18032 100 sched_info_queued sched_stats.h 223 0 684 100 yield_task_fair sched_fair.c 984 0 282 100 pre_schedule_rt sched_rt.c 1263 0 13414 100 sched_info_dequeued sched_stats.h 178 0 21724 100 sched_info_switch sched_stats.h 270 0 1 100 get_signal_to_deliver signal.c 1820 0 8 100 __cancel_work_timer workqueue.c 560 0 212 100 verify_export_symbols module.c 1509 0 17 100 __rmqueue_fallback page_alloc.c 793 0 43 100 clear_page_mlock internal.h 129 0 124 100 try_to_unmap_anon rmap.c 1021 0 53 100 try_to_unmap_anon rmap.c 1013 0 6 100 vma_address rmap.c 232 0 3301 100 try_to_unmap_file rmap.c 1082 0 466 100 try_to_unmap_file rmap.c 1077 0 1 100 mem_cgroup_create memcontrol.c 1090 0 3 100 inotify_find_update_watch inotify.c 726 2 30163 99 perf_counter_task_sched_out perf_counter.c 385 1 2935 99 percpu_free allocpercpu.c 138 1544 297672 99 dentry_lru_del_init dcache.c 153 8 1074 99 input_pass_event input.c 86 1390 76781 98 mapping_unevictable pagemap.h 50 280 6665 95 pick_next_task_rt sched_rt.c 889 750 4826 86 next_pidmap pid.c 194 2 8 80 blocking_notifier_chain_regist notifier.c 220 36 130 78 ioremap_pte_range ioremap.c 22 1093 3247 74 IS_ERR err.h 34 1023 2908 73 sched_slice sched_fair.c 445 22 60 73 disk_put_part genhd.h 206 [...] It enables a developer to quickly address the source of incorrect branch predictions. Note that this sorting would be better with a second sort on the number of incorrect predictions. 
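As a usage sketch (the paths follow the convention used elsewhere in this series; the exact session is illustrative and not taken from the changelog):

	echo branch > /debugfs/tracing/current_tracer
	echo stat_all_branch > /debugfs/tracing/trace_options
	cat /debugfs/tracing/trace_stat

should switch trace_stat from the annotated-branches view to the all-branches view; echoing nostat_all_branch should switch back to the annotated view.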
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 268 +++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c15222a01073..4785a3b9bc4a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -18,10 +18,13 @@ #include "trace.h" #include "trace_output.h" +static struct tracer branch_trace; + #ifdef CONFIG_BRANCH_TRACER static int branch_tracing_enabled __read_mostly; static DEFINE_MUTEX(branch_tracing_mutex); + static struct trace_array *branch_tracer; static void @@ -178,6 +181,7 @@ trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, @@ -187,30 +191,6 @@ static struct trace_event trace_branch_event = { .binary = trace_nop_print, }; -struct tracer branch_trace __read_mostly = -{ - .name = "branch", - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif -}; - -__init static int init_branch_trace(void) -{ - int ret; - - ret = register_ftrace_event(&trace_branch_event); - if (!ret) { - printk(KERN_WARNING "Warning: could not register branch events\n"); - return 1; - } - - return register_tracer(&branch_trace); -} - -device_initcall(init_branch_trace); #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) @@ -236,66 +216,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) } EXPORT_SYMBOL(ftrace_likely_update); -struct ftrace_pointer { - void *start; - void *stop; - int hit; -}; +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +static int annotated_branch_stat_headers(struct seq_file *m) { - const struct ftrace_pointer *f = m->private; - struct ftrace_branch_data *p = v; - - (*pos)++; - - if (v == (void *)1) - return f->start; - - ++p; - - if ((void *)p >= (void *)f->stop) - return NULL; - - return p; + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; } -static void *t_start(struct seq_file *m, loff_t *pos) +static inline long get_incorrect_percent(struct ftrace_branch_data *p) { - void *t = (void *)1; - loff_t l = 0; - - for (; t && l < *pos; t = t_next(m, t, &l)) - ; + long percent; - return t; -} + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : -1; -static void t_stop(struct seq_file *m, void *p) -{ + return percent; } -static int t_show(struct seq_file *m, void *v) +static int branch_stat_show(struct seq_file *m, void *v) { - const struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; - if (v == (void *)1) { - if (fp->hit) - seq_printf(m, " miss hit %% "); - else - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; - } - /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') @@ -305,11 +258,7 @@ static int t_show(struct seq_file *m, void *v) /* * The miss is overlayed on correct, and hit on incorrect. */ - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : -1; + percent = get_incorrect_percent(p); seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) @@ -320,76 +269,143 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracing_likely_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; +static void *annotated_branch_stat_start(void) +{ + return __start_annotated_branch_profile; +} -static int tracing_branch_open(struct inode *inode, struct file *file) +static void * +annotated_branch_stat_next(void *v, int idx) { - int ret; + struct ftrace_branch_data *p = v; - ret = seq_open(file, &tracing_likely_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)inode->i_private; - } + ++p; - return ret; + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; } -static const struct file_operations tracing_branch_fops = { - .open = tracing_branch_open, - .read = seq_read, - .llseek = seq_lseek, -}; +static int annotated_branch_stat_cmp(void *p1, void *p2) +{ + struct ftrace_branch_data *a = p1; + struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + else + return 0; +} #ifdef CONFIG_PROFILE_ALL_BRANCHES -extern unsigned long __start_branch_profile[]; -extern unsigned long __stop_branch_profile[]; +enum { + TRACE_BRANCH_OPT_ALL = 0x1 +}; -static const struct ftrace_pointer ftrace_branch_pos = { - .start = __start_branch_profile, - .stop = __stop_branch_profile, - .hit = 1, +static struct tracer_opt branch_opts[] = { + { TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) }, + { } }; -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ +static struct tracer_flags branch_flags = { + .val = 0, + .opts = branch_opts +}; -extern unsigned long __start_annotated_branch_profile[]; -extern unsigned long __stop_annotated_branch_profile[]; +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; -static const struct ftrace_pointer ftrace_annotated_branch_pos = { - .start = __start_annotated_branch_profile, - .stop = __stop_annotated_branch_profile, -}; +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_printf(m, " miss hit %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} -static __init int ftrace_branch_init(void) +static void *all_branch_stat_start(void) { - struct dentry *d_tracer; - struct dentry *entry; + 
return __start_branch_profile; +} + +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; - d_tracer = tracing_init_dentry(); + ++p; - entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, - (void *)&ftrace_annotated_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'profile_annotatet_branch' entry\n"); + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; -#ifdef CONFIG_PROFILE_ALL_BRANCHES - entry = debugfs_create_file("profile_branch", 0444, d_tracer, - (void *)&ftrace_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs" - " 'profile_branch' entry\n"); -#endif + return p; +} +static int branch_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_BRANCH_OPT_ALL) { + if (set) { + branch_trace.stat_headers = all_branch_stat_headers; + branch_trace.stat_start = all_branch_stat_start; + branch_trace.stat_next = all_branch_stat_next; + branch_trace.stat_cmp = NULL; + } else { + branch_trace.stat_headers = + annotated_branch_stat_headers; + branch_trace.stat_start = annotated_branch_stat_start; + branch_trace.stat_next = annotated_branch_stat_next; + branch_trace.stat_cmp = annotated_branch_stat_cmp; + } + init_tracer_stat(&branch_trace); + } return 0; } -device_initcall(ftrace_branch_init); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", +#ifdef CONFIG_BRANCH_TRACER + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ +#endif /* CONFIG_BRANCH_TRACER */ + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_show = branch_stat_show, + .stat_headers = annotated_branch_stat_headers, + .stat_cmp = annotated_branch_stat_cmp, +#ifdef CONFIG_PROFILE_ALL_BRANCHES + .flags = &branch_flags, + .set_flag = branch_set_flag, +#endif +}; + +__init static int init_branch_trace(void) +{ +#ifdef CONFIG_BRANCH_TRACER + int ret; + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register branch events\n"); + return 1; + } +#endif + + return register_tracer(&branch_trace); +} +device_initcall(init_branch_trace); -- cgit v1.2.3-59-g8ed1b From f7d48cbde5c0710008caeaf7dbf14f4a9b064940 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 13:02:17 +0100 Subject: tracing/ftrace: make trace_find_cmdline() generally available Impact: build fix On !CONFIG_CONTEXT_SWITCH_TRACER trace_find_cmdline() is not defined: kernel/trace/trace_output.c: In function 'trace_ctxwake_print': kernel/trace/trace_output.c:499: error: implicit declaration of function 'trace_find_cmdline' kernel/trace/trace_output.c:499: warning: assignment makes pointer from integer without a cast Move it to the generic section in trace.h. 
Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 05fa804d1c16..a8b624ccd4d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -469,10 +469,10 @@ struct tracer_switch_ops { void *private; struct tracer_switch_ops *next; }; - -char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ +extern char *trace_find_cmdline(int pid); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func -- cgit v1.2.3-59-g8ed1b From a103e2ab7377dbbef2506be59c49a3f2ae10b60b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 15:07:47 +0100 Subject: tracing/selftest: remove TRACE_CONT reference Impact: build fix TRACE_CONT is gone - fix up the self-test too. Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 88c8eb70f54a..5013812578b1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: - case TRACE_CONT: case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: -- cgit v1.2.3-59-g8ed1b From 7a51cffbd10886c0557677dd916c090097c691ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 16:03:40 +0100 Subject: relayfs: replace BUG() with WARN_ON() in relay_late_setup_files() Impact: turn boot crash into boot warning This BUG() can trigger: [ 16.684131] initcall fail_page_alloc_debugfs+0x0/0xc1 returned 0 after 0 usecs [ 16.692035] calling kmemtrace_setup_late+0x0/0xd5 @ 1 [ 16.700087] relay_late_setup_files: CPU 1 has no buffer, it must have! [ 16.704044] ------------[ cut here ]------------ [ 16.708030] kernel BUG at kernel/relay.c:680! [ 16.708030] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 16.708030] last sysfs file: [ 16.708030] [ 16.708030] Pid: 1, comm: swapper Not tainted (2.6.28-tip-03903-g9a39f58-dirty #13207) System Product Name [ 16.708030] EIP: 0060:[] EFLAGS: 00010246 CPU: 1 [ 16.708030] EIP is at relay_late_setup_files+0x8c/0x176 Reduce it to a more reportable WARN_ONCE(). Signed-off-by: Ingo Molnar --- kernel/relay.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 09ac2008f77b..d06450670c86 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -675,9 +675,7 @@ int relay_late_setup_files(struct rchan *chan, */ for_each_online_cpu(i) { if (unlikely(!chan->buf[i])) { - printk(KERN_ERR "relay_late_setup_files: CPU %u " - "has no buffer, it must have!\n", i); - BUG(); + WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } -- cgit v1.2.3-59-g8ed1b From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Dec 2008 13:42:23 -0800 Subject: tracing/kmemtrace: normalize the raw tracer event to the unified tracing API Impact: new tracer plugin This patch adapts kmemtrace raw events tracing to the unified tracing API. 
To enable and use this tracer, just do the following: echo kmemtrace > /debugfs/tracing/current_tracer cat /debugfs/tracing/trace You will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | type_id 1 call_site 18446744071565527833 ptr 18446612134395152256 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 That was to stay backward compatible with the format output produced in inux/tracepoint.h. This is the default ouput, but note that I tried something else. If you change an option: echo kmem_minimalistic > /debugfs/trace_options and then cat /debugfs/trace, you will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | - C 0xffff88007c088780 file_free_rcu + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc780 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc870 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc960 -1 d_alloc + K 1304 1312 000000d0 0xffff8800791d7340 -1 reiserfs_alloc_inode - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 992 1000 000000d0 0xffff880079045b58 -1 alloc_inode + K 768 1024 000080d0 0xffff88007c096400 -1 alloc_pipe_info + K 240 240 000000d0 0xffff8800790dca50 -1 d_alloc + K 272 320 000080d0 0xffff88007c088780 -1 get_empty_filp + K 272 320 000080d0 0xffff88007c088000 -1 get_empty_filp Yeah I shall confess kmem_minimalistic should be: kmem_alternative. Whatever, I find it more readable but this a personal opinion of course. We can drop it if you want. On the ALLOC/FREE column, + means an allocation and - a free. On the type column, you have K = kmalloc, C = cache, P = page I would like the flags to be GFP_* strings but that would not be easy to not break the column with strings.... About the node...it seems to always be -1. 
I don't know why but that shouldn't be difficult to find. I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would be more easy to find the tracer headers if they are all in their common directory. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 86 ------------ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmemtrace.h | 75 ++++++++++ init/main.c | 2 +- kernel/trace/Kconfig | 22 +++ kernel/trace/Makefile | 1 + kernel/trace/kmemtrace.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 25 ++++ lib/Kconfig.debug | 20 --- mm/kmemtrace.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 13 files changed, 472 insertions(+), 112 deletions(-) delete mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmemtrace.h create mode 100644 kernel/trace/kmemtrace.c (limited to 'kernel') diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index 5bea8ead6a6b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -#ifdef CONFIG_KMEMTRACE - -extern void kmemtrace_init(void); - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", - type_id, call_site, (unsigned long) ptr, - (unsigned long) bytes_req, (unsigned long) bytes_alloc, - (unsigned long) gfp_flags, node); -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", - type_id, call_site, (unsigned long) ptr); -} - -#else /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_init(void) -{ -} - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 7555ce99f6d2..455f9affea9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. 
*/ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dc28432b5b9a..6b657f7dcb2b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h new file mode 100644 index 000000000000..ad8b7857855a --- /dev/null +++ b/include/trace/kmemtrace.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node); + +extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr); + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 9711586aa7c9..beca7aaddb22 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a6..27fb74b06b3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -264,6 +264,28 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + depends on RELAY + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. 
+ + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..513dc86b5dfa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c new file mode 100644 index 000000000000..d69cbe3c2a4b --- /dev/null +++ b/kernel/trace/kmemtrace.c @@ -0,0 +1,343 @@ +/* + * Memory allocator tracing + * + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * Copyright (C) 2008 Pekka Enberg + * Copyright (C) 2008 Frederic Weisbecker + */ + +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_KMEM_OPT_MINIMAL 0x1 + +static struct tracer_opt kmem_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, + { } +}; + +static struct tracer_flags kmem_tracer_flags = { + .val = 0, + .opts = kmem_opts +}; + + +static bool kmem_tracing_enabled __read_mostly; +static struct trace_array *kmemtrace_array; + +static int kmem_trace_init(struct trace_array *tr) +{ + int cpu; + kmemtrace_array = tr; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + + kmem_tracing_enabled = true; + + return 0; +} + +static void kmem_trace_reset(struct trace_array *tr) +{ + kmem_tracing_enabled = false; +} + +static void kmemtrace_headers(struct seq_file *s) +{ + /* Don't need headers for the original kmemtrace output */ + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return; + + seq_printf(s, "#\n"); + seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " + " POINTER NODE CALLER\n"); + seq_printf(s, "# FREE | | | | " + " | | | |\n"); + seq_printf(s, "# |\n\n"); +} + +/* + * The two following functions give the original output from kmemtrace, + * or something close to....perhaps they need some missing things + */ +static enum print_line_t +kmemtrace_print_alloc_original(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr, + (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, + (unsigned long) entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_original(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +/* The two other following provide a more minimalistic output */ +static enum print_line_t +kmemtrace_print_alloc_compress(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + 
struct trace_seq *s = &iter->seq; + int ret; + + /* Alloc entry */ + ret = trace_seq_printf(s, " + "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Requested */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Allocated */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Flags + * TODO: would be better to see the name of the GFP flag names + */ + ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Node */ + ret = trace_seq_printf(s, "%4d ", entry->node); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_compress(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Free entry */ + ret = trace_seq_printf(s, " - "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? 
"); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip requested/allocated/flags */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip node */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_KMEM_ALLOC: { + struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_alloc_compress(iter, field); + else + return kmemtrace_print_alloc_original(iter, field); + } + + case TRACE_KMEM_FREE: { + struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_free_compress(iter, field); + else + return kmemtrace_print_free_original(iter, field); + } + + default: + return TRACE_TYPE_UNHANDLED; + } +} + +/* Trace allocations */ +void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +static struct tracer kmem_tracer __read_mostly = { + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags +}; + +static int __init init_kmem_tracer(void) +{ + return register_tracer(&kmem_tracer); +} + +device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..534505bb39b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,6 +9,7 @@ #include #include #include +#include enum 
trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_KMEM_ALLOC, + TRACE_KMEM_FREE, TRACE_POWER, __TRACE_LAST_TYPE @@ -170,6 +173,24 @@ struct trace_power { struct power_trace state_data; }; +struct kmemtrace_alloc_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; +}; + +struct kmemtrace_free_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ + TRACE_KMEM_ALLOC); \ + IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ + TRACE_KMEM_FREE); \ __ftrace_bad_type(); \ } while (0) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b5417e23ba94..b0f239e443bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,26 +803,6 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. -config KMEMTRACE - bool "Kernel memory tracer (kmemtrace)" - depends on RELAY && DEBUG_FS && MARKERS - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index 2a70a805027c..0573b5080cc4 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define KMEMTRACE_SUBBUF_SIZE 524288 #define KMEMTRACE_DEF_N_SUBBUFS 20 diff --git a/mm/slob.c b/mm/slob.c index 0f1a49f40690..4d1c0fc33b6b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index cc4001fee7ac..7bf8cf8ec082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 3fd4bc015ef879a7d2b955ce97fb125e3a51ba7e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Dec 2008 12:07:27 +0100 Subject: tracing/kmemtrace: export kmemtrace_mark_alloc_node() / kmemtrace_mark_free() Impact: build fix Also fix up Kconfig dependencies and include files. 
Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 3 ++- kernel/trace/kmemtrace.c | 2 ++ mm/slab.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 27fb74b06b3c..cc9f91e7daf4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,7 +267,8 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - depends on RELAY + select MARKERS + select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index d69cbe3c2a4b..2bfdcd326226 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -296,6 +296,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_alloc_node); void kmemtrace_mark_free(enum kmemtrace_type_id type_id, unsigned long call_site, @@ -325,6 +326,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_free); static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", diff --git a/mm/slab.c b/mm/slab.c index bcf08ea88380..7f72bb386a09 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 723cbe0775514853c22dc45005af59c360916af1 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 5 Jan 2009 22:09:58 +0200 Subject: kmemtrace: Remove the relay version of kmemtrace Impact: cleanup kmemtrace now uses ftrace. This patch removes the relay version. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 - mm/Makefile | 1 - mm/kmemtrace.c | 333 --------------------------------------------------- 3 files changed, 336 deletions(-) delete mode 100644 mm/kmemtrace.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cc9f91e7daf4..1c0b7504cab3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,8 +267,6 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - select MARKERS - select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/mm/Makefile b/mm/Makefile index c92e8af13206..51c27709cc7c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,4 +35,3 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c deleted file mode 100644 index 0573b5080cc4..000000000000 --- a/mm/kmemtrace.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define KMEMTRACE_SUBBUF_SIZE 524288 -#define KMEMTRACE_DEF_N_SUBBUFS 20 - -static struct rchan *kmemtrace_chan; -static u32 kmemtrace_buf_overruns; - -static unsigned int kmemtrace_n_subbufs; - -/* disabled by default */ -static unsigned int kmemtrace_enabled; - -/* - * The sequence number is used for reordering kmemtrace packets - * in userspace, since they are logged as per-CPU data. 
- * - * atomic_t should always be a 32-bit signed integer. Wraparound is not - * likely to occur, but userspace can deal with it by expecting a certain - * sequence number in the next packet that will be read. - */ -static atomic_t kmemtrace_seq_num; - -#define KMEMTRACE_ABI_VERSION 1 - -static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; - -enum kmemtrace_event_id { - KMEMTRACE_EVENT_ALLOC = 0, - KMEMTRACE_EVENT_FREE, -}; - -struct kmemtrace_event { - u8 event_id; - u8 type_id; - u16 event_size; - s32 seq_num; - u64 call_site; - u64 ptr; -} __attribute__ ((__packed__)); - -struct kmemtrace_stats_alloc { - u64 bytes_req; - u64 bytes_alloc; - u32 gfp_flags; - s32 numa_node; -} __attribute__ ((__packed__)); - -static void kmemtrace_probe_alloc(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - struct kmemtrace_stats_alloc *stats; - void *buf; - - local_irq_save(flags); - - buf = relay_reserve(kmemtrace_chan, - sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc)); - if (!buf) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - - ev = buf; - ev->event_id = KMEMTRACE_EVENT_ALLOC; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - - stats = buf + sizeof(struct kmemtrace_event); - stats->bytes_req = va_arg(*args, unsigned long); - stats->bytes_alloc = va_arg(*args, unsigned long); - stats->gfp_flags = va_arg(*args, unsigned long); - stats->numa_node = va_arg(*args, int); - -failed: - local_irq_restore(flags); -} - -static void kmemtrace_probe_free(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - - local_irq_save(flags); - - ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); - if (!ev) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - ev->event_id = KMEMTRACE_EVENT_FREE; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - -failed: - local_irq_restore(flags); -} - -static struct dentry * -kmemtrace_create_buf_file(const char *filename, struct dentry *parent, - int mode, struct rchan_buf *buf, int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kmemtrace_remove_buf_file(struct dentry *dentry) -{ - debugfs_remove(dentry); - - return 0; -} - -static int kmemtrace_subbuf_start(struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) -{ - if (relay_buf_full(buf)) { - /* - * We know it's not SMP-safe, but neither - * debugfs_create_u32() is. 
- */ - kmemtrace_buf_overruns++; - return 0; - } - - return 1; -} - -static struct rchan_callbacks relay_callbacks = { - .create_buf_file = kmemtrace_create_buf_file, - .remove_buf_file = kmemtrace_remove_buf_file, - .subbuf_start = kmemtrace_subbuf_start, -}; - -static struct dentry *kmemtrace_dir; -static struct dentry *kmemtrace_overruns_dentry; -static struct dentry *kmemtrace_abi_version_dentry; - -static struct dentry *kmemtrace_enabled_dentry; - -static int kmemtrace_start_probes(void) -{ - int err; - - err = marker_probe_register("kmemtrace_alloc", "type_id %d " - "call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu " - "gfp_flags %lu node %d", - kmemtrace_probe_alloc, NULL); - if (err) - return err; - err = marker_probe_register("kmemtrace_free", "type_id %d " - "call_site %lu ptr %lu", - kmemtrace_probe_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - marker_probe_unregister("kmemtrace_alloc", - kmemtrace_probe_alloc, NULL); - marker_probe_unregister("kmemtrace_free", - kmemtrace_probe_free, NULL); -} - -static int kmemtrace_enabled_get(void *data, u64 *val) -{ - *val = *((int *) data); - - return 0; -} - -static int kmemtrace_enabled_set(void *data, u64 val) -{ - u64 old_val = kmemtrace_enabled; - - *((int *) data) = !!val; - - if (old_val == val) - return 0; - if (val) - kmemtrace_start_probes(); - else - kmemtrace_stop_probes(); - - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, - kmemtrace_enabled_get, - kmemtrace_enabled_set, "%llu\n"); - -static void kmemtrace_cleanup(void) -{ - if (kmemtrace_enabled_dentry) - debugfs_remove(kmemtrace_enabled_dentry); - - kmemtrace_stop_probes(); - - if (kmemtrace_abi_version_dentry) - debugfs_remove(kmemtrace_abi_version_dentry); - if (kmemtrace_overruns_dentry) - debugfs_remove(kmemtrace_overruns_dentry); - - relay_close(kmemtrace_chan); - kmemtrace_chan = NULL; - - if (kmemtrace_dir) - debugfs_remove(kmemtrace_dir); -} - -static int __init kmemtrace_setup_late(void) -{ - if (!kmemtrace_chan) - goto failed; - - kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); - if (!kmemtrace_dir) - goto cleanup; - - kmemtrace_abi_version_dentry = - debugfs_create_u32("abi_version", S_IRUSR, - kmemtrace_dir, &kmemtrace_abi_version); - kmemtrace_overruns_dentry = - debugfs_create_u32("total_overruns", S_IRUSR, - kmemtrace_dir, &kmemtrace_buf_overruns); - if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) - goto cleanup; - - kmemtrace_enabled_dentry = - debugfs_create_file("enabled", S_IRUSR | S_IWUSR, - kmemtrace_dir, &kmemtrace_enabled, - &kmemtrace_enabled_fops); - if (!kmemtrace_enabled_dentry) - goto cleanup; - - if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) - goto cleanup; - - printk(KERN_INFO "kmemtrace: fully up.\n"); - - return 0; - -cleanup: - kmemtrace_cleanup(); -failed: - return 1; -} -late_initcall(kmemtrace_setup_late); - -static int __init kmemtrace_set_boot_enabled(char *str) -{ - if (!str) - return -EINVAL; - - if (!strcmp(str, "yes")) - kmemtrace_enabled = 1; - else if (!strcmp(str, "no")) - kmemtrace_enabled = 0; - else - return -EINVAL; - - return 0; -} -early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); - -static int __init kmemtrace_set_subbufs(char *str) -{ - get_option(&str, &kmemtrace_n_subbufs); - return 0; -} -early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); - -void kmemtrace_init(void) -{ - if (!kmemtrace_n_subbufs) - kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; - - kmemtrace_chan = relay_open(NULL, NULL, 
KMEMTRACE_SUBBUF_SIZE, - kmemtrace_n_subbufs, &relay_callbacks, - NULL); - if (!kmemtrace_chan) { - printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); - return; - } - - if (!kmemtrace_enabled) { - printk(KERN_INFO "kmemtrace: disabled. Pass " - "kemtrace.enable=yes as kernel parameter for " - "boot-time tracing.\n"); - return; - } - if (kmemtrace_start_probes()) { - printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); - kmemtrace_cleanup(); - return; - } - - printk(KERN_INFO "kmemtrace: enabled.\n"); -} - -- cgit v1.2.3-59-g8ed1b From 3e80680208ba6ce9635ca7c21ad0019442ea166a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 6 Jan 2009 10:16:35 +0100 Subject: kmemtrace: add kmemtrace_init() Impact: build fix leftover from the relayfs version - but we want to keep it because this call is the earliest opportunity when we can start kmemtrace tracing. (after kmem_cache_init()). Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 2bfdcd326226..faaa5ae7e75a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -337,6 +337,11 @@ static struct tracer kmem_tracer __read_mostly = { .flags = &kmem_tracer_flags }; +void kmemtrace_init(void) +{ + /* earliest opportunity to start kmem tracing */ +} + static int __init init_kmem_tracer(void) { return register_tracer(&kmem_tracer); -- cgit v1.2.3-59-g8ed1b From 431aa3fbf5bbe3be79809c7e603c2ed2ac64b015 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 Jan 2009 12:43:01 -0500 Subject: ftrace: convert unsigned index to signed Impact: fix to unsigned compared to less than zero Roel Kluin pointed out that there is a compare of an unsigned number to less than zero. A previous clean up had the unsigned index set to -1 for certain cases, but never converted it to signed. Frederic Weisbecker noticed that another index is used to compare the above index to and it also needs to be converted to signed. [ Converted ftrace_page->index to int from unsigned long as Andrew Morton pointed out that there's no need for it to stay a long. ] Reported-by: Roel Kluin Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2f32969c09df..9e54a6ccdb93 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -289,7 +289,7 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; - unsigned long index; + int index; struct dyn_ftrace records[]; }; @@ -786,7 +786,7 @@ enum { struct ftrace_iterator { struct ftrace_page *pg; - unsigned idx; + int idx; unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; -- cgit v1.2.3-59-g8ed1b From ff288b274a9b383046fdbda4be3067daba4d5fe8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 6 Jan 2009 21:33:30 +0100 Subject: tracing/ftrace: fix a memory leak in stat tracing Impact: fix memory leak This patch fixes a memory leak inside reset_stat_list(). The freeing loop iterated only once. Also turn the stat_list into a simple struct list_head, which simplify the code and avoid an unused static pointer. 
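As a generic reminder of the rule behind the fix (a sketch, not code from this patch): when the loop body frees the current node, the plain list iterator would dereference freed memory to advance, so the _safe variant, which caches the next pointer before the body runs, must be used:

	struct item {
		struct list_head list;
	};
	struct item *pos, *n;

	/* buggy: advancing reads pos->list.next after pos was freed */
	list_for_each_entry(pos, &head, list)
		kfree(pos);

	/* correct: 'n' already holds the next node when 'pos' is freed */
	list_for_each_entry_safe(pos, n, &head, list)
		kfree(pos);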
Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6f194a33a64a..4cb4ff27646d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -21,7 +21,7 @@ struct trace_stat_list { void *stat; }; -static struct trace_stat_list stat_list; +static LIST_HEAD(stat_list); /* * This is a copy of the current tracer to avoid racy @@ -39,22 +39,12 @@ static DEFINE_MUTEX(stat_list_mutex); static void reset_stat_list(void) { - struct trace_stat_list *node; - struct list_head *next; + struct trace_stat_list *node, *next; - if (list_empty(&stat_list.list)) - return; - - node = list_entry(stat_list.list.next, struct trace_stat_list, list); - next = node->list.next; - - while (&node->list != next) { + list_for_each_entry_safe(node, next, &stat_list, list) kfree(node); - node = list_entry(next, struct trace_stat_list, list); - } - kfree(node); - INIT_LIST_HEAD(&stat_list.list); + INIT_LIST_HEAD(&stat_list); } void init_tracer_stat(struct tracer *trace) @@ -107,7 +97,7 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - list_add(&new_entry->list, &stat_list.list); + list_add(&new_entry->list, &stat_list); new_entry->stat = current_tracer.stat_start(); prev_stat = new_entry->stat; @@ -130,7 +120,7 @@ static int stat_seq_init(void) if (!new_entry->stat) break; - list_for_each_entry(iter_entry, &stat_list.list, list) { + list_for_each_entry(iter_entry, &stat_list, list) { /* Insertion with a descendent sorting */ if (current_tracer.stat_cmp(new_entry->stat, iter_entry->stat) > 0) { @@ -141,7 +131,7 @@ static int stat_seq_init(void) /* The current smaller value */ } else if (list_is_last(&iter_entry->list, - &stat_list.list)) { + &stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } @@ -162,7 +152,7 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct trace_stat_list *l = (struct trace_stat_list *)s->private; + struct list_head *l = (struct list_head *)s->private; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&stat_list_mutex); @@ -171,14 +161,14 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) if (!*pos && current_tracer.stat_headers) current_tracer.stat_headers(s); - return seq_list_start(&l->list, *pos); + return seq_list_start(l, *pos); } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct trace_stat_list *l = (struct trace_stat_list *)s->private; + struct list_head *l = (struct list_head *)s->private; - return seq_list_next(p, &l->list, pos); + return seq_list_next(p, l, pos); } static void stat_seq_stop(struct seq_file *m, void *p) @@ -188,8 +178,10 @@ static void stat_seq_stop(struct seq_file *m, void *p) static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); - return current_tracer.stat_show(s, l->stat); + struct trace_stat_list *entry = + list_entry(v, struct trace_stat_list, list); + + return current_tracer.stat_show(s, entry->stat); } static const struct seq_operations trace_stat_seq_ops = { @@ -237,7 +229,6 @@ static int __init tracing_stat_init(void) struct dentry *d_tracing; struct dentry *entry; - INIT_LIST_HEAD(&stat_list.list); d_tracing = tracing_init_dentry(); entry = 
debugfs_create_file("trace_stat", 0444, d_tracing, -- cgit v1.2.3-59-g8ed1b From e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 Jan 2009 22:02:35 -0500 Subject: trace: clean up funny line breaks in stat_seq_show Impact: clean up Andrew Morton pointed out that the entry assignment in stat_seq_show did not need to be done in the declaration, causing funny line breaks. This patch makes it a bit more pleasing on the eyes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 4cb4ff27646d..f110ce9ce7fb 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -178,8 +178,9 @@ static void stat_seq_stop(struct seq_file *m, void *p) static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *entry = - list_entry(v, struct trace_stat_list, list); + struct trace_stat_list *entry; + + entry = list_entry(v, struct trace_stat_list, list); return current_tracer.stat_show(s, entry->stat); } -- cgit v1.2.3-59-g8ed1b From 34a148bf0911a4a1cae85f8ecba57affb4d76aee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:09 -0800 Subject: kernel/trace/ring_buffer.c: reduce inlining text data bss dec hex filename before: 11320 228 8 11556 2d24 kernel/trace/ring_buffer.o after: 10592 228 8 10828 2a4c kernel/trace/ring_buffer.o Also: free_page(0) is legal. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b0daf0662ef..9542990f515b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -133,7 +133,7 @@ enum { }; /* inline for ring buffer fast paths */ -static inline unsigned +static unsigned rb_event_length(struct ring_buffer_event *event) { unsigned length; @@ -179,7 +179,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static inline void * +static void * rb_event_data(struct ring_buffer_event *event) { BUG_ON(event->type != RINGBUF_TYPE_DATA); @@ -229,10 +229,9 @@ static void rb_init_page(struct buffer_data_page *bpage) * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. 
*/ -static inline void free_buffer_page(struct buffer_page *bpage) +static void free_buffer_page(struct buffer_page *bpage) { - if (bpage->page) - free_page((unsigned long)bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } @@ -811,7 +810,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static inline int +static int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -825,7 +824,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static inline void +static void rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -850,7 +849,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, local_set(&cpu_buffer->commit_page->page->commit, index); } -static inline void +static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { /* @@ -896,7 +895,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; } -static inline void rb_inc_iter(struct ring_buffer_iter *iter) +static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -926,7 +925,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) * and with this, we can determine what to place into the * data field. */ -static inline void +static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { @@ -964,7 +963,7 @@ rb_update_event(struct ring_buffer_event *event, } } -static inline unsigned rb_calculate_event_length(unsigned length) +static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1438,7 +1437,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = cpu_buffer->head_page; -- cgit v1.2.3-59-g8ed1b From 67d347245f76a149c45bffb1a10145d31d61d1da Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:09 -0800 Subject: kernel/trace/ring_buffer.c: use DIV_ROUND_UP Instead of open-coding it. 
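For reference, DIV_ROUND_UP() is the generic round-up helper from include/linux/kernel.h; a minimal sketch of the identity this conversion relies on (illustration only, not part of the patch):

/* DIV_ROUND_UP(n, d) is integer division of n by d, rounded up: */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * For a power-of-two divisor such as RB_ALIGNMENT (4U), this matches the
 * old open-coded "(len + (RB_ALIGNMENT - 1)) >> RB_ALIGNMENT_SHIFT" form,
 * e.g. DIV_ROUND_UP(5, 4) == 2 and DIV_ROUND_UP(8, 4) == 2.
 */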
Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9542990f515b..4832ffa5d937 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) -#define RB_ALIGNMENT_SHIFT 2 -#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 enum { @@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event) case RINGBUF_TYPE_DATA: if (event->len) - length = event->len << RB_ALIGNMENT_SHIFT; + length = event->len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -937,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event, break; case RINGBUF_TYPE_TIME_EXTEND: - event->len = - (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); break; case RINGBUF_TYPE_TIME_STAMP: - event->len = - (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; case RINGBUF_TYPE_DATA: @@ -954,9 +949,7 @@ rb_update_event(struct ring_buffer_event *event, event->len = 0; event->array[0] = length; } else - event->len = - (length + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); -- cgit v1.2.3-59-g8ed1b From 034939b65ad5ff64b9709210b3469a95153c51a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 8 Jan 2009 10:03:56 -0800 Subject: tracing/ftrace: handle more than one stat file per tracer Impact: new API for tracers Make the stat tracing API reentrant. And also provide the new directory /debugfs/tracing/trace_stat which will contain all the stat files for the current active tracer. Now a tracer will, if desired, want to provide a zero terminated array of tracer_stat structures. Each one contains the callbacks necessary for one stat file. It have to provide at least a name for its stat file, an iterator with stat_start/start_next callback and an output callback for one stat entry. Also adapt the branch tracer to this new API. We create two files "all" and "annotated" inside the /debugfs/tracing/trace_stat directory, making the both stats simultaneously available instead of needing to change an option to switch from one stat file to another. The output of these stats haven't changed. Changes in v2: _ Apply the previous memory leak fix (rebase against tip/master) Changes in v3: _ Merge the patch that adapted the branch tracer to this Api in this patch to not break the kernel build. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 35 ++++--- kernel/trace/trace_branch.c | 69 ++++++------- kernel/trace/trace_stat.c | 230 ++++++++++++++++++++++++++++++++------------ 3 files changed, 217 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94ed45e93a80..b3f9ad1b4d84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,6 +334,25 @@ struct tracer_flags { /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for sorting (optional) for stats */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -361,21 +380,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - - /* - * If you change one of the following on tracing runtime, recall - * init_tracer_stat() - */ - - /* Iteration over statistic entries */ - void *(*stat_start)(void); - void *(*stat_next)(void *prev, int idx); - /* Compare two entries for sorting (optional) for stats */ - int (*stat_cmp)(void *p1, void *p2); - /* Print a stat entry */ - int (*stat_show)(struct seq_file *s, void *p); - /* Print the headers of your stat entries */ - int (*stat_headers)(struct seq_file *s); + struct tracer_stat *stats; }; struct trace_seq { diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4785a3b9bc4a..da5cf3e5581b 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -306,19 +306,6 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) } #ifdef CONFIG_PROFILE_ALL_BRANCHES -enum { - TRACE_BRANCH_OPT_ALL = 0x1 -}; - -static struct tracer_opt branch_opts[] = { - { TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) }, - { } -}; - -static struct tracer_flags branch_flags = { - .val = 0, - .opts = branch_opts -}; extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; @@ -352,28 +339,36 @@ all_branch_stat_next(void *v, int idx) return p; } -static int branch_set_flag(u32 old_flags, u32 bit, int set) -{ - if (bit == TRACE_BRANCH_OPT_ALL) { - if (set) { - branch_trace.stat_headers = all_branch_stat_headers; - branch_trace.stat_start = all_branch_stat_start; - branch_trace.stat_next = all_branch_stat_next; - branch_trace.stat_cmp = NULL; - } else { - branch_trace.stat_headers = - annotated_branch_stat_headers; - branch_trace.stat_start = annotated_branch_stat_start; - branch_trace.stat_next = annotated_branch_stat_next; - branch_trace.stat_cmp = annotated_branch_stat_cmp; - } - init_tracer_stat(&branch_trace); - } - return 0; -} +static struct tracer_stat branch_stats[] = { + {.name = "annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show}, 
+ {.name = "all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = branch_stat_show}, + + { } +}; +#else +static struct tracer_stat branch_stats[] = { + {.name = "annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show}, + + { } +}; #endif /* CONFIG_PROFILE_ALL_BRANCHES */ + static struct tracer branch_trace __read_mostly = { .name = "branch", @@ -383,16 +378,8 @@ static struct tracer branch_trace __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_branch, #endif /* CONFIG_FTRACE_SELFTEST */ -#endif /* CONFIG_BRANCH_TRACER */ - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_show = branch_stat_show, - .stat_headers = annotated_branch_stat_headers, - .stat_cmp = annotated_branch_stat_cmp, -#ifdef CONFIG_PROFILE_ALL_BRANCHES - .flags = &branch_flags, - .set_flag = branch_set_flag, #endif + .stats = branch_stats }; __init static int init_branch_trace(void) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f110ce9ce7fb..1515f9e7adfc 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -21,37 +21,87 @@ struct trace_stat_list { void *stat; }; -static LIST_HEAD(stat_list); - -/* - * This is a copy of the current tracer to avoid racy - * and dangerous output while the current tracer is - * switched. - */ -static struct tracer current_tracer; +/* A stat session is the stats output in one file */ +struct tracer_stat_session { + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; +}; -/* - * Protect both the current tracer and the global - * stat list. - */ -static DEFINE_MUTEX(stat_list_mutex); +/* All of the sessions currently in use. 
Each stat file embeed one session */ +static struct tracer_stat_session **all_stat_sessions; +static int nb_sessions; +static struct dentry *stat_dir, **stat_files; -static void reset_stat_list(void) +static void reset_stat_session(struct tracer_stat_session *session) { struct trace_stat_list *node, *next; - list_for_each_entry_safe(node, next, &stat_list, list) + list_for_each_entry_safe(node, next, &session->stat_list, list) kfree(node); - INIT_LIST_HEAD(&stat_list); + INIT_LIST_HEAD(&session->stat_list); } -void init_tracer_stat(struct tracer *trace) +/* Called when a tracer is initialized */ +static int init_all_sessions(int nb, struct tracer_stat *ts) { - mutex_lock(&stat_list_mutex); - current_tracer = *trace; - mutex_unlock(&stat_list_mutex); + int i, j; + struct tracer_stat_session *session; + + nb_sessions = 0; + + if (all_stat_sessions) { + for (i = 0; i < nb_sessions; i++) { + session = all_stat_sessions[i]; + reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); + } + } + all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb, + GFP_KERNEL); + if (!all_stat_sessions) + return -ENOMEM; + + for (i = 0; i < nb; i++) { + session = kmalloc(sizeof(struct tracer_stat_session) * nb, + GFP_KERNEL); + if (!session) + goto free_sessions; + + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->ts = &ts[i]; + all_stat_sessions[i] = session; + } + nb_sessions = nb; + return 0; + +free_sessions: + + for (j = 0; j < i; j++) + kfree(all_stat_sessions[i]); + + kfree(all_stat_sessions); + all_stat_sessions = NULL; + + return -ENOMEM; +} + +static int basic_tracer_stat_checks(struct tracer_stat *ts) +{ + int i; + + if (!ts) + return 0; + + for (i = 0; ts[i].name; i++) { + if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show) + return -EBUSY; + } + return i; } /* @@ -69,22 +119,19 @@ static int dummy_cmp(void *p1, void *p2) * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ -static int stat_seq_init(void) +static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; + struct tracer_stat *ts = session->ts; void *prev_stat; int ret = 0; int i; - mutex_lock(&stat_list_mutex); - reset_stat_list(); - - if (!current_tracer.stat_start || !current_tracer.stat_next || - !current_tracer.stat_show) - goto exit; + mutex_lock(&session->stat_mutex); + reset_stat_session(session); - if (!current_tracer.stat_cmp) - current_tracer.stat_cmp = dummy_cmp; + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; /* * The first entry. 
Actually this is the second, but the first @@ -97,9 +144,10 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - list_add(&new_entry->list, &stat_list); - new_entry->stat = current_tracer.stat_start(); + list_add(&new_entry->list, &session->stat_list); + + new_entry->stat = ts->stat_start(); prev_stat = new_entry->stat; /* @@ -114,15 +162,16 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = current_tracer.stat_next(prev_stat, i); + new_entry->stat = ts->stat_next(prev_stat, i); /* End of insertion */ if (!new_entry->stat) break; - list_for_each_entry(iter_entry, &stat_list, list) { + list_for_each_entry(iter_entry, &session->stat_list, list) { + /* Insertion with a descendent sorting */ - if (current_tracer.stat_cmp(new_entry->stat, + if (ts->stat_cmp(new_entry->stat, iter_entry->stat) > 0) { list_add_tail(&new_entry->list, @@ -131,7 +180,7 @@ static int stat_seq_init(void) /* The current smaller value */ } else if (list_is_last(&iter_entry->list, - &stat_list)) { + &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } @@ -140,49 +189,49 @@ static int stat_seq_init(void) prev_stat = new_entry->stat; } exit: - mutex_unlock(&stat_list_mutex); + mutex_unlock(&session->stat_mutex); return ret; exit_free_list: - reset_stat_list(); - mutex_unlock(&stat_list_mutex); + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); return ret; } static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct list_head *l = (struct list_head *)s->private; + struct tracer_stat_session *session = s->private; /* Prevent from tracer switch or stat_list modification */ - mutex_lock(&stat_list_mutex); + mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && current_tracer.stat_headers) - current_tracer.stat_headers(s); + if (!*pos && session->ts->stat_headers) + session->ts->stat_headers(s); - return seq_list_start(l, *pos); + return seq_list_start(&session->stat_list, *pos); } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct list_head *l = (struct list_head *)s->private; + struct tracer_stat_session *session = s->private; - return seq_list_next(p, l, pos); + return seq_list_next(p, &session->stat_list, pos); } -static void stat_seq_stop(struct seq_file *m, void *p) +static void stat_seq_stop(struct seq_file *s, void *p) { - mutex_unlock(&stat_list_mutex); + struct tracer_stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *entry; - - entry = list_entry(v, struct trace_stat_list, list); + struct tracer_stat_session *session = s->private; + struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); - return current_tracer.stat_show(s, entry->stat); + return session->ts->stat_show(s, l->stat); } static const struct seq_operations trace_stat_seq_ops = { @@ -192,15 +241,18 @@ static const struct seq_operations trace_stat_seq_ops = { .show = stat_seq_show }; +/* The session stat is refilled and resorted at each stat file opening */ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; + struct tracer_stat_session *session = inode->i_private; + ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { struct seq_file *m = file->private_data; - m->private = &stat_list; - ret = stat_seq_init(); + m->private = session; + ret = stat_seq_init(session); } return ret; @@ -212,9 +264,12 @@ static 
int tracing_stat_open(struct inode *inode, struct file *file) */ static int tracing_stat_release(struct inode *i, struct file *f) { - mutex_lock(&stat_list_mutex); - reset_stat_list(); - mutex_unlock(&stat_list_mutex); + struct tracer_stat_session *session = i->i_private; + + mutex_lock(&session->stat_mutex); + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return 0; } @@ -225,17 +280,70 @@ static const struct file_operations tracing_stat_fops = { .release = tracing_stat_release }; + +static void destroy_trace_stat_files(void) +{ + int i; + + if (stat_files) { + for (i = 0; i < nb_sessions; i++) + debugfs_remove(stat_files[i]); + kfree(stat_files); + stat_files = NULL; + } +} + +static void init_trace_stat_files(void) +{ + int i; + + if (!stat_dir || !nb_sessions) + return; + + stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL); + + if (!stat_files) { + pr_warning("trace stat: not enough memory\n"); + return; + } + + for (i = 0; i < nb_sessions; i++) { + struct tracer_stat_session *session = all_stat_sessions[i]; + stat_files[i] = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!stat_files[i]) + pr_warning("cannot create %s entry\n", + session->ts->name); + } +} + +void init_tracer_stat(struct tracer *trace) +{ + int nb = basic_tracer_stat_checks(trace->stats); + + destroy_trace_stat_files(); + + if (nb < 0) { + pr_warning("stat tracing: missing stat callback on %s\n", + trace->name); + return; + } + if (!nb) + return; + + init_all_sessions(nb, trace->stats); + init_trace_stat_files(); +} + static int __init tracing_stat_init(void) { struct dentry *d_tracing; - struct dentry *entry; d_tracing = tracing_init_dentry(); - entry = debugfs_create_file("trace_stat", 0444, d_tracing, - NULL, - &tracing_stat_fops); - if (!entry) + stat_dir = debugfs_create_dir("trace_stat", d_tracing); + if (!stat_dir) pr_warning("Could not create debugfs " "'trace_stat' entry\n"); return 0; -- cgit v1.2.3-59-g8ed1b From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:23:51 +0200 Subject: trace: mmiotrace to the tracer menu in Kconfig Impact: cosmetic change in Kconfig menu layout This patch was originally suggested by Peter Zijlstra, but seems it was forgotten. CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable directly under the Kernel hacking / debugging menu in the kernel configuration system. They were present only for x86 and x86_64. Other tracers that use the ftrace tracing framework are in their own sub-menu. This patch moves the mmiotrace configuration options there. Since the Kconfig file, where the tracer menu is, is not architecture specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by x86/x86_64. CONFIG_MMIOTRACE now depends on it. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 24 ++---------------------- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd052..e1983fa025d2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. 
-config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1c0b7504cab3..944239296f13 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu -- cgit v1.2.3-59-g8ed1b From 173ed24ee2d64f5de28654eb456ec1ee18a142e5 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 6 Jan 2009 13:57:11 +0200 Subject: mmiotrace: count events lost due to not recording Impact: enhances lost events counting in mmiotrace The tracing framework, or the ring buffer facility it uses, has a switch to stop recording data. When recording is off, the trace events will be lost. The framework does not count these, so mmiotrace has to count them itself. 
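The mechanism boils down to a count-and-drain pattern; a simplified sketch of the idea (assuming an atomic_t counter, not the literal patch below):

static atomic_t dropped_count = ATOMIC_INIT(0);

/* producer side: the ring buffer refused the event, remember the loss */
static void note_dropped_event(void)
{
	atomic_inc(&dropped_count);
}

/*
 * reporting side: read and clear in one step, so drops that race with
 * the report are not lost
 */
static unsigned long drain_dropped_count(void)
{
	return atomic_xchg(&dropped_count, 0);
}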
Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fcec59ff2355..621c8c3f3139 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -20,6 +21,7 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; static unsigned long prev_overruns; +static atomic_t dropped_count; static void mmio_reset_data(struct trace_array *tr) { @@ -122,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - unsigned long cnt = 0; + unsigned long cnt = atomic_xchg(&dropped_count, 0); unsigned long over = ring_buffer_overruns(iter->tr->buffer); if (over > prev_overruns) - cnt = over - prev_overruns; + cnt += over - prev_overruns; prev_overruns = over; return cnt; } @@ -308,8 +310,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; @@ -336,8 +340,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; -- cgit v1.2.3-59-g8ed1b From 25aac9dc7c8c73798c1be8aa36141f980d32579e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:40 +0800 Subject: ftrace, ia64: explictly ignore a file in recordmcount.pl In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. MCOUNT_ADDR is determined at link time not compile time, so explictly ignore kernel/trace/ftrace.o in recordmcount.pl. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 10 +--------- scripts/recordmcount.pl | 5 +++++ 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e54a6ccdb93..76bb884b6e16 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -263,14 +263,6 @@ static void ftrace_update_pid_func(void) # error Dynamic ftrace depends on MCOUNT_RECORD #endif -/* - * Since MCOUNT_ADDR may point to mcount itself, we do not want - * to get it confused by reading a reference in the code as we - * are parsing on objcopy output of text. Use a variable for - * it instead. 
- */ -static unsigned long mcount_addr = MCOUNT_ADDR; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -575,7 +567,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; - ret = ftrace_make_nop(mod, rec, mcount_addr); + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 282485a22bf8..070042b4ccd5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -109,6 +109,11 @@ if ($#ARGV < 7) { my ($arch, $bits, $objdump, $objcopy, $cc, $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; +# This file refers to mcount and shouldn't be ftraced, so lets' ignore it +if ($inputfile eq "kernel/trace/ftrace.o") { + exit(0); +} + # Acceptable sections to record. my %text_sections = ( ".text" => 1, -- cgit v1.2.3-59-g8ed1b From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:42 +0800 Subject: ftrace, ia64: Add macro for ftrace_caller Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..054721487574 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +#ifndef FTRACE_ADDR +#define FTRACE_ADDR ((unsigned long)ftrace_caller) +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76bb884b6e16..9f536108d3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; - ftrace_addr = (unsigned long)ftrace_caller; + ftrace_addr = (unsigned long)FTRACE_ADDR; ip = rec->ip; -- cgit v1.2.3-59-g8ed1b From 002bb86d8d42f18937aef396c3ecd65c7e02e21a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 10 Jan 2009 11:34:13 -0800 Subject: tracing/ftrace: separate events tracing and stats tracing engine Impact: tracing's Api change Currently, the stat tracing depends on the events tracing. When you switch to a new tracer, the stats files of the previous tracer will disappear. But it's more scalable to separate those two engines. This way, we can keep the stat files of one or several tracers when we want, without bothering of multiple tracer stat files or tracer switching. To build/destroys its stats files, a tracer just have to call register_stat_tracer/unregister_stat_tracer everytimes it wants to. 
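As a rough usage sketch (a hypothetical "foo" stat file, not taken from this patch), a tracer now provides one struct tracer_stat per file it wants under trace_stat/ and registers it whenever it likes; stat_cmp and stat_headers stay optional:

static unsigned long foo_counts[3];

static void *foo_stat_start(void)
{
	return &foo_counts[0];
}

static void *foo_stat_next(void *prev, int idx)
{
	if (idx >= ARRAY_SIZE(foo_counts))
		return NULL;
	return &foo_counts[idx];
}

static int foo_stat_show(struct seq_file *s, void *p)
{
	return seq_printf(s, "%lu\n", *(unsigned long *)p);
}

static struct tracer_stat foo_stats = {
	.name		= "foo",
	.stat_start	= foo_stat_start,
	.stat_next	= foo_stat_next,
	.stat_show	= foo_stat_show,
};

static int __init foo_stat_init(void)
{
	return register_stat_tracer(&foo_stats);
}
fs_initcall(foo_stat_init);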
Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 - kernel/trace/trace.h | 20 ----- kernel/trace/trace_branch.c | 108 ++++++++++++++----------- kernel/trace/trace_stat.c | 191 +++++++++++++++++++------------------------- kernel/trace/trace_stat.h | 31 +++++++ 5 files changed, 172 insertions(+), 180 deletions(-) create mode 100644 kernel/trace/trace_stat.h (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0418fc338b5c..40217fb499ea 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2353,7 +2353,6 @@ static int tracing_set_tracer(char *buf) if (ret) goto out; } - init_tracer_stat(t); trace_branch_enable(tr); out: @@ -3218,7 +3217,6 @@ __init static int tracer_alloc_buffers(void) #else current_trace = &nop_trace; #endif - init_tracer_stat(current_trace); /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b3f9ad1b4d84..79c872100dd5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,24 +334,6 @@ struct tracer_flags { /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b -/* - * If you want to provide a stat file (one-shot statistics), fill - * an iterator with stat_start/stat_next and a stat_show callbacks. - * The others callbacks are optional. - */ -struct tracer_stat { - /* The name of your stat file */ - const char *name; - /* Iteration over statistic entries */ - void *(*stat_start)(void); - void *(*stat_next)(void *prev, int idx); - /* Compare two entries for sorting (optional) for stats */ - int (*stat_cmp)(void *p1, void *p2); - /* Print a stat entry */ - int (*stat_show)(struct seq_file *s, void *p); - /* Print the headers of your stat entries */ - int (*stat_headers)(struct seq_file *s); -}; /* * A specific tracer, represented by methods that operate on a trace array: @@ -466,8 +448,6 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); -void init_tracer_stat(struct tracer *trace); - extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index da5cf3e5581b..ca017e0a9a27 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -16,12 +16,12 @@ #include #include "trace.h" +#include "trace_stat.h" #include "trace_output.h" -static struct tracer branch_trace; - #ifdef CONFIG_BRANCH_TRACER +static struct tracer branch_trace; static int branch_tracing_enabled __read_mostly; static DEFINE_MUTEX(branch_tracing_mutex); @@ -191,6 +191,30 @@ static struct trace_event trace_branch_event = { .binary = trace_nop_print, }; +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ +}; + +__init static int init_branch_tracer(void) +{ + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } + return register_tracer(&branch_trace); +} +device_initcall(init_branch_tracer); + #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) @@ -305,6 +329,29 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) 
return 0; } +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int init_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(init_annotated_branch_stats); + #ifdef CONFIG_PROFILE_ALL_BRANCHES extern unsigned long __start_branch_profile[]; @@ -339,60 +386,25 @@ all_branch_stat_next(void *v, int idx) return p; } -static struct tracer_stat branch_stats[] = { - {.name = "annotated", - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_cmp = annotated_branch_stat_cmp, - .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show}, - - {.name = "all", +static struct tracer_stat all_branch_stats = { + .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show}, - - { } -}; -#else -static struct tracer_stat branch_stats[] = { - {.name = "annotated", - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_cmp = annotated_branch_stat_cmp, - .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show}, - - { } + .stat_show = branch_stat_show }; -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ - -static struct tracer branch_trace __read_mostly = +__init static int all_annotated_branch_stats(void) { - .name = "branch", -#ifdef CONFIG_BRANCH_TRACER - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif /* CONFIG_FTRACE_SELFTEST */ -#endif - .stats = branch_stats -}; - -__init static int init_branch_trace(void) -{ -#ifdef CONFIG_BRANCH_TRACER int ret; - ret = register_ftrace_event(&trace_branch_event); + + ret = register_stat_tracer(&all_branch_stats); if (!ret) { - printk(KERN_WARNING "Warning: could not register branch events\n"); + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); return 1; } -#endif - - return register_tracer(&branch_trace); + return 0; } -device_initcall(init_branch_trace); +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 1515f9e7adfc..cb29282b9485 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,28 +10,32 @@ #include -#include #include +#include "trace_stat.h" #include "trace.h" /* List of stat entries from a tracer */ struct trace_stat_list { - struct list_head list; - void *stat; + struct list_head list; + void *stat; }; /* A stat session is the stats output in one file */ struct tracer_stat_session { - struct tracer_stat *ts; - struct list_head stat_list; - struct mutex stat_mutex; + struct list_head session_list; + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; + struct dentry *file; }; /* All of the sessions currently in use. 
Each stat file embeed one session */ -static struct tracer_stat_session **all_stat_sessions; -static int nb_sessions; -static struct dentry *stat_dir, **stat_files; +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; static void reset_stat_session(struct tracer_stat_session *session) @@ -44,66 +48,77 @@ static void reset_stat_session(struct tracer_stat_session *session) INIT_LIST_HEAD(&session->stat_list); } -/* Called when a tracer is initialized */ -static int init_all_sessions(int nb, struct tracer_stat *ts) +static void destroy_session(struct tracer_stat_session *session) { - int i, j; - struct tracer_stat_session *session; + debugfs_remove(session->file); + reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} - nb_sessions = 0; - if (all_stat_sessions) { - for (i = 0; i < nb_sessions; i++) { - session = all_stat_sessions[i]; - reset_stat_session(session); - mutex_destroy(&session->stat_mutex); - kfree(session); - } - } - all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb, - GFP_KERNEL); - if (!all_stat_sessions) - return -ENOMEM; +static int init_stat_file(struct tracer_stat_session *session); - for (i = 0; i < nb; i++) { - session = kmalloc(sizeof(struct tracer_stat_session) * nb, - GFP_KERNEL); - if (!session) - goto free_sessions; +int register_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *session, *node, *tmp; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; - INIT_LIST_HEAD(&session->stat_list); - mutex_init(&session->stat_mutex); - session->ts = &ts[i]; - all_stat_sessions[i] = session; + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) + return -EINVAL; } - nb_sessions = nb; - return 0; + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + if (!session) + return -ENOMEM; -free_sessions: + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->file = NULL; - for (j = 0; j < i; j++) - kfree(all_stat_sessions[i]); + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } - kfree(all_stat_sessions); - all_stat_sessions = NULL; + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); - return -ENOMEM; + return 0; } -static int basic_tracer_stat_checks(struct tracer_stat *ts) +void unregister_stat_tracer(struct tracer_stat *trace) { - int i; + struct tracer_stat_session *node, *tmp; - if (!ts) - return 0; - - for (i = 0; ts[i].name; i++) { - if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show) - return -EBUSY; + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } } - return i; + mutex_unlock(&all_stat_sessions_mutex); } + /* * For tracers that don't provide a stat_cmp callback. 
* This one will force an immediate insertion on tail of @@ -280,63 +295,7 @@ static const struct file_operations tracing_stat_fops = { .release = tracing_stat_release }; - -static void destroy_trace_stat_files(void) -{ - int i; - - if (stat_files) { - for (i = 0; i < nb_sessions; i++) - debugfs_remove(stat_files[i]); - kfree(stat_files); - stat_files = NULL; - } -} - -static void init_trace_stat_files(void) -{ - int i; - - if (!stat_dir || !nb_sessions) - return; - - stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL); - - if (!stat_files) { - pr_warning("trace stat: not enough memory\n"); - return; - } - - for (i = 0; i < nb_sessions; i++) { - struct tracer_stat_session *session = all_stat_sessions[i]; - stat_files[i] = debugfs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); - if (!stat_files[i]) - pr_warning("cannot create %s entry\n", - session->ts->name); - } -} - -void init_tracer_stat(struct tracer *trace) -{ - int nb = basic_tracer_stat_checks(trace->stats); - - destroy_trace_stat_files(); - - if (nb < 0) { - pr_warning("stat tracing: missing stat callback on %s\n", - trace->name); - return; - } - if (!nb) - return; - - init_all_sessions(nb, trace->stats); - init_trace_stat_files(); -} - -static int __init tracing_stat_init(void) +static int tracing_stat_init(void) { struct dentry *d_tracing; @@ -348,4 +307,16 @@ static int __init tracing_stat_init(void) "'trace_stat' entry\n"); return 0; } -fs_initcall(tracing_stat_init); + +static int init_stat_file(struct tracer_stat_session *session) +{ + if (!stat_dir && tracing_stat_init()) + return -ENODEV; + + session->file = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 000000000000..202274cf7f3d --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,31 @@ +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ -- cgit v1.2.3-59-g8ed1b From e1d8aa9f1dd655a3534b22fcfbecb70cdb125766 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Jan 2009 23:15:46 +0100 Subject: tracing: add a new workqueue tracer Impact: new tracer The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the works inserted and executed since their creation. It can help to evaluate the amount of work each of them have to perform. For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. 
It only traces statistical informations for now but it will probably later provide event tracing too. Such a tracer could help too, and be improved, to help rt priority sorted workqueue development. To have a snapshot of the workqueues state at any time, just do cat /debugfs/tracing/trace_stat/workqueues Ie: 1 125 125 reiserfs/1 1 0 0 scsi_tgtd/1 1 0 0 aio/1 1 0 0 ata/1 1 114 114 kblockd/1 1 0 0 kintegrityd/1 1 2147 2147 events/1 0 0 0 kpsmoused 0 105 105 reiserfs/0 0 0 0 scsi_tgtd/0 0 0 0 aio/0 0 0 0 ata_aux 0 0 0 ata/0 0 0 0 cqueue 0 0 0 kacpi_notify 0 0 0 kacpid 0 149 149 kblockd/0 0 0 0 kintegrityd/0 0 1000 1000 khelper 0 2270 2270 events/0 Changes in V2: _ Drop the static array based on NR_CPU and dynamically allocate the stat array with num_possible_cpus() and other cpu mask facilities.... _ Trace workqueue insertion at a bit lower level (insert_work instead of queue_work) to handle even the workqueue barriers. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/trace/workqueue.h | 25 ++++ kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_workqueue.c | 287 +++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 16 ++- 5 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 include/trace/workqueue.h create mode 100644 kernel/trace/trace_workqueue.c (limited to 'kernel') diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h new file mode 100644 index 000000000000..867829df4571 --- /dev/null +++ b/include/trace/workqueue.h @@ -0,0 +1,25 @@ +#ifndef __TRACE_WORKQUEUE_H +#define __TRACE_WORKQUEUE_H + +#include +#include +#include + +DECLARE_TRACE(workqueue_insertion, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +DECLARE_TRACE(workqueue_execution, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +/* Trace the creation of one workqueue thread on a cpu */ +DECLARE_TRACE(workqueue_creation, + TPPROTO(struct task_struct *wq_thread, int cpu), + TPARGS(wq_thread, cpu)); + +DECLARE_TRACE(workqueue_destruction, + TPPROTO(struct task_struct *wq_thread), + TPARGS(wq_thread)); + +#endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 944239296f13..dde1d46f77e5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -284,6 +284,17 @@ config KMEMTRACE If unsure, say N. +config WORKQUEUE_TRACER + bool "Trace workqueues" + select TRACING + help + The workqueue tracer provides some statistical informations + about each cpu workqueue thread such as the number of the + works inserted and executed since their creation. It can help + to evaluate the amount of work each of them have to perform. + For example it can help a developer to decide whether he should + choose a per cpu workqueue instead of a singlethreaded one. 
+ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 05c9182061de..f76d48f3527d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -36,5 +36,6 @@ obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o +obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c new file mode 100644 index 000000000000..f8118d39ca9b --- /dev/null +++ b/kernel/trace/trace_workqueue.c @@ -0,0 +1,287 @@ +/* + * Workqueue statistical tracer. + * + * Copyright (C) 2008 Frederic Weisbecker + * + */ + + +#include +#include +#include "trace_stat.h" +#include "trace.h" + + +/* A cpu workqueue thread */ +struct cpu_workqueue_stats { + struct list_head list; +/* Useful to know if we print the cpu headers */ + bool first_entry; + int cpu; + pid_t pid; +/* Can be inserted from interrupt or user context, need to be atomic */ + atomic_t inserted; +/* + * Don't need to be atomic, works are serialized in a single workqueue thread + * on a single CPU. + */ + unsigned int executed; +}; + +/* List of workqueue threads on one cpu */ +struct workqueue_global_stats { + struct list_head list; + spinlock_t lock; +}; + +/* Don't need a global lock because allocated before the workqueues, and + * never freed. + */ +static struct workqueue_global_stats *all_workqueue_stat; + +/* Insertion of a work */ +static void +probe_workqueue_insertion(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + atomic_inc(&node->inserted); + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Execution of a work */ +static void +probe_workqueue_execution(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + node->executed++; + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Creation of a cpu workqueue thread */ +static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +{ + struct cpu_workqueue_stats *cws; + unsigned long flags; + + WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + + /* Workqueues are sometimes created in atomic context */ + cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); + if (!cws) { + pr_warning("trace_workqueue: not enough memory\n"); + return; + } + tracing_record_cmdline(wq_thread); + + INIT_LIST_HEAD(&cws->list); + cws->cpu = cpu; + + cws->pid = wq_thread->pid; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_empty(&all_workqueue_stat[cpu].list)) + cws->first_entry = true; + list_add_tail(&cws->list, &all_workqueue_stat[cpu].list); + 
spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Destruction of a cpu workqueue thread */ +static void probe_workqueue_destruction(struct task_struct *wq_thread) +{ + /* Workqueue only execute on one cpu */ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + list_del(&node->list); + kfree(node); + goto found; + } + } + + pr_debug("trace_workqueue: don't find workqueue to destroy\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + +} + +static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) +{ + unsigned long flags; + struct cpu_workqueue_stats *ret = NULL; + + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + + if (!list_empty(&all_workqueue_stat[cpu].list)) + ret = list_entry(all_workqueue_stat[cpu].list.next, + struct cpu_workqueue_stats, list); + + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return ret; +} + +static void *workqueue_stat_start(void) +{ + int cpu; + void *ret = NULL; + + for_each_possible_cpu(cpu) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; +} + +static void *workqueue_stat_next(void *prev, int idx) +{ + struct cpu_workqueue_stats *prev_cws = prev; + int cpu = prev_cws->cpu; + unsigned long flags; + void *ret = NULL; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) { + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + for (++cpu ; cpu < num_possible_cpus(); cpu++) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; + } + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, + list); +} + +static int workqueue_stat_show(struct seq_file *s, void *p) +{ + struct cpu_workqueue_stats *cws = p; + unsigned long flags; + int cpu = cws->cpu; + + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), + cws->executed, + trace_find_cmdline(cws->pid)); + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (&cws->list == all_workqueue_stat[cpu].list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return 0; +} + +static int workqueue_stat_headers(struct seq_file *s) +{ + seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); + seq_printf(s, "# | | | |\n\n"); + return 0; +} + +struct tracer_stat workqueue_stats __read_mostly = { + .name = "workqueues", + .stat_start = workqueue_stat_start, + .stat_next = workqueue_stat_next, + .stat_show = workqueue_stat_show, + .stat_headers = workqueue_stat_headers +}; + + +int __init stat_workqueue_init(void) +{ + if (register_stat_tracer(&workqueue_stats)) { + pr_warning("Unable to register workqueue stat tracer\n"); + return 1; + } + + return 0; +} +fs_initcall(stat_workqueue_init); + +/* + * Workqueues are created very early, just after pre-smp initcalls. + * So we must register our tracepoints at this stage. 
+ */ +int __init trace_workqueue_early_init(void) +{ + int ret, cpu; + + ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + if (ret) + goto out; + + ret = register_trace_workqueue_execution(probe_workqueue_execution); + if (ret) + goto no_insertion; + + ret = register_trace_workqueue_creation(probe_workqueue_creation); + if (ret) + goto no_execution; + + ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + if (ret) + goto no_creation; + + all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats) + * num_possible_cpus(), GFP_KERNEL); + + if (!all_workqueue_stat) { + pr_warning("trace_workqueue: not enough memory\n"); + goto no_creation; + } + + for_each_possible_cpu(cpu) { + spin_lock_init(&all_workqueue_stat[cpu].lock); + INIT_LIST_HEAD(&all_workqueue_stat[cpu].list); + } + + return 0; + +no_creation: + unregister_trace_workqueue_creation(probe_workqueue_creation); +no_execution: + unregister_trace_workqueue_execution(probe_workqueue_execution); +no_insertion: + unregister_trace_workqueue_insertion(probe_workqueue_insertion); +out: + pr_warning("trace_workqueue: unable to trace workqueues\n"); + + return 1; +} +early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f445833ae37..1fc2bc20603f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } +DEFINE_TRACE(workqueue_insertion); + static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { + trace_workqueue_insertion(cwq->thread, work); + set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the @@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +DEFINE_TRACE(workqueue_execution); + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) */ struct lockdep_map lockdep_map = work->lockdep_map; #endif - + trace_workqueue_execution(cwq->thread, work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } +DEFINE_TRACE(workqueue_creation); + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; + trace_workqueue_creation(cwq->thread, cpu); + return 0; } @@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); +DEFINE_TRACE(workqueue_destruction); + static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* @@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. 
*/ + trace_workqueue_destruction(cwq->thread); kthread_stop(cwq->thread); cwq->thread = NULL; } -- cgit v1.2.3-59-g8ed1b From 32632920a788fb13da35b131b77cc4324c38c1c5 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Mon, 12 Jan 2009 23:35:50 +0100 Subject: ftrace, trivial: fix typo "resgister" -> "register" Signed-off-by: Uwe Kleine-Koenig Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9f536108d3f3..8c1c9c0f4775 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1894,7 +1894,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling. * @ops - ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. -- cgit v1.2.3-59-g8ed1b From 428aee1460a75197f0190534b4d610450ee59af1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 12:24:42 -0500 Subject: trace: print ftrace_dump at KERN_EMERG log level Impact: fix to print out ftrace_dump when expected I was debugging a hard race condition to only find out that after I hit the race, my log level was not at level to show KERN_INFO. The time it took to trigger the race was wasted because I did not capture the trace. Since ftrace_dump is only called from kernel oops (and only when it is set in the kernel command line to do so), or when a developer adds it to their own local tree, the log level of the print should be at KERN_EMERG to make sure the print appears. ftrace_dump is not called by a normal user setup, and will not add extra unwanted print out to the console. There is no reason it should be at KERN_INFO. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 40217fb499ea..408c03f2b8a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3076,7 +3076,7 @@ static struct notifier_block trace_die_notifier = { * it if we decide to change what log level the ftrace dump * should be at. */ -#define KERN_TRACE KERN_INFO +#define KERN_TRACE KERN_EMERG static void trace_printk_seq(struct trace_seq *s) -- cgit v1.2.3-59-g8ed1b From 6f3b34402e7282cde49dff395d7ea462bf33bf50 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Jan 2009 11:06:18 +0800 Subject: ring_buffer: reset write when reserve buffer fail Impact: reset struct buffer_page.write when interrupt storm if struct buffer_page.write is not reset, any succedent committing will corrupted ring_buffer: static inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { ...... cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; ...... } when "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", ring_buffer is disabled, but some reserved buffers may haven't been committed. we need reset struct buffer_page.write. when "if (unlikely(next_page == cpu_buffer->commit_page))", ring_buffer is still available, we should not corrupt it. 
Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4832ffa5d937..0b9de5a3d699 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1017,12 +1017,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) { - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_unlock; - } /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1097,6 +1093,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; -- cgit v1.2.3-59-g8ed1b From 0ee6b6cf5bdb793b4c68507dd65adf16341aa4ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 14:50:19 -0500 Subject: trace: stop all recording to ring buffer on ftrace_dump Impact: limit ftrace dump output Currently ftrace_dump only calls ftrace_kill that is a fast way to prevent the function tracer functions from being called (just sets a flag and clears the function to call, nothing else). It is better to also turn off any recording to the ring buffers as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 408c03f2b8a9..dcb757f70d21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3110,6 +3110,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ + tracing_off(); ftrace_kill(); for_each_tracing_cpu(cpu) { -- cgit v1.2.3-59-g8ed1b From 4a2b8dda3f8705880ec7408135645602d5590f51 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Jan 2009 13:33:27 -0800 Subject: tracing/function-graph-tracer: fix a regression while suspend to disk Impact: fix a crash while kernel image restore When the function graph tracer is running and while suspend to disk, some racy and dangerous things happen against this tracer. The current task will save its registers including the stack pointer which contains the return address hooked by the tracer. But the current task will continue to enter other functions after that to save the memory, and then it will store other return addresses, and finally loose the old depth which matches the return address saved in the old stack (during the registers saving). So on image restore, the code will return to wrong addresses. And there are other things: on restore, the task will have it's "current" pointer overwritten during registers restoring....switching from one task to another... That would be insane to try to trace function graphs at these stages. This patch makes the function graph tracer listening on power events, making it's tracing disabled for the current task (the one that performs the hibernation work) while suspend/resume to disk, making the tracing safe during hibernation. 
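For reference, the pause_graph_tracing()/unpause_graph_tracing() calls used by the
notifier below rely on a per-task counter; a minimal sketch (assuming the
include/linux/ftrace.h helpers of this era) looks roughly like:

	/* illustrative sketch, not part of this patch */
	static inline void pause_graph_tracing(void)
	{
		atomic_inc(&current->tracing_graph_pause);
	}

	static inline void unpause_graph_tracing(void)
	{
		atomic_dec(&current->tracing_graph_pause);
	}

The return-address hook checks this counter and skips recording while it is
non-zero, so only the task performing the hibernation work has its graph
tracing paused; other tasks keep tracing normally.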
Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c1c9c0f4775..7e9a20b69939 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1957,6 +1958,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_graph_active; +static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -2035,6 +2037,27 @@ static int start_graph_tracing(void) return ret; } +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -2042,6 +2065,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { @@ -2067,6 +2093,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); mutex_unlock(&ftrace_sysctl_lock); } -- cgit v1.2.3-59-g8ed1b From 42fab4b2cdc02cf28e2474ccfd75bc9225076590 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 09:30:52 +0800 Subject: tracing/ftrace: add missing unlock in register_stat_tracer() We should unlock all_stat_sessions_mutex before returning failure. Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index cb29282b9485..2110cea2ece3 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,8 +73,10 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? 
*/ mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; + } } mutex_unlock(&all_stat_sessions_mutex); -- cgit v1.2.3-59-g8ed1b From 55922173f1f63903b6de03711ab8ff980cbe58d2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Jan 2009 11:31:21 +0100 Subject: tracing: trace_stat.c cleanup Impact: cleanup - whitespace / code alignment cleanups - avoid unnecessary forward prototype by reordering functions Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 147 ++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 2110cea2ece3..eae9cef39291 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -17,16 +17,16 @@ /* List of stat entries from a tracer */ struct trace_stat_list { - struct list_head list; - void *stat; + struct list_head list; + void *stat; }; /* A stat session is the stats output in one file */ struct tracer_stat_session { struct list_head session_list; - struct tracer_stat *ts; - struct list_head stat_list; - struct mutex stat_mutex; + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; struct dentry *file; }; @@ -35,7 +35,7 @@ static LIST_HEAD(all_stat_sessions); static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ -static struct dentry *stat_dir; +static struct dentry *stat_dir; static void reset_stat_session(struct tracer_stat_session *session) @@ -56,71 +56,6 @@ static void destroy_session(struct tracer_stat_session *session) kfree(session); } - -static int init_stat_file(struct tracer_stat_session *session); - -int register_stat_tracer(struct tracer_stat *trace) -{ - struct tracer_stat_session *session, *node, *tmp; - int ret; - - if (!trace) - return -EINVAL; - - if (!trace->stat_start || !trace->stat_next || !trace->stat_show) - return -EINVAL; - - /* Already registered? */ - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) { - mutex_unlock(&all_stat_sessions_mutex); - return -EINVAL; - } - } - mutex_unlock(&all_stat_sessions_mutex); - - /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); - if (!session) - return -ENOMEM; - - session->ts = trace; - INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); - mutex_init(&session->stat_mutex); - session->file = NULL; - - ret = init_stat_file(session); - if (ret) { - destroy_session(session); - return ret; - } - - /* Register */ - mutex_lock(&all_stat_sessions_mutex); - list_add_tail(&session->session_list, &all_stat_sessions); - mutex_unlock(&all_stat_sessions_mutex); - - return 0; -} - -void unregister_stat_tracer(struct tracer_stat *trace) -{ - struct tracer_stat_session *node, *tmp; - - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) { - list_del(&node->session_list); - destroy_session(node); - break; - } - } - mutex_unlock(&all_stat_sessions_mutex); -} - - /* * For tracers that don't provide a stat_cmp callback. 
* This one will force an immediate insertion on tail of @@ -252,10 +187,10 @@ static int stat_seq_show(struct seq_file *s, void *v) } static const struct seq_operations trace_stat_seq_ops = { - .start = stat_seq_start, - .next = stat_seq_next, - .stop = stat_seq_stop, - .show = stat_seq_show + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show }; /* The session stat is refilled and resorted at each stat file opening */ @@ -275,7 +210,6 @@ static int tracing_stat_open(struct inode *inode, struct file *file) return ret; } - /* * Avoid consuming memory with our now useless list. */ @@ -322,3 +256,64 @@ static int init_stat_file(struct tracer_stat_session *session) return -ENOMEM; return 0; } + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *session, *node, *tmp; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); + return -EINVAL; + } + } + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->file = NULL; + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } + + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); + + return 0; +} + +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} -- cgit v1.2.3-59-g8ed1b From 6c1a99afbda99cd8d8c69d756387041567a13d87 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 15 Jan 2009 18:05:40 +0800 Subject: ftrace: fix trace_output Impact: fix bug for handling partial line trace_seq_printf(), seq_print_userip_objs(), ... return 0 -- partial line was written other(>0) -- success duplicate output is also removed in trace_print_raw(). 
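A minimal sketch of the corrected calling convention (taken from the hunks
below, nothing new is assumed):

	/*
	 * trace_seq_printf() and friends return 0 when only a partial line
	 * could be written, and non-zero on success, so error paths must
	 * test the negated return value:
	 */
	if (!trace_seq_printf(s, ": %s", field->buf))
		goto partial;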
Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 65 +++++++++++++++++++++------------------------ kernel/trace/trace_output.h | 4 +-- 2 files changed, 33 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df0c25cbed30..4e3ad36b117c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -440,9 +440,9 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (trace_seq_printf(s, "%x %x\n", - field->ip, - field->parent_ip)) + if (!trace_seq_printf(s, "%x %x\n", + field->ip, + field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -497,14 +497,14 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, T = task_state_char(field->next_state); S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); - if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) + if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -534,14 +534,14 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, if (!S) task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) + if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -639,10 +639,10 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) + if (!trace_seq_printf(s, "# %ld %ld %ld\n", + field->arg1, + field->arg2, + field->arg3)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -697,13 +697,13 @@ trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { - if (trace_seq_puts(s, " <= ")) + if (!trace_seq_puts(s, " <= ")) goto partial; - if (seq_print_ip_sym(s, field->caller[i], flags)) + if (!seq_print_ip_sym(s, field->caller[i], flags)) goto partial; } - if (trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "\n")) goto partial; } @@ -731,10 +731,10 @@ trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, trace_assign_type(field, entry); - if (seq_print_userip_objs(field, s, flags)) + if (!seq_print_userip_objs(field, s, flags)) goto partial; - if (trace_seq_putc(s, '\n')) + if (!trace_seq_putc(s, '\n')) goto partial; return 0; @@ -760,10 +760,10 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (seq_print_ip_sym(s, field->ip, flags)) + if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; - if (trace_seq_printf(s, ": %s", field->buf)) + if (!trace_seq_printf(s, ": %s", field->buf)) goto partial; return 0; @@ -779,10 +779,7 @@ trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); 
- if (seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) goto partial; return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ecab4ea4a4fd..b2c14615e0cd 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -45,14 +45,14 @@ trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); #define SEQ_PUT_FIELD_RET(s, x) \ do { \ if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ + return TRACE_TYPE_PARTIAL_LINE; \ } while (0) #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ + return TRACE_TYPE_PARTIAL_LINE; \ } while (0) #endif -- cgit v1.2.3-59-g8ed1b From 5361499101306cfb776c3cfa0f69d0479bc63868 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 19:12:40 -0500 Subject: ftrace: add stack trace to function tracer Impact: new feature to stack trace any function Chris Mason asked about being able to pick and choose a function and get a stack trace from it. This feature enables his request. # echo io_schedule > /debug/tracing/set_ftrace_filter # echo function > /debug/tracing/current_tracer # echo func_stack_trace > /debug/tracing/trace_options Produces the following in /debug/tracing/trace: kjournald-702 [001] 135.673060: io_schedule <-sync_buffer kjournald-702 [002] 135.673671: <= sync_buffer <= __wait_on_bit <= out_of_line_wait_on_bit <= __wait_on_buffer <= sync_dirty_buffer <= journal_commit_transaction <= kjournald Note, be careful about turning this on without filtering the functions. You may find that you have a 10 second lag between typing and seeing what you typed. This is why the stack trace for the function tracer does not use the same stack_trace flag as the other tracers use. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 ++++++++----- kernel/trace/trace.h | 7 ++++ kernel/trace/trace_functions.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcb757f70d21..3c54cb125228 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -835,10 +835,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags, pc); } -static void ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, - int skip, int pc) +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { #ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; @@ -846,9 +846,6 @@ static void ftrace_trace_stack(struct trace_array *tr, struct stack_trace trace; unsigned long irq_flags; - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); if (!event) @@ -869,12 +866,23 @@ static void ftrace_trace_stack(struct trace_array *tr, #endif } +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(tr, data, flags, skip, pc); +} + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, - int skip) + int skip, int pc) { - ftrace_trace_stack(tr, data, flags, skip, preempt_count()); + __ftrace_trace_stack(tr, data, flags, skip, pc); } static void ftrace_trace_userstack(struct trace_array *tr, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79c872100dd5..bf39a369e4b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -457,6 +457,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc); + extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_FUNCTION_TRACER @@ -467,6 +472,8 @@ void tracing_stop_function_trace(void); # define tracing_stop_function_trace() do { } while (0) #endif +extern int ftrace_function_enabled; + #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9236d7e25a16..3a5fa08cedb0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,6 +16,8 @@ #include "trace.h" +static struct trace_array *func_trace; + static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); @@ -34,6 +36,7 @@ static void stop_function_trace(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { + func_trace = tr; start_function_trace(tr); return 0; } @@ -48,12 +51,93 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before 
the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + /* + * skip over 5 funcs: + * __ftrace_trace_stack, + * __trace_stack, + * function_stack_trace_call + * ftrace_list_func + * ftrace_call + */ + __trace_stack(tr, data, flags, 5, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_stack_ops __read_mostly = +{ + .func = function_stack_trace_call, +}; + +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = 0, /* By default: all flags disabled */ + .opts = func_opts +}; + +static int func_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_FUNC_OPT_STACK) { + /* do nothing if already set */ + if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) + return 0; + + if (set) + register_ftrace_function(&trace_stack_ops); + else + unregister_ftrace_function(&trace_stack_ops); + + return 0; + } + + return -EINVAL; +} + static struct tracer function_trace __read_mostly = { .name = "function", .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, + .flags = &func_flags, + .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif -- cgit v1.2.3-59-g8ed1b From bb3c3c95f330f7bf16e33b002e48882616089db1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 20:40:23 -0500 Subject: ftrace: move function tracer functions out of trace.c Impact: clean up of trace.c The function tracer functions were put in trace.c because it needed to share static variables that were in trace.c. Since then, those variables have become global for various reasons. This patch moves the function tracer functions into trace_function.c where they belong. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 84 ------------------------------------------ kernel/trace/trace_functions.c | 84 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c54cb125228..2585ffb6c6b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1046,65 +1046,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_TRACER -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu, resched; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); -} - -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - #ifdef CONFIG_FUNCTION_GRAPH_TRACER int trace_graph_entry(struct ftrace_graph_ent *trace) { @@ -1162,31 +1103,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; - -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - register_ftrace_function(&trace_ops); - ftrace_function_enabled = 1; -} - -void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} -#endif - enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3a5fa08cedb0..2dce3c7370d1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -20,6 +20,7 @@ static struct trace_array *func_trace; static void start_function_trace(struct trace_array *tr) { + func_trace = tr; tr->cpu = get_cpu(); tracing_reset_online_cpus(tr); put_cpu(); @@ -36,7 +37,6 @@ static void stop_function_trace(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - func_trace = tr; start_function_trace(tr); return 0; } @@ -51,6 +51,64 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } +static void +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + 
struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, resched; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + local_save_flags(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags, pc); + + atomic_dec(&data->disabled); + ftrace_preempt_enable(resched); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -90,6 +148,30 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + register_ftrace_function(&trace_ops); + ftrace_function_enabled = 1; +} + +void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + unregister_ftrace_function(&trace_ops); +} static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, -- cgit v1.2.3-59-g8ed1b From c37abc5515b5ed5b1d2134d2deaead492d9f92a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 20:50:54 -0500 Subject: trace: add gcc printf check to trace_seq_printf Andrew Morton suggested adding a printf checker to trace_seq_printf since there are a number of users that have improper format arguments. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index b2c14615e0cd..1cbab5e3dc99 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,7 +16,8 @@ struct trace_event { trace_print_func binary; }; -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -- cgit v1.2.3-59-g8ed1b From 5e4abc9839191e213965e0f1dbf36e2e44356c3a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 21:00:50 -0500 Subject: trace: clean up format errors in calls to trace_seq_printf After adding the printf format checking for trace_seq_printf, several warnings now show up. This patch cleans them up. 
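As an illustrative example (matching the kmemtrace hunk below; the exact
warning text depends on the gcc version): with the format attribute in place,
a call such as

	trace_seq_printf(s, "%4d ", entry->bytes_req);

where bytes_req is a long-sized type rather than an int now produces a
warning along the lines of "format '%d' expects type 'int', but argument 3
has type 'long unsigned int'", and is fixed up to

	trace_seq_printf(s, "%4ld ", entry->bytes_req);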
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- kernel/trace/trace_mmiotrace.c | 13 +++++++------ kernel/trace/trace_output.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index faaa5ae7e75a..7ebc58cee3bd 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + ret = trace_seq_printf(s, "%4ld ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4ld ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 621c8c3f3139..ec78e244242e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, @@ -230,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: ret = trace_seq_printf(s, - "UNMAP %lu.%06lu %d 0x%lx %d\n", + "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: @@ -261,7 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. 
*/ - ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4e3ad36b117c..1a4e144a9f8f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -440,7 +440,7 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (!trace_seq_printf(s, "%x %x\n", + if (!trace_seq_printf(s, "%lx %lx\n", field->ip, field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From 3eb36aa05329a47cbe201c151fd0024a4a3649cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 22:21:43 -0500 Subject: ftrace: combine stack trace in function call Impact: less likely to interleave function and stack traces This patch does replaces the separate stack trace on function with a record function and stack trace together. This will switch between the function only recording to a function and stack recording. Also some whitespace fix ups as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 61 +++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2dce3c7370d1..61d0b73dabf5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -133,6 +133,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); /* * skip over 5 funcs: * __ftrace_trace_stack, @@ -154,24 +155,6 @@ static struct ftrace_ops trace_ops __read_mostly = .func = function_trace_call, }; -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - register_ftrace_function(&trace_ops); - ftrace_function_enabled = 1; -} - -void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, @@ -194,6 +177,31 @@ static struct tracer_flags func_flags = { .opts = func_opts }; +void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + register_ftrace_function(&trace_stack_ops); + else + register_ftrace_function(&trace_ops); + + ftrace_function_enabled = 1; +} + +void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + /* OK if they are not registered */ + unregister_ftrace_function(&trace_stack_ops); + unregister_ftrace_function(&trace_ops); +} + static int func_set_flag(u32 old_flags, u32 bit, int set) { if (bit == TRACE_FUNC_OPT_STACK) { @@ -201,10 +209,13 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) return 0; - if (set) + if (set) { + unregister_ftrace_function(&trace_ops); register_ftrace_function(&trace_stack_ops); - else + } else { unregister_ftrace_function(&trace_stack_ops); + register_ftrace_function(&trace_ops); + } return 
0; } @@ -214,14 +225,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) static struct tracer function_trace __read_mostly = { - .name = "function", - .init = function_trace_init, - .reset = function_trace_reset, - .start = function_trace_start, + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, .flags = &func_flags, .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function, + .selftest = trace_selftest_startup_function, #endif }; -- cgit v1.2.3-59-g8ed1b From a225cdd263f340c864febb1992802fb5b08bc328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:06:03 -0500 Subject: ftrace: remove static from function tracer functions Impact: clean up After reorganizing the functions in trace.c and trace_function.c, they no longer need to be in global context. This patch makes the functions and one variable into static. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 --- kernel/trace/trace.h | 10 ---------- kernel/trace/trace_functions.c | 10 ++++++++-- 3 files changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2585ffb6c6b5..7de6a94063dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -187,9 +187,6 @@ int tracing_is_enabled(void) return tracer_enabled; } -/* function tracing enabled */ -int ftrace_function_enabled; - /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bf39a369e4b3..54b72781e920 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -464,16 +464,6 @@ void __trace_stack(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FUNCTION_TRACER -void tracing_start_function_trace(void); -void tracing_stop_function_trace(void); -#else -# define tracing_start_function_trace() do { } while (0) -# define tracing_stop_function_trace() do { } while (0) -#endif - -extern int ftrace_function_enabled; - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 61d0b73dabf5..b3a320f8aba7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,8 +16,14 @@ #include "trace.h" +/* function tracing enabled */ +static int ftrace_function_enabled; + static struct trace_array *func_trace; +static void tracing_start_function_trace(void); +static void tracing_stop_function_trace(void); + static void start_function_trace(struct trace_array *tr) { func_trace = tr; @@ -177,7 +183,7 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -void tracing_start_function_trace(void) +static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; @@ -194,7 +200,7 @@ void tracing_start_function_trace(void) ftrace_function_enabled = 1; } -void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(void) { ftrace_function_enabled = 0; /* OK if they are not registered */ -- cgit v1.2.3-59-g8ed1b From 745b1626dd71ce9661a05ea4db57859ed5c773d2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:40:11 -0500 Subject: trace: set max latency variable to zero on default Impact: trace max latencies on start of latency tracing This patch sets the max latency to zero whenever one of the irq variant 
tracers or the wakeup tracer is set to current tracer. Most developers expect to see output when starting up a latency tracer. But since the max_latency is already set to max, and it takes a latency greater than max_latency to be recorded, there is no trace. This is not the expected behavior and has even confused myself. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7de6a94063dd..220c264e3111 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,7 +41,7 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7c2e326bbc8b..62a78d943534 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 43586b689e31..42ae1e77b6b3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); return 0; -- cgit v1.2.3-59-g8ed1b From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++- kernel/Makefile | 1 + kernel/hung_task.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/softlockup.c | 100 ------------------------- kernel/sysctl.c | 15 +++- lib/Kconfig.debug | 38 ++++++++++ 6 files changed, 261 insertions(+), 105 deletions(-) create mode 100644 kernel/hung_task.c (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54cbabf3b871..f2f94d532302 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; extern int softlockup_thresh; #else static inline void softlockup_tick(void) @@ -316,6 +313,15 @@ static inline void 
touch_all_softlockup_watchdogs(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -1236,7 +1242,7 @@ struct task_struct { /* ipc stuff */ struct sysv_sem sysvsem; #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_timestamp; unsigned long last_switch_count; diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd7878..979745f1b4bc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 000000000000..ba5a77cad3bb --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,198 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +static unsigned long __read_mostly hung_task_poll_jiffies; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. 
+ */ +static unsigned long get_timestamp(void) +{ + int this_cpu = raw_smp_processor_id(); + + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(void) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + unlock: + read_unlock(&tasklist_lock); +} + +static void update_poll_jiffies(void) +{ + /* timeout of 0 will disable the watchdog */ + if (sysctl_hung_task_timeout_secs == 0) + hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; + else + hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + update_poll_jiffies(); + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + update_poll_jiffies(); + + for ( ; ; ) { + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + check_hung_uninterruptible_tasks(); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 85d5a2455103..88796c330838 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -165,98 +165,12 @@ void softlockup_tick(void) panic("softlockup: hung tasks"); } -/* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - 
*/ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. - */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - /* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) if (kthread_should_stop()) break; - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - /* Pick any other online cpu. 
*/ - check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); - } - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 596dc31a7116..2481ed30d2b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", @@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dohung_task_timeout_secs, .strategy = &sysctl_intvec, }, { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4c9ae6085c75..883ecea22f37 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC +config DETECT_HUNG_TASK + bool "Detect Hung Tasks" + depends on DEBUG_KERNEL + default y + help + Say Y here to enable the kernel to detect "hung tasks", + which are bugs that cause the task to be stuck in + uninterruptible "D" state indefinitiley. + + When a hung task is detected, the kernel will print the + current stack trace (which you should report), but the + task will stay in uninterruptible state. If lockdep is + enabled then all held locks will also be reported. This + feature has negligible overhead. + +config BOOTPARAM_HUNG_TASK_PANIC + bool "Panic (Reboot) On Hung Tasks" + depends on DETECT_HUNG_TASK + help + Say Y here to enable the kernel to panic on "hung tasks", + which are bugs that cause the kernel to leave a task stuck + in uninterruptible "D" state. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + hung task has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a hung tasks must be resolved ASAP. + + Say N if unsure. 
+ +config BOOTPARAM_HUNG_TASK_PANIC_VALUE + int + depends on DETECT_HUNG_TASK + range 0 1 + default 0 if !BOOTPARAM_HUNG_TASK_PANIC + default 1 if BOOTPARAM_HUNG_TASK_PANIC + config SCHED_DEBUG bool "Collect scheduler debugging info" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.2.3-59-g8ed1b From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 16 Jan 2009 09:09:38 -0800 Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK Fixes the following compile error: kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count' kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp' Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..fb9444282836 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif -- cgit v1.2.3-59-g8ed1b From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sat, 17 Jan 2009 10:31:48 -0800 Subject: softlockup: fix potential race in hung_task when resetting timeout Impact: fix potential false panic A potential race exists if sysctl_hung_task_timeout_secs is reset to 0 while inside check_hung_uniterruptible_tasks(). If check_task() is entered, a comparison with 0 will result in a false hung_task being detected. If sysctl_hung_task_panic is set, the system will panic. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba5a77cad3bb..ba8ccd432963 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,8 @@ static unsigned long get_timestamp(void) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static void check_hung_task(struct task_struct *t, unsigned long now) +static void check_hung_task(struct task_struct *t, unsigned long now, + unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) + if ((long)(now - t->last_switch_timestamp) < timeout) return; if (!sysctl_hung_task_warnings) return; @@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * complain: */ printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); + "%ld seconds.\n", t->comm, t->pid, timeout); printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); @@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * a really long time (120 seconds). If that happens, print out * a warning. 
*/ -static void check_hung_uninterruptible_tasks(void) +static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long now = get_timestamp(); @@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void) goto unlock; /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); + check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: read_unlock(&tasklist_lock); @@ -180,8 +179,17 @@ static int watchdog(void *dummy) update_poll_jiffies(); for ( ; ; ) { + unsigned long timeout; + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); - check_hung_uninterruptible_tasks(); + + /* + * Need to cache timeout here to avoid timeout being set + * to 0 via sysctl while inside check_hung_*_tasks(). + */ + timeout = sysctl_hung_task_timeout_secs; + if (timeout) + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-59-g8ed1b From 5c5317de147e9b38ea9c4cbdc2d15bed7648d036 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:26:53 +0100 Subject: x86, ftrace, hw-branch-tracer: support hotplug cpus Support hotplug cpus. Reported-by: Andi Kleen Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 123 ++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index df21c1e72b95..398195397c75 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,7 +1,8 @@ /* * h/w branch tracer for x86 based on bts * - * Copyright (C) 2008 Markus Metzger + * Copyright (C) 2008-2009 Intel Corporation. + * Markus Metzger , 2008-2009 * */ @@ -10,6 +11,9 @@ #include #include #include +#include +#include +#include #include @@ -19,13 +23,31 @@ #define SIZEOF_BTS (1 << 13) +/* The tracer mutex protects the below per-cpu tracer array. + It needs to be held to: + - start tracing on all cpus + - stop tracing on all cpus + - start tracing on a single hotplug cpu + - stop tracing on a single hotplug cpu + - read the trace from all cpus + - read the trace from a single cpu +*/ +static DEFINE_MUTEX(bts_tracer_mutex); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) +static int __read_mostly trace_hw_branches_enabled; + +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. + */ static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -43,14 +65,20 @@ static void bts_trace_start_cpu(void *arg) static void bts_trace_start(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); - tracing_reset_online_cpus(tr); + on_each_cpu(bts_trace_start_cpu, NULL, 1); + trace_hw_branches_enabled = 1; - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + mutex_unlock(&bts_tracer_mutex); } +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. 
+ */ static void bts_trace_stop_cpu(void *arg) { if (this_tracer) { @@ -61,20 +89,58 @@ static void bts_trace_stop_cpu(void *arg) static void bts_trace_stop(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + trace_hw_branches_enabled = 0; + on_each_cpu(bts_trace_stop_cpu, NULL, 1); - for_each_cpu(cpu, cpu_possible_mask) + mutex_unlock(&bts_tracer_mutex); +} + +static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + mutex_lock(&bts_tracer_mutex); + + if (!trace_hw_branches_enabled) + goto out; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + break; + case CPU_DOWN_PREPARE: smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + break; + } + + out: + mutex_unlock(&bts_tracer_mutex); + return NOTIFY_DONE; } +static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { + .notifier_call = bts_hotcpu_handler +}; + static int bts_trace_init(struct trace_array *tr) { + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; } +static void bts_trace_reset(struct trace_array *tr) +{ + bts_trace_stop(tr); + unregister_hotcpu_notifier(&bts_hotcpu_notifier); +} + static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, @@ -108,18 +174,34 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) { struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq; + unsigned long irq1, irq2; + int cpu; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) + if (unlikely(!tr)) + return; + + if (unlikely(!trace_hw_branches_enabled)) return; + + local_irq_save(irq1); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + if (!event) + goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = smp_processor_id(); + entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); + ring_buffer_unlock_commit(tr->buffer, event, irq2); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(irq1); } static void trace_bts_at(struct trace_array *tr, @@ -143,6 +225,11 @@ static void trace_bts_at(struct trace_array *tr, } } +/* + * Collect the trace on the current cpu and write it into the ftrace buffer. 
+ * + * pre: bts_tracer_mutex must be locked + */ static void trace_bts_cpu(void *arg) { struct trace_array *tr = (struct trace_array *) arg; @@ -152,6 +239,9 @@ static void trace_bts_cpu(void *arg) if (!this_tracer) return; + if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -171,17 +261,18 @@ out: static void trace_bts_prepare(struct trace_iterator *iter) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + on_each_cpu(trace_bts_cpu, iter->tr, 1); - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); + mutex_unlock(&bts_tracer_mutex); } struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", .init = bts_trace_init, - .reset = bts_trace_stop, + .reset = bts_trace_reset, .print_header = bts_trace_print_header, .print_line = bts_trace_print_line, .start = bts_trace_start, -- cgit v1.2.3-59-g8ed1b From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:31:01 +0100 Subject: x86, ftrace, hw-branch-tracer: dump trace on oops Dump the branch trace on an oops (based on ftrace_dump_on_oops). Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++++ include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++------- 4 files changed, 41 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..077c9ea655fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. 
*/ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 054721487574..9f7880d87c39 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54b72781e920..b96037d970df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 398195397c75..e56df2c7d679 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; /* @@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); @@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; unsigned long irq1, irq2; @@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } @@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg) const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) return; if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) return; + if (unlikely(!this_tracer)) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +void 
trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); + + mutex_unlock(&bts_tracer_mutex); +} + struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", -- cgit v1.2.3-59-g8ed1b From e23b8ad83430a6fdfbdbfac365f5b0312dd57f10 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:33:31 +0100 Subject: x86, ftrace, hw-branch-tracer: reset trace buffer on close Reset the ftrace buffer on close. Since we use cyclic buffers, the trace is not contiguous, anyway. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e56df2c7d679..372b47ac3154 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -274,6 +274,11 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +static void trace_bts_close(struct trace_iterator *iter) +{ + tracing_reset_online_cpus(iter->tr); +} + void trace_hw_branch_oops(void) { mutex_lock(&bts_tracer_mutex); @@ -292,7 +297,8 @@ struct tracer bts_tracer __read_mostly = .print_line = bts_trace_print_line, .start = bts_trace_start, .stop = bts_trace_stop, - .open = trace_bts_prepare + .open = trace_bts_prepare, + .close = trace_bts_close }; __init static int init_bts_trace(void) -- cgit v1.2.3-59-g8ed1b From 11edda06289d412d13ff7c672bd72e043f637e74 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:29:16 +0100 Subject: x86, ftrace, hw-branch-tracer: change trace format Change the hw-branch-tracer format to be more readable. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 372b47ac3154..fff3545fc866 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -146,10 +146,7 @@ static void bts_trace_reset(struct trace_array *tr) static void bts_trace_print_header(struct seq_file *m) { - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); + seq_puts(m, "# CPU# TO <- FROM\n"); } static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) @@ -157,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; + unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { if (trace_seq_printf(seq, "%4d ", entry->cpu) && - trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", - it->from, it->to) && - (!it->from || - seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + seq_print_ip_sym(seq, it->to, symflags) && + trace_seq_printf(seq, "\t <- ") && + seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; return TRACE_TYPE_PARTIAL_LINE;; -- cgit v1.2.3-59-g8ed1b From 3690b5e6fd9daa030039ae9bda69044228bd476d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 16 Jan 2009 16:32:25 +0800 Subject: trace_workqueue: use percpu data for workqueue stat Impact: use percpu data instead of a global structure Use: static DEFINE_PER_CPU(struct 
workqueue_global_stats, all_workqueue_stat); instead of allocating a global structure. percpu data also works well on NUMA. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 64 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index f8118d39ca9b..4664990fe9c5 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace_stat.h" #include "trace.h" @@ -37,7 +38,8 @@ struct workqueue_global_stats { /* Don't need a global lock because allocated before the workqueues, and * never freed. */ -static struct workqueue_global_stats *all_workqueue_stat; +static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); +#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) /* Insertion of a work */ static void @@ -48,8 +50,8 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); @@ -58,7 +60,7 @@ probe_workqueue_insertion(struct task_struct *wq_thread, } pr_debug("trace_workqueue: entry not found\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Execution of a work */ @@ -70,8 +72,8 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; @@ -80,7 +82,7 @@ probe_workqueue_execution(struct task_struct *wq_thread, } pr_debug("trace_workqueue: entry not found\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Creation of a cpu workqueue thread */ @@ -104,11 +106,11 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (list_empty(&all_workqueue_stat[cpu].list)) + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_empty(&workqueue_cpu_stat(cpu)->list)) cws->first_entry = true; - list_add_tail(&cws->list, &all_workqueue_stat[cpu].list); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Destruction of a cpu workqueue thread */ @@ -119,8 +121,8 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + 
list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { list_del(&node->list); @@ -131,7 +133,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) pr_debug("trace_workqueue: don't find workqueue to destroy\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } @@ -141,13 +143,13 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) struct cpu_workqueue_stats *ret = NULL; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (!list_empty(&all_workqueue_stat[cpu].list)) - ret = list_entry(all_workqueue_stat[cpu].list.next, + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return ret; } @@ -172,9 +174,9 @@ static void *workqueue_stat_next(void *prev, int idx) unsigned long flags; void *ret = NULL; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) { - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); for (++cpu ; cpu < num_possible_cpus(); cpu++) { ret = workqueue_stat_start_cpu(cpu); if (ret) @@ -182,7 +184,7 @@ static void *workqueue_stat_next(void *prev, int idx) } return NULL; } - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, list); @@ -199,10 +201,10 @@ static int workqueue_stat_show(struct seq_file *s, void *p) cws->executed, trace_find_cmdline(cws->pid)); - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (&cws->list == all_workqueue_stat[cpu].list.next) + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) seq_printf(s, "\n"); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return 0; } @@ -258,17 +260,9 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats) - * num_possible_cpus(), GFP_KERNEL); - - if (!all_workqueue_stat) { - pr_warning("trace_workqueue: not enough memory\n"); - goto no_creation; - } - for_each_possible_cpu(cpu) { - spin_lock_init(&all_workqueue_stat[cpu].lock); - INIT_LIST_HEAD(&all_workqueue_stat[cpu].list); + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); } return 0; -- cgit v1.2.3-59-g8ed1b From 5bc4564b224c3d9fe6dddafa25f56059bd978231 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 14:36:52 -0500 Subject: trace: do not disable wake up tracer on output of trace Impact: fix to erased trace output To try not to have the outputing of a trace interfere with the wakeup tracer, it would disable tracing while the output was printing. But if a trace had started when it was disabled, it can show a partial trace. 
To try to solve this, on closing of the tracer, it would clear the trace buffer. The latency tracers (wakeup and irqsoff) have two buffers. One for recording and one for holding the max trace that is printed. The clearing of the trace above should only affect the recording buffer. But for some reason it would move the erased trace to the print buffer. Probably due to a race with the closing of the trace and the saving of the max trace.

The above is all pretty useless, and if the user does not want the printing of the trace to be traced itself, then the user can manually disable tracing.

This patch removes all the code that tries to keep the output of the tracer from modifying the trace.

Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
---
 kernel/trace/trace_sched_wakeup.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)
(limited to 'kernel')
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 42ae1e77b6b3..e27adef0171a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -262,12 +262,6 @@ out:
 atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
 static void start_wakeup_tracer(struct trace_array *tr)
 {
 int ret;
@@ -306,13 +300,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
 register_ftrace_function(&trace_ops);
- if (tracing_is_enabled()) {
+ if (tracing_is_enabled())
 tracer_enabled = 1;
- save_tracer_enabled = 1;
- } else {
+ else
 tracer_enabled = 0;
- save_tracer_enabled = 0;
- }
 return;
fail_deprobe_wake_new:
@@ -324,7 +315,6 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 tracer_enabled = 0;
- save_tracer_enabled = 0;
 unregister_ftrace_function(&trace_ops);
 unregister_trace_sched_switch(probe_wakeup_sched_switch);
 unregister_trace_sched_wakeup_new(probe_wakeup);
@@ -350,28 +340,11 @@ static void wakeup_tracer_start(struct trace_array *tr)
 {
 wakeup_reset(tr);
 tracer_enabled = 1;
- save_tracer_enabled = 1;
 }
 static void wakeup_tracer_stop(struct trace_array *tr)
 {
 tracer_enabled = 0;
- save_tracer_enabled = 0;
-}
-
-static void wakeup_tracer_open(struct trace_iterator *iter)
-{
- /* stop the trace while dumping */
- tracer_enabled = 0;
-}
-
-static void wakeup_tracer_close(struct trace_iterator *iter)
-{
- /* forget about any processes we were recording */
- if (save_tracer_enabled) {
- wakeup_reset(iter->tr);
- tracer_enabled = 1;
- }
 }
 static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +354,6 @@ static struct tracer wakeup_tracer __read_mostly =
 .reset = wakeup_tracer_reset,
 .start = wakeup_tracer_start,
 .stop = wakeup_tracer_stop,
- .open = wakeup_tracer_open,
- .close = wakeup_tracer_close,
 .print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
 .selftest = trace_selftest_startup_wakeup,
-- cgit v1.2.3-59-g8ed1b
From 97b17efe4537e11bf6669106cfe4ee2c5331b267 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 21 Jan 2009 15:24:56 -0500
Subject: ring-buffer: do not swap if recording is disabled

If the ring buffer recording has been disabled, do not let swapping of ring buffers occur. Simply return -EAGAIN.
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ kernel/trace/trace.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b9de5a3d699..890020e28a35 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2266,9 +2266,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (buffer_a->pages != buffer_b->pages) return -EINVAL; + if (ring_buffer_flags != RB_BUFFERS_ON) + return -EAGAIN; + + if (atomic_read(&buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&buffer_b->record_disabled)) + return -EAGAIN; + cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + if (atomic_read(&cpu_buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + return -EAGAIN; + /* * We can't do a synchronize_sched here because this * function can be called in atomic context. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 220c264e3111..757ae6f7e648 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -415,7 +415,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) ftrace_enable_cpu(); - WARN_ON_ONCE(ret); + WARN_ON_ONCE(ret && ret != -EAGAIN); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); -- cgit v1.2.3-59-g8ed1b From 3244351c31211a8b1ba8b4b34c3de04d5dfa03e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 16:24:46 -0500 Subject: trace: separate out rt tasks from wakeup tracer Impact: add option to trace all tasks or just RT tasks The current wakeup tracer only traces RT task wakeups. This is fine for those interested in wake up timings of RT tasks, but it is useless for those that are interested in the causes of long wakeups for non RT tasks. This patch creates a "wakeup_rt" to implement the tracing of just RT tasks (as the current "wakeup" does). And makes "wakeup" now trace all tasks as an average developer would expect. 
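For example, assuming debugfs is mounted at /d (the mount point is only an example; the tracer names come from this patch):

  echo wakeup > /d/tracing/current_tracer      # trace wakeup latency of all tasks
  echo wakeup_rt > /d/tracing/current_tracer   # trace wakeup latency of RT tasks only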
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e27adef0171a..f48957886102 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -25,6 +25,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; +static int wakeup_rt; static raw_spinlock_t wakeup_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -224,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if (likely(!rt_task(p)) || + if ((wakeup_rt && !rt_task(p)) || p->prio >= wakeup_prio || p->prio >= current->prio) return; @@ -321,7 +322,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_wakeup(probe_wakeup); } -static int wakeup_tracer_init(struct trace_array *tr) +static int __wakeup_tracer_init(struct trace_array *tr) { tracing_max_latency = 0; wakeup_trace = tr; @@ -329,6 +330,18 @@ static int wakeup_tracer_init(struct trace_array *tr) return 0; } +static int wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 1; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { stop_wakeup_tracer(tr); @@ -360,6 +373,19 @@ static struct tracer wakeup_tracer __read_mostly = #endif }; +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -368,6 +394,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + return 0; } device_initcall(init_wakeup_tracer); -- cgit v1.2.3-59-g8ed1b From f8ec1062f589cdb1cffcffab1376124a1bc08500 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 17:17:04 -0500 Subject: wakeup-tracer: show scheduling data in output Impact: better data for wakeup tracer This patch adds the wakeup and schedule calls that are used by the scheduler tracer to make the wakeup tracer more readable. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f48957886102..93cecda650b2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -153,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out_unlock; trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -214,6 +215,7 @@ static void wakeup_reset(struct trace_array *tr) static void probe_wakeup(struct rq *rq, struct task_struct *p, int success) { + struct trace_array_cpu *data; int cpu = smp_processor_id(); unsigned long flags; long disabled; @@ -253,9 +255,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) local_save_flags(flags); - wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags, pc); + data = wakeup_trace->data[wakeup_cpu]; + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, data, p, current, + flags, pc); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, + flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); -- cgit v1.2.3-59-g8ed1b From 69507c06539332e6e49f83aa478844130233bece Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 18:45:57 -0500 Subject: ring-buffer: reset timestamps when ring buffer is reset Impact: fix bad times of recent resets The ring buffer needs to reset its timestamps when reseting of the buffer, otherwise the timestamps are stale and might be used to calculate times in the buffer causing funny timestamps to appear. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 890020e28a35..7839280ffcd8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2166,6 +2166,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->entries = 0; + + cpu_buffer->write_stamp = 0; + cpu_buffer->read_stamp = 0; } /** -- cgit v1.2.3-59-g8ed1b From 94523e818f283d3c69f621406f633afff46dbf82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 11:18:06 -0500 Subject: trace: remove internal irqsoff disabling for trace output Impact: cleanup of duplicate features The trace output disables the ring buffer and prevents tracing to occur. The code in irqsoff to do the same thing is no longer needed. This patch removes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 62a78d943534..ed344b022a14 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,28 +353,18 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -/* - * save_tracer_enabled is used to save the state of the tracer_enabled - * variable when we disable it when we open a trace output file. 
- */ -static int save_tracer_enabled; - static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) { + if (tracing_is_enabled()) tracer_enabled = 1; - save_tracer_enabled = 1; - } else { + else tracer_enabled = 0; - save_tracer_enabled = 0; - } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -395,25 +385,11 @@ static void irqsoff_tracer_reset(struct trace_array *tr) static void irqsoff_tracer_start(struct trace_array *tr) { tracer_enabled = 1; - save_tracer_enabled = 1; } static void irqsoff_tracer_stop(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; -} - -static void irqsoff_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - tracer_enabled = 0; -} - -static void irqsoff_tracer_close(struct trace_iterator *iter) -{ - /* restart tracing */ - tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER @@ -431,8 +407,6 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -459,8 +433,6 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -489,8 +461,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, -- cgit v1.2.3-59-g8ed1b From b06a830183b610c0a88c29a92feb7991a867ab46 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 14:26:15 -0500 Subject: trace: fix logic to start/stop counting The logic in the tracing_start/stop code prevents the WARN_ON from ever detecting if a start/stop pair was mismatched. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 757ae6f7e648..2129ab9d2a48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -610,13 +610,12 @@ void tracing_start(void) return; spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) - goto out; - - if (trace_stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - trace_stop_count = 0; + if (--trace_stop_count) { + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + } goto out; } -- cgit v1.2.3-59-g8ed1b From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. 
But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 8 ++++---- kernel/softirq.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..6edfc2c11d99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) -- cgit v1.2.3-59-g8ed1b From 9005f3ebebfcfe9ccd731d16c468907a35ac1f9a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Jan 2009 17:04:53 -0800 Subject: tracing/function-graph-tracer: various fixes and features This patch brings various bugfixes: - Drop the first irrelevant task switch on the very beginning of a trace. - Drop the OVERHEAD word from the headers, the DURATION word is sufficient and will not overlap other columns. 
- Make the headers fit well their respective columns whatever the selected options. Ie, default options: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) 0.646 us | } 1) | mem_cgroup_del_lru_list() { 1) 0.624 us | lookup_page_cgroup(); 1) 1.970 us | } echo funcgraph-proc > trace_options # tracer: function_graph # # CPU TASK/PID DURATION FUNCTION CALLS # | | | | | | | | | 0) bash-2937 | 0.895 us | } 0) bash-2937 | 0.888 us | __rcu_read_unlock(); 0) bash-2937 | 0.864 us | conv_uni_to_pc(); 0) bash-2937 | 1.015 us | __rcu_read_lock(); echo nofuncgraph-cpu > trace_options echo nofuncgraph-proc > trace_options # tracer: function_graph # # DURATION FUNCTION CALLS # | | | | | | 3.752 us | native_pud_val(); 0.616 us | native_pud_val(); 0.624 us | native_pmd_val(); About features, one can now disable the duration (this will hide the overhead too for convenient reasons and because on doesn't need overhead if it hasn't the duration): echo nofuncgraph-duration > trace_options # tracer: function_graph # # FUNCTION CALLS # | | | | cap_vm_enough_memory() { __vm_enough_memory() { vm_acct_memory(); } } } And at last, an option to print the absolute time: //Restart from default options echo funcgraph-abstime > trace_options # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 261.339774 | 1) + 42.823 us | } 261.339775 | 1) 1.045 us | _spin_lock_irq(); 261.339777 | 1) 0.940 us | _spin_lock_irqsave(); 261.339778 | 1) 0.752 us | _spin_unlock_irqrestore(); 261.339780 | 1) 0.857 us | _spin_unlock_irq(); 261.339782 | 1) | flush_to_ldisc() { 261.339783 | 1) | tty_ldisc_ref() { 261.339783 | 1) | tty_ldisc_try() { 261.339784 | 1) 1.075 us | _spin_lock_irqsave(); 261.339786 | 1) 0.842 us | _spin_unlock_irqrestore(); 261.339788 | 1) 4.211 us | } 261.339788 | 1) 5.662 us | } The format is seconds.usecs. I guess no one needs the nanosec precision here, the main goal is to have an overview about the general timings of events, and to see the place when the trace switches from one cpu to another. ie: 274.874760 | 1) 0.676 us | _spin_unlock(); 274.874762 | 1) 0.609 us | native_load_sp0(); 274.874763 | 1) 0.602 us | native_load_tls(); 274.878739 | 0) 0.722 us | } 274.878740 | 0) 0.714 us | native_pmd_val(); 274.878741 | 0) 0.730 us | native_pmd_val(); Here there is a 4000 usecs difference when we switch the cpu. Changes in V2: - Completely fix the first pointless task switch. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 298 +++++++++++++++++++++++------------ 1 file changed, 195 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3c545984816f..66fc7b806a38 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1,7 +1,7 @@ /* * * Function graph tracer. - * Copyright (c) 2008 Frederic Weisbecker + * Copyright (c) 2008-2009 Frederic Weisbecker * Mostly borrowed from function tracer which * is Copyright (c) Steven Rostedt * @@ -21,9 +21,11 @@ #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 static struct tracer_opt trace_opts[] = { - /* Display overruns ? */ + /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, /* Display CPU ? 
*/ { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, @@ -31,17 +33,22 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, /* Display proc name/pid */ { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ - .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION, .opts = trace_opts }; /* pid on the last trace processed */ -static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; + static int graph_trace_init(struct trace_array *tr) { @@ -154,17 +161,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) { pid_t prev_pid; + pid_t *last_pid; int ret; - if (last_pid[cpu] != -1 && last_pid[cpu] == pid) + if (!last_pids_cpu) + return TRACE_TYPE_HANDLED; + + last_pid = per_cpu_ptr(last_pids_cpu, cpu); + + if (*last_pid == pid) return TRACE_TYPE_HANDLED; - prev_pid = last_pid[cpu]; - last_pid[cpu] = pid; + prev_pid = *last_pid; + *last_pid = pid; + if (prev_pid == -1) + return TRACE_TYPE_HANDLED; /* * Context-switch trace line: @@ -232,9 +247,34 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* If duration disappear, we don't need anything */ + if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + return 1; + + /* Non nested entry or return */ + if (duration == -1) + return trace_seq_printf(s, " "); + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! 
"); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + } + + return trace_seq_printf(s, " "); +} + static enum print_line_t print_graph_irq(struct trace_seq *s, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid) { int ret; @@ -242,35 +282,40 @@ print_graph_irq(struct trace_seq *s, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - if (type == TRACE_GRAPH_ENT) { - ret = trace_seq_printf(s, "==========> | "); - } else { - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + /* No overhead */ + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (type == TRACE_GRAPH_ENT) + ret = trace_seq_printf(s, "==========>"); + else + ret = trace_seq_printf(s, "<=========="); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Don't close the duration column if haven't one */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + trace_seq_printf(s, " |"); + ret = trace_seq_printf(s, "\n"); - ret = trace_seq_printf(s, "<========== |\n"); - } if (!ret) return TRACE_TYPE_PARTIAL_LINE; return TRACE_TYPE_HANDLED; @@ -289,7 +334,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) sprintf(msecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, msecs_str); + ret = trace_seq_printf(s, "%s", msecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -322,19 +367,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +static int print_graph_abs_time(u64 t, struct trace_seq *s) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! 
"); + unsigned long usecs_rem; - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); + usecs_rem = do_div(t, 1000000000); + usecs_rem /= 1000; - return trace_seq_printf(s, " "); + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } /* Case of a leaf function on its call entry */ @@ -357,16 +398,16 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -395,25 +436,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); - if (ret == TRACE_TYPE_UNHANDLED) { - /* No time */ + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - } else { - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -434,15 +467,30 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter, int cpu) + struct trace_iterator *iter) { int ret; + int cpu = iter->cpu; + pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; + struct ftrace_graph_ent *call = &field->graph_ent; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -470,16 +518,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, int cpu) + struct trace_entry *ent, struct trace_iterator *iter) { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; unsigned long long duration = trace->rettime - trace->calltime; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) 
return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -499,16 +556,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { @@ -542,14 +599,23 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; + + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Pid */ - if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, iter->cpu); + ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } @@ -566,17 +632,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* Indentation */ if (trace->depth > 0) for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { @@ -614,13 +680,12 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter, - iter->cpu); + return print_graph_entry(field, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter->cpu); + return print_graph_return(&field->ret, s, entry, iter); } case TRACE_PRINT: { struct print_entry *field; @@ -636,28 +701,55 @@ static void print_graph_headers(struct seq_file *s) { /* 1st line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU "); + seq_printf(s, "CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "TASK/PID "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) - seq_printf(s, "OVERHEAD/"); - seq_printf(s, "DURATION FUNCTION CALLS\n"); + seq_printf(s, " TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " DURATION "); + seq_printf(s, " FUNCTION 
CALLS\n"); /* 2nd line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, "| "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "| | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - seq_printf(s, "| "); - seq_printf(s, "| | | | |\n"); - } else - seq_printf(s, " | | | | |\n"); + seq_printf(s, " | | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " | | "); + seq_printf(s, " | | | |\n"); } + +static void graph_trace_open(struct trace_iterator *iter) +{ + /* pid on the last trace processed */ + pid_t *last_pid = alloc_percpu(pid_t); + int cpu; + + if (!last_pid) + pr_warning("function graph tracer: not enough memory\n"); + else + for_each_possible_cpu(cpu) { + pid_t *pid = per_cpu_ptr(last_pid, cpu); + *pid = -1; + } + + iter->private = last_pid; +} + +static void graph_trace_close(struct trace_iterator *iter) +{ + percpu_free(iter->private); +} + static struct tracer graph_trace __read_mostly = { .name = "function_graph", + .open = graph_trace_open, + .close = graph_trace_close, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, -- cgit v1.2.3-59-g8ed1b From cc2f6d90e950b69ad31d483c19cc1d121bb25c16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Jan 2009 13:03:37 -0800 Subject: kmemtrace: fix printk format warnings Fix kmemtrace printk warnings: kernel/trace/kmemtrace.c:142: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t' kernel/trace/kmemtrace.c:147: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t' Signed-off-by: Randy Dunlap Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7ebc58cee3bd..72b326bba005 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4ld ", entry->bytes_req); + ret = trace_seq_printf(s, "%4zd ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4ld ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4zd ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:23 -0200 Subject: ftrace: add ftrace_vprintk Impact: new helper function Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++++ kernel/trace/trace.c | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9f7880d87c39..7840e718c6c7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -302,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); #else static inline void @@ -317,6 +320,11 @@ ftrace_printk(const char *fmt, ...) { return 0; } +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} static inline void ftrace_dump(void) { } #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2129ab9d2a48..2f8ac1f008f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { -- cgit v1.2.3-59-g8ed1b From c71a896154119f4ca9e89d6078f5f63ad60ef199 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:27 -0200 Subject: blktrace: add ftrace plugin Impact: New way of using the blktrace infrastructure This drops the requirement of userspace utilities to use the blktrace facility. Configuration is done thru sysfs, adding a "trace" directory to the partition directory where blktrace can be enabled for the associated request_queue. The same filters present in the IOCTL interface are present as sysfs device attributes. The /sys/block/sdX/sdXN/trace/enable file allows tracing without any filters. The other files in this directory: pid, act_mask, start_lba and end_lba can be used with the same meaning as with the IOCTL interface. Using the sysfs interface will only setup the request_queue->blk_trace fields, tracing will only take place when the "blk" tracer is selected via the ftrace interface, as in the following example: To see the trace, one can use the /d/tracing/trace file or the /d/tracign/trace_pipe file, with semantics defined in the ftrace documentation in Documentation/ftrace.txt. [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3046.491224: 8,1 A WBS 6367 + 8 <- (8,1) 6304 kjournald-305 [000] 3046.491227: 8,1 Q R 6367 + 8 [kjournald] kjournald-305 [000] 3046.491236: 8,1 G RB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491239: 8,1 P NS [kjournald] kjournald-305 [000] 3046.491242: 8,1 I RBS 6367 + 8 [kjournald] kjournald-305 [000] 3046.491251: 8,1 D WB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491610: 8,1 U WS [kjournald] 1 -0 [000] 3046.511914: 8,1 C RS 6367 + 8 [6367] [root@f10-1 ~]# The default line context (prefix) format is the one described in the ftrace documentation, with the blktrace specific bits using its existing format, described in blkparse(8). 
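A minimal end-to-end sketch of the setup described above (the device/partition name and the /d
debugfs mount point used in these examples are assumptions of this sketch; current_tracer is the
usual ftrace tracer-selection file):

[root@f10-1 ~]# echo 1 > /sys/block/sda/sda1/trace/enable
[root@f10-1 ~]# echo blk > /d/tracing/current_tracer
[root@f10-1 ~]# cat /d/tracing/trace_pipe
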
If one wants to have the classic blktrace formatting, this is possible by using: [root@f10-1 ~]# echo blk_classic > /t/trace_options [root@f10-1 ~]# cat /t/trace 8,1 0 3046.491224 305 A WBS 6367 + 8 <- (8,1) 6304 8,1 0 3046.491227 305 Q R 6367 + 8 [kjournald] 8,1 0 3046.491236 305 G RB 6367 + 8 [kjournald] 8,1 0 3046.491239 305 P NS [kjournald] 8,1 0 3046.491242 305 I RBS 6367 + 8 [kjournald] 8,1 0 3046.491251 305 D WB 6367 + 8 [kjournald] 8,1 0 3046.491610 305 U WS [kjournald] 1 8,1 0 3046.511914 0 C RS 6367 + 8 [6367] [root@f10-1 ~]# Using the ftrace standard format allows more flexibility, such as the ability of asking for backtraces via trace_options: [root@f10-1 ~]# echo noblk_classic > /t/trace_options [root@f10-1 ~]# echo stacktrace > /t/trace_options [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3318.826779: 8,1 A WBS 6375 + 8 <- (8,1) 6312 kjournald-305 [000] 3318.826782: <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread <= child_rip kjournald-305 [000] 3318.826836: 8,1 Q R 6375 + 8 [kjournald] kjournald-305 [000] 3318.826837: <= generic_make_request <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread Please read the ftrace documentation to use aditional, standardized tracing filters such as /d/tracing/trace_cpumask, etc. See also /d/tracing/trace_mark to add comments in the trace stream, that is equivalent to the /d/block/sdaN/msg interface. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 651 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/partitions/check.c | 7 + kernel/trace/trace.h | 1 + 3 files changed, 654 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index b0a2cae886db..630f167f8240 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -25,9 +25,27 @@ #include #include #include +#include <../kernel/trace/trace_output.h> static unsigned int blktrace_seq __read_mostly = 1; +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); @@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; + if (!bt->rchan) + return; + t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { const int cpu = smp_processor_id(); @@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
unsigned long flags; char *buf; + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); @@ -131,13 +162,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int rw, u32 what, int error, int pdu_len, void *pdu_data) { struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; struct blk_io_trace *t; unsigned long flags; unsigned long *sequence; pid_t pid; - int cpu; + int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running)) + if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -150,6 +182,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid = tsk->pid; if (unlikely(act_log_check(bt, what, sector, pid))) return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + struct trace_entry *ent; + tracing_record_cmdline(current); + + event = ring_buffer_lock_reserve(blk_tr->buffer, + sizeof(*t) + pdu_len, &flags); + if (!event) + return; + + ent = ring_buffer_event_data(event); + t = (struct blk_io_trace *)ent; + pc = preempt_count(); + tracing_generic_entry_update(ent, 0, pc); + ent->type = TRACE_BLK; + goto record_it; + } /* * A word about the locking here - we disable interrupts to reserve @@ -163,23 +213,33 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { - cpu = smp_processor_id(); sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); + t->cpu = cpu; + t->pid = pid; +record_it: t->sector = sector; t->bytes = bytes; t->action = what; - t->pid = pid; t->device = bt->dev; - t->cpu = cpu; t->error = error; t->pdu_len = pdu_len; if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + if (pid != 0 && + (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 && + (trace_flags & TRACE_ITER_STACKTRACE) != 0) + __trace_stack(blk_tr, NULL, flags, 5, pc); + trace_wake_up(); + return; + } } local_irq_restore(flags); @@ -888,3 +948,584 @@ static void blk_unregister_tracepoints(void) tracepoint_synchronize_unregister(); } + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W'; + else if (t->bytes) rwbs[i++] = 'R'; + else rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return 
te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + 
mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, + int flags) +{ + const struct blk_io_trace *t = (struct blk_io_trace *)ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", !!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, 
sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + 
q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d720243f5f4..01714efdc65a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -268,6 +268,10 @@ ssize_t part_fail_store(struct device *dev, } #endif +#ifdef CONFIG_BLK_DEV_IO_TRACE +extern struct attribute_group blk_trace_attr_group; +#endif + static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); @@ -294,6 +298,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b96037d970df..e603a291134b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -32,6 +32,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, + TRACE_BLK, __TRACE_LAST_TYPE, }; -- cgit v1.2.3-59-g8ed1b From f04109bf1be7449e27d38ae1bb8465013374bd49 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Jan 2009 13:02:12 -0200 Subject: trace: Use tracing_reset_online_cpus in more places MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 6 +----- kernel/trace/trace_functions_graph.c | 8 ++------ kernel/trace/trace_nop.c | 6 +----- 3 files changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ca017e0a9a27..1284145c8898 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -133,11 +133,7 @@ static void stop_branch_trace(struct trace_array *tr) static int branch_trace_init(struct trace_array *tr) { - int cpu; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_branch_trace(tr); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66fc7b806a38..c97594d826bc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,15 +52,11 @@ static struct tracer_flags tracer_flags = { static int graph_trace_init(struct trace_array *tr) { - int cpu, ret; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - ret = register_ftrace_graph(&trace_graph_return, + int ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); if (ret) return ret; + tracing_reset_online_cpus(tr); tracing_start_cmdline_record(); return 0; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index b9767acd30ac..087b6cbf4ea5 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -47,12 +47,8 @@ static void stop_nop_trace(struct trace_array *tr) static int nop_trace_init(struct trace_array *tr) { - int cpu; ctx_trace = tr; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_nop_trace(tr); return 0; } -- cgit v1.2.3-59-g8ed1b From b3a8c34886d0e3dd3a24a5b614ee025181da2f41 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Jan 2009 13:08:37 -0200 Subject: trace_sched_wakeup: Remove unused variable Impact: cleanup 
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 93cecda650b2..a48c9b4b0c85 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -184,13 +184,10 @@ out: static void __wakeup_reset(struct trace_array *tr) { - struct trace_array_cpu *data; int cpu; - for_each_possible_cpu(cpu) { - data = tr->data[cpu]; + for_each_possible_cpu(cpu) tracing_reset(tr, cpu); - } wakeup_cpu = -1; wakeup_prio = -1; -- cgit v1.2.3-59-g8ed1b From ecf441b593ac41cb8cd8cd3695110167c42e098c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 29 Jan 2009 13:49:45 -0800 Subject: kmemtrace: fix printk formats, fix Geert Uytterhoeven wrote: > %4zu? Reported-by: Geert Uytterhoeven Signed-off-by: Randy Dunlap Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 72b326bba005..f04c0625f1cd 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4zd ", entry->bytes_req); + ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4zd ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From b2821ae68b14480bfc85ea1629537163310bc5cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Feb 2009 21:38:32 -0500 Subject: trace: fix default boot up tracer Peter Zijlstra started the functionality to start up a default tracing at bootup. This patch finishes the work. Now if you add 'ftrace=' to the command line, when that tracer is registered on bootup, that tracer is selected and starts tracing. Note, all selftests for tracers that are registered after this tracer is disabled. This prevents the selftests from disturbing the running tracer, or the running tracer from disturbing the selftest. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f8ac1f008f5..2c720c79bc60 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -53,6 +53,11 @@ unsigned long __read_mostly tracing_thresh; */ static bool __read_mostly tracing_selftest_running; +/* + * If a tracer is running, we do not want to run SELFTEST. 
+ */ +static bool __read_mostly tracing_selftest_disabled; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -110,14 +115,19 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; */ int ftrace_dump_on_oops; -static int tracing_set_tracer(char *buf); +static int tracing_set_tracer(const char *buf); + +#define BOOTUP_TRACER_SIZE 100 +static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - tracing_set_tracer(str); + strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + default_bootup_tracer = bootup_tracer_buf; return 1; } -__setup("ftrace", set_ftrace); +__setup("ftrace=", set_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -468,7 +478,7 @@ int register_tracer(struct tracer *type) type->flags->opts = dummy_tracer_opt; #ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest) { + if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; int i; @@ -510,8 +520,25 @@ int register_tracer(struct tracer *type) out: tracing_selftest_running = false; mutex_unlock(&trace_types_lock); - lock_kernel(); + if (!ret && default_bootup_tracer) { + if (!strncmp(default_bootup_tracer, type->name, + BOOTUP_TRACER_SIZE)) { + printk(KERN_INFO "Starting tracer '%s'\n", + type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. */ + tracing_selftest_disabled = 1; +#ifdef CONFIG_FTRACE_STARTUP_TEST + printk(KERN_INFO "Disabling FTRACE selftests due" + " to running tracer '%s'\n", type->name); +#endif + } + } + + lock_kernel(); return ret; } @@ -2245,7 +2272,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static int tracing_set_tracer(char *buf) +static int tracing_set_tracer(const char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; @@ -3163,5 +3190,26 @@ out_free_buffer_mask: out: return ret; } + +__init static int clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. + */ + if (!default_bootup_tracer) + return 0; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; + + return 0; +} + early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); +late_initcall(clear_boot_tracer); -- cgit v1.2.3-59-g8ed1b From 79fb0768fbd371f3b94d909f51f587b3a24ab272 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Feb 2009 21:38:33 -0500 Subject: trace: let boot trace be chosen by command line Now that we have a working ftrace= function, make the boot tracer get activated by it. This way we can turn it on or off without recompiling the kernel, as well as keeping the selftests on. The selftests are disabled whenever a default tracer starts running. 
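For example (a sketch: the "initcall" tracer name is taken from the Kconfig help text updated
below, and the /debug mount point for debugfs is an assumption):

  ftrace=initcall                              (appended to the kernel command line)
  [root@f10-1 ~]# cat /debug/tracing/trace     (read the initcall trace after boot)
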
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 7 +++---- kernel/trace/trace.c | 5 +---- kernel/trace/trace_boot.c | 11 +++++++---- 3 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dde1d46f77e5..28f2644484d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,9 +164,8 @@ config BOOT_TRACER representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - ( Note that tracing self tests can't be enabled if this tracer is - selected, because the self-tests are an initcall as well and that - would invalidate the boot trace. ) + You must pass in ftrace=initcall to the kernel command line + to enable this on bootup. config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" @@ -326,7 +325,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER + depends on TRACING && DEBUG_KERNEL select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c720c79bc60..40edef4255c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3167,12 +3167,9 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); register_tracer(&nop_trace); + current_trace = &nop_trace; #ifdef CONFIG_BOOT_TRACER register_tracer(&boot_tracer); - current_trace = &boot_tracer; - current_trace->init(&global_trace); -#else - current_trace = &nop_trace; #endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0e94b3d091f7..1f07895977a0 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -28,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_start_sched_switch_record(); } void disable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_stop_sched_switch_record(); } @@ -43,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; + if (!tr) + return 0; + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -132,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; /* Get its name now since this function could @@ -164,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; sprint_symbol(bt->func, (unsigned long)fn); -- cgit v1.2.3-59-g8ed1b From c4a8e8be2d43cc22b371e8e9c05c253409759d94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 2 Feb 2009 20:29:21 -0200 Subject: trace: better manage the context info for events Impact: make trace_event more convenient for tracers All tracers (for the moment) that use the struct trace_event want to have the context info printed before their own output: the pid/cmdline, cpu, and timestamp. But some other tracers that want to implement their trace_event callbacks will not necessary need these information or they may want to format them as they want. 
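As a rough illustration of that flexibility, a tracer that builds its own line prefix can opt out
of the default one by clearing the context-info bit (introduced just below) in trace_flags at init
time. This is a hypothetical sketch, not code from this patch:

static int my_tracer_init(struct trace_array *tr)
{
	tracing_reset_online_cpus(tr);

	/* this tracer prints its own prefix; skip the pid/cpu/timestamp one */
	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;

	return 0;
}
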
This patch adds a new default-enabled trace option: TRACE_ITER_CONTEXT_INFO When disabled through: echo nocontext-info > /debugfs/tracing/trace_options The pid, cpu and timestamps headers will not be printed. IE with the sched_switch tracer with context-info (default): bash-2935 [001] 100.356561: 2935:120:S ==> [001] 0:140:R -0 [000] 100.412804: 0:140:R + [000] 11:115:S events/0 -0 [000] 100.412816: 0:140:R ==> [000] 11:115:R events/0 events/0-11 [000] 100.412829: 11:115:S ==> [000] 0:140:R Without context-info: 2935:120:S ==> [001] 0:140:R 0:140:R + [000] 11:115:S events/0 0:140:R ==> [000] 11:115:R events/0 11:115:S ==> [000] 0:140:R A tracer can disable it at runtime by clearing the bit TRACE_ITER_CONTEXT_INFO in trace_flags. The print routines were renamed to trace_print_context and trace_print_lat_context, so that they can be used by tracers if they want to use them for one of the trace_event callbacks. Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 149 +++++++++++--------------------------------- kernel/trace/trace.h | 7 ++- kernel/trace/trace_output.c | 107 +++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 3 + 4 files changed, 151 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f8ac1f008f5..5ec49c3c1597 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -227,7 +227,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; /** * trace_wake_up - wake up tasks waiting for trace input @@ -285,6 +285,7 @@ static const char *trace_options[] = { "userstacktrace", "sym-userobj", "printk-msg-only", + "context-info", NULL }; @@ -1171,8 +1172,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) } /* Find the next real entry, without updating the iterator itself */ -static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) { return __find_next_entry(iter, ent_cpu, ent_ts); } @@ -1351,57 +1352,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static void -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) -{ - int hardirq, softirq; - char *comm; - - comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%3d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } - - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); -} - -unsigned long preempt_mark_thresh = 100; - -static void -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) -{ - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1419,46 +1369,24 @@ static void test_cpu_buff_start(struct trace_iterator *iter) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } -static enum print_line_t -print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +static enum print_line_t print_lat_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry; struct trace_event *event; - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; - unsigned long abs_usecs; - unsigned long rel_usecs; - u64 next_ts; - char *comm; int ret; test_cpu_buff_start(iter); - next_entry = find_next_entry(iter, NULL, &next_ts); - if (!next_entry) - next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); - abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - - if (verbose) { - comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(iter->ts), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); - } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + event = ftrace_find_event(entry->type); + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_print_lat_context(iter); + if (ret) + return ret; } - event = ftrace_find_event(entry->type); if (event && event->latency_trace) { ret = event->latency_trace(s, entry, sym_flags); if (ret) @@ -1476,33 +1404,20 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; - unsigned long usec_rem; - unsigned long long t; - unsigned long secs; - char *comm; int ret; entry = iter->ent; test_cpu_buff_start(iter); - comm = trace_find_cmdline(iter->ent->pid); - - t = ns2usecs(iter->ts); - usec_rem = do_div(t, 1000000ULL); - secs = (unsigned long)t; + event = ftrace_find_event(entry->type); - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "[%03d] ", iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_print_context(iter); + if (ret) + return ret; + } - event = ftrace_find_event(entry->type); if (event && event->trace) { ret = event->trace(s, entry, 
sym_flags); if (ret) @@ -1525,10 +1440,12 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } event = ftrace_find_event(entry->type); if (event && event->raw) { @@ -1553,9 +1470,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + } event = ftrace_find_event(entry->type); if (event && event->hex) @@ -1575,7 +1494,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1590,9 +1509,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); + } event = ftrace_find_event(entry->type); if (event && event->binary) @@ -1643,7 +1564,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_raw_fmt(iter); if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter, iter->idx, iter->cpu); + return print_lat_fmt(iter); return print_trace_fmt(iter); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e603a291134b..f0c7a0f08cac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -405,6 +405,10 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); @@ -591,7 +595,8 @@ enum trace_iterator_flags { TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000 + TRACE_ITER_PRINTK_MSGONLY = 0x10000, + TRACE_ITER_CONTEXT_INFO = 0x20000 /* Print pid/cpu/time */ }; /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1a4e144a9f8f..a5752d4d3c33 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,6 +286,113 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static void +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%3d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) { + trace_seq_putc(s, 'H'); + } else { + if (hardirq) { + trace_seq_putc(s, 'h'); + } else { + if (softirq) + trace_seq_putc(s, 's'); + else + trace_seq_putc(s, '.'); + } + } + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_puts(s, "."); +} + +static unsigned long preempt_mark_thresh = 100; + +static void +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, + unsigned long rel_usecs) +{ + trace_seq_printf(s, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + trace_seq_puts(s, "!: "); + else if (rel_usecs > 1) + trace_seq_puts(s, "+: "); + else + trace_seq_puts(s, " : "); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + char *comm = trace_find_cmdline(entry->pid); + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned long secs = (unsigned long)t; + + if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid)) + goto partial; + if (!trace_seq_printf(s, "[%03d] ", entry->cpu)) + goto partial; + if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem)) + goto partial; + + return 0; + +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + u64 next_ts; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent, + *next_entry = trace_find_next_entry(iter, NULL, + &next_ts); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); + unsigned long rel_usecs; + + if (!next_entry) + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + + if (verbose) { + char *comm = trace_find_cmdline(entry->pid); + trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); + } else { + lat_print_generic(s, entry, entry->cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } + + return 0; +} + static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; static int task_state_char(unsigned long state) diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 1cbab5e3dc99..ec2ed90f10f0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -33,6 +33,9 @@ int seq_print_userip_objs(const struct userstack_entry *entry, int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long ip, unsigned long sym_flags); +int trace_print_context(struct trace_iterator *iter); +int trace_print_lat_context(struct trace_iterator *iter); + struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -- cgit v1.2.3-59-g8ed1b From 2c9b238eb325895d3312dad64e2685783575e474 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Feb 2009 20:30:12 -0200 Subject: trace: Change struct trace_event callbacks parameter list Impact: API change The trace_seq and trace_entry are in trace_iterator, where there are more fields that may be needed by tracers, so just pass the tracer_iterator as is already the case for struct tracer->print_line. 
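Under the new prototype a callback looks roughly like this (a sketch with a made-up event
structure; the real conversions are in the diff below):

static int my_event_trace(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;	/* seq buffer now reached via iter */
	struct my_entry *field;

	trace_assign_type(field, iter->ent);	/* entry now reached via iter */

	if (!trace_seq_printf(s, "my event: %lu\n", field->value))
		return TRACE_TYPE_PARTIAL_LINE;

	return 0;
}
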
Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +-- kernel/trace/trace.c | 10 ++-- kernel/trace/trace_branch.c | 7 +-- kernel/trace/trace_output.c | 139 ++++++++++++++++++++------------------------ kernel/trace/trace_output.h | 6 +- 5 files changed, 76 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 3f25425ade12..570cd3c40bd1 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1140,10 +1140,10 @@ static struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, - int flags) +static int blk_trace_event_print(struct trace_iterator *iter, int flags) { - const struct blk_io_trace *t = (struct blk_io_trace *)ent; + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); int ret; @@ -1153,7 +1153,7 @@ static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); if (ret) - ret = what2act[what].print(s, ent); + ret = what2act[what].print(s, iter->ent); } return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ec49c3c1597..152d0969adf8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1388,7 +1388,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) } if (event && event->latency_trace) { - ret = event->latency_trace(s, entry, sym_flags); + ret = event->latency_trace(iter, sym_flags); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1419,7 +1419,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event && event->trace) { - ret = event->trace(s, entry, sym_flags); + ret = event->trace(iter, sym_flags); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1449,7 +1449,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->raw) { - ret = event->raw(s, entry, 0); + ret = event->raw(iter, 0); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1478,7 +1478,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->hex) - event->hex(s, entry, 0); + event->hex(iter, 0); SEQ_PUT_FIELD_RET(s, newline); @@ -1517,7 +1517,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->binary) - event->binary(s, entry, 0); + event->binary(iter, 0); return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 1284145c8898..ea62f101e615 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -160,14 +160,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_branch_print(struct trace_iterator *iter, int flags) { struct trace_branch *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (trace_seq_printf(s, "[%s] %s:%s:%d\n", + if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", field->correct ? 
" ok " : " MISS ", field->func, field->file, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a5752d4d3c33..c24503b281a0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -484,19 +484,18 @@ int unregister_ftrace_event(struct trace_event *event) * Standard events */ -int -trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags) +int trace_nop_print(struct trace_iterator *iter, int flags) { return 0; } /* TRACE_FN */ -static int -trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_latency(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -513,12 +512,12 @@ trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -540,14 +539,13 @@ trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_raw(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "%lx %lx\n", + if (!trace_seq_printf(&iter->seq, "%lx %lx\n", field->ip, field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; @@ -555,12 +553,12 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_hex(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_HEX_FIELD_RET(s, field->ip); SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); @@ -568,12 +566,12 @@ trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_bin(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->ip); SEQ_PUT_FIELD_RET(s, field->parent_ip); @@ -591,20 +589,19 @@ static struct trace_event trace_fn_event = { }; /* TRACE_CTX an TRACE_WAKE */ -static int -trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, - char *delim) +static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) { struct ctx_switch_entry *field; char *comm; int S, T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); T = task_state_char(field->next_state); S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); - if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + if (!trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, @@ -617,31 
+614,27 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_print(struct trace_iterator *iter, int flags) { - return trace_ctxwake_print(s, entry, flags, "==>"); + return trace_ctxwake_print(iter, "==>"); } -static int -trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_print(struct trace_iterator *iter, int flags) { - return trace_ctxwake_print(s, entry, flags, " +"); + return trace_ctxwake_print(iter, " +"); } -static int -trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, - char S) +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) { struct ctx_switch_entry *field; int T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!S) task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, S, @@ -654,27 +647,24 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_raw(struct trace_iterator *iter, int flags) { - return trace_ctxwake_raw(s, entry, flags, 0); + return trace_ctxwake_raw(iter, 0); } -static int -trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_raw(struct trace_iterator *iter, int flags) { - return trace_ctxwake_raw(s, entry, flags, '+'); + return trace_ctxwake_raw(iter, '+'); } -static int -trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, - char S) +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) { struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; int T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!S) task_state_char(field->prev_state); @@ -691,24 +681,22 @@ trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_hex(struct trace_iterator *iter, int flags) { - return trace_ctxwake_hex(s, entry, flags, 0); + return trace_ctxwake_hex(iter, 0); } -static int -trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_hex(struct trace_iterator *iter, int flags) { - return trace_ctxwake_hex(s, entry, flags, '+'); + return trace_ctxwake_hex(iter, '+'); } -static int -trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) { struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->prev_pid); SEQ_PUT_FIELD_RET(s, field->prev_prio); @@ -739,14 +727,13 @@ static struct trace_event trace_wake_event = { }; /* TRACE_SPECIAL */ -static int -trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_print(struct trace_iterator *iter, int flags) { struct special_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "# %ld %ld %ld\n", + if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", field->arg1, 
field->arg2, field->arg3)) @@ -755,12 +742,12 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_hex(struct trace_iterator *iter, int flags) { struct special_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_HEX_FIELD_RET(s, field->arg1); SEQ_PUT_HEX_FIELD_RET(s, field->arg2); @@ -769,12 +756,12 @@ trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_bin(struct trace_iterator *iter, int flags) { struct special_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->arg1); SEQ_PUT_FIELD_RET(s, field->arg2); @@ -794,13 +781,13 @@ static struct trace_event trace_special_event = { /* TRACE_STACK */ -static int -trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_stack_print(struct trace_iterator *iter, int flags) { struct stack_entry *field; + struct trace_seq *s = &iter->seq; int i; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { @@ -830,13 +817,12 @@ static struct trace_event trace_stack_event = { }; /* TRACE_USER_STACK */ -static int -trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, - int flags) +static int trace_user_stack_print(struct trace_iterator *iter, int flags) { struct userstack_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_userip_objs(field, s, flags)) goto partial; @@ -860,12 +846,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static int -trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_print_print(struct trace_iterator *iter, int flags) { struct print_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -879,14 +865,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_print_raw(struct trace_iterator *iter, int flags) { struct print_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) goto partial; return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ec2ed90f10f0..3aeb31f6506b 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -3,8 +3,7 @@ #include "trace.h" -typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry, - int flags); +typedef int (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; @@ -40,8 +39,7 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -int 
-trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); +int trace_nop_print(struct trace_iterator *iter, int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3-59-g8ed1b From d9793bd8018f835c64b10f44e278c86cecb8e932 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Feb 2009 20:20:41 -0200 Subject: trace: judicious error checking of trace_seq results Impact: bugfix and cleanup Some callsites were returning either TRACE_ITER_PARTIAL_LINE if the trace_seq routines (trace_seq_printf, etc) returned 0 meaning its buffer was full, or zero otherwise. But... /* Return values for print_line callback */ enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; In other cases the return value was not being relayed at all. Most of the time it didn't hurt because the page wasn't get filled, but for correctness sake, handle the return values everywhere. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 +- kernel/trace/trace.c | 75 ++++++++++++--------------- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_output.c | 123 ++++++++++++++++++-------------------------- 4 files changed, 87 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 8f5c37b0f80f..12df27693972 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1165,7 +1165,7 @@ static int blk_trace_event_print(struct trace_iterator *iter, int flags) const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); int ret; - if (trace_print_context(iter)) + if (!trace_print_context(iter)) return TRACE_TYPE_PARTIAL_LINE; if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bbdfaa2cbdb9..5822ff4e5a3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1402,27 +1402,25 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_event *event; struct trace_entry *entry = iter->ent; - int ret; test_cpu_buff_start(iter); event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_print_lat_context(iter); - if (ret) - return ret; + if (!trace_print_lat_context(iter)) + goto partial; } - if (event && event->latency_trace) { - ret = event->latency_trace(iter, sym_flags); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } + if (event && event->latency_trace) + return event->latency_trace(iter, sym_flags); + + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; - trace_seq_printf(s, "Unknown type %d\n", entry->type); return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) @@ -1431,7 +1429,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; - int ret; entry = iter->ent; @@ -1440,22 +1437,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_print_context(iter); - if (ret) - return ret; + if (!trace_print_context(iter)) + goto partial; } - if (event && event->trace) { - 
ret = event->trace(iter, sym_flags); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } - ret = trace_seq_printf(s, "Unknown type %d\n", entry->type); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (event && event->trace) + return event->trace(iter, sym_flags); + + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) @@ -1463,29 +1457,25 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; - int ret; entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (!trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts)) + goto partial; } event = ftrace_find_event(entry->type); - if (event && event->raw) { - ret = event->raw(iter, 0); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } - ret = trace_seq_printf(s, "%d ?\n", entry->type); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (event && event->raw) + return event->raw(iter, 0); + + if (!trace_seq_printf(s, "%d ?\n", entry->type)) + goto partial; return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) @@ -1504,8 +1494,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->hex) - event->hex(iter, 0); + if (event && event->hex) { + int ret = event->hex(iter, 0); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } SEQ_PUT_FIELD_RET(s, newline); @@ -1544,7 +1537,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->binary) - event->binary(iter, 0); + return event->binary(iter, 0); return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ea62f101e615..f6b35e162dfa 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -173,7 +173,7 @@ static int trace_branch_print(struct trace_iterator *iter, int flags) field->line)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c24503b281a0..5b3c914053f2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,55 +286,41 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static void +static int lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; char *comm; comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%3d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.')); - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", + comm, entry->pid, cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? + 'X' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? + 'N' : '.', + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : softirq ? 's' : '.')) + return 0; if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); + return trace_seq_printf(s, "%x", entry->preempt_count); + return trace_seq_puts(s, "."); } static unsigned long preempt_mark_thresh = 100; -static void +static int lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, unsigned long rel_usecs) { - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); + return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, + rel_usecs > preempt_mark_thresh ? '!' : + rel_usecs > 1 ? '+' : ' '); } int trace_print_context(struct trace_iterator *iter) @@ -346,22 +332,14 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; - if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid)) - goto partial; - if (!trace_seq_printf(s, "[%03d] ", entry->cpu)) - goto partial; - if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem)) - goto partial; - - return 0; - -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", + comm, entry->pid, entry->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; + int ret; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, @@ -376,21 +354,22 @@ int trace_print_lat_context(struct trace_iterator *iter) if (verbose) { char *comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, entry->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", comm, + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs / USEC_PER_MSEC, + abs_usecs % USEC_PER_MSEC, + rel_usecs / USEC_PER_MSEC, + rel_usecs % USEC_PER_MSEC); } else { - lat_print_generic(s, entry, entry->cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + ret = lat_print_generic(s, entry, entry->cpu); + if (ret) + ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } - return 0; + return ret; } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; @@ -486,7 +465,7 @@ int unregister_ftrace_event(struct trace_event *event) int trace_nop_print(struct trace_iterator *iter, int flags) { - return 0; + return TRACE_TYPE_HANDLED; } /* TRACE_FN */ @@ -506,7 +485,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")\n")) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: 
return TRACE_TYPE_PARTIAL_LINE; @@ -533,7 +512,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags) if (!trace_seq_printf(s, "\n")) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -550,7 +529,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags) field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_fn_hex(struct trace_iterator *iter, int flags) @@ -563,7 +542,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags) SEQ_PUT_HEX_FIELD_RET(s, field->ip); SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_fn_bin(struct trace_iterator *iter, int flags) @@ -576,7 +555,7 @@ static int trace_fn_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->ip); SEQ_PUT_FIELD_RET(s, field->parent_ip); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_fn_event = { @@ -611,7 +590,7 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) T, comm)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_print(struct trace_iterator *iter, int flags) @@ -644,7 +623,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) T)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_raw(struct trace_iterator *iter, int flags) @@ -678,7 +657,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_hex(struct trace_iterator *iter, int flags) @@ -705,7 +684,7 @@ static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->next_prio); SEQ_PUT_FIELD_RET(s, field->next_state); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_ctx_event = { @@ -739,7 +718,7 @@ static int trace_special_print(struct trace_iterator *iter, int flags) field->arg3)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_special_hex(struct trace_iterator *iter, int flags) @@ -753,7 +732,7 @@ static int trace_special_hex(struct trace_iterator *iter, int flags) SEQ_PUT_HEX_FIELD_RET(s, field->arg2); SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_special_bin(struct trace_iterator *iter, int flags) @@ -767,7 +746,7 @@ static int trace_special_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->arg2); SEQ_PUT_FIELD_RET(s, field->arg3); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_special_event = { @@ -801,7 +780,7 @@ static int trace_stack_print(struct trace_iterator *iter, int flags) goto partial; } - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -830,7 +809,7 @@ static int trace_user_stack_print(struct trace_iterator *iter, int flags) if (!trace_seq_putc(s, '\n')) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -859,7 +838,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags) if (!trace_seq_printf(s, ": %s", field->buf)) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -874,7 +853,7 @@ static int trace_print_raw(struct trace_iterator *iter, int flags) if (!trace_seq_printf(&iter->seq, "# %lx 
%s", field->ip, field->buf)) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From ae7462b4f1fe1f36b5d562dbd5202a2eba01f072 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Feb 2009 22:05:50 -0200 Subject: trace: make the trace_event callbacks return enum print_line_t As they actually all return these enumerators. Reported-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 6 ++++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 3 ++- kernel/trace/trace_output.c | 52 +++++++++++++++++++++++++++------------------ kernel/trace/trace_output.h | 5 +++-- 5 files changed, 41 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 12df27693972..c7698d1617a1 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1158,7 +1158,8 @@ static struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static int blk_trace_event_print(struct trace_iterator *iter, int flags) +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) { struct trace_seq *s = &iter->seq; const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; @@ -1196,7 +1197,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) sizeof(old) - offset + t->pdu_len); } -static int blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) { return blk_trace_synthesize_old_trace(iter) ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5822ff4e5a3e..fd51cf0b94c7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1495,7 +1495,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->hex) { - int ret = event->hex(iter, 0); + enum print_line_t ret = event->hex(iter, 0); if (ret != TRACE_TYPE_HANDLED) return ret; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index f6b35e162dfa..7ac72a44b2d3 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -160,7 +160,8 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_branch_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags) { struct trace_branch *field; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5b3c914053f2..b7380eee9fa1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -463,13 +463,14 @@ int unregister_ftrace_event(struct trace_event *event) * Standard events */ -int trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static int trace_fn_latency(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_latency(struct trace_iterator *iter, + int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -491,7 +492,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t 
trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -518,7 +519,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; @@ -532,7 +533,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -545,7 +546,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -568,7 +569,8 @@ static struct trace_event trace_fn_event = { }; /* TRACE_CTX an TRACE_WAKE */ -static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) { struct ctx_switch_entry *field; char *comm; @@ -593,12 +595,13 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) return TRACE_TYPE_HANDLED; } -static int trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) { return trace_ctxwake_print(iter, "==>"); } -static int trace_wake_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags) { return trace_ctxwake_print(iter, " +"); } @@ -626,12 +629,12 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static int trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) { return trace_ctxwake_raw(iter, 0); } -static int trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) { return trace_ctxwake_raw(iter, '+'); } @@ -660,17 +663,18 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static int trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) { return trace_ctxwake_hex(iter, 0); } -static int trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) { return trace_ctxwake_hex(iter, '+'); } -static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -706,7 +710,8 @@ static struct trace_event trace_wake_event = { }; /* TRACE_SPECIAL */ -static int trace_special_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_special_print(struct trace_iterator *iter, + int flags) { struct special_entry *field; @@ -721,7 +726,8 @@ static int trace_special_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_special_hex(struct trace_iterator *iter, int 
flags) +static enum print_line_t trace_special_hex(struct trace_iterator *iter, + int flags) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -735,7 +741,8 @@ static int trace_special_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_special_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_special_bin(struct trace_iterator *iter, + int flags) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -760,7 +767,8 @@ static struct trace_event trace_special_event = { /* TRACE_STACK */ -static int trace_stack_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -796,7 +804,8 @@ static struct trace_event trace_stack_event = { }; /* TRACE_USER_STACK */ -static int trace_user_stack_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -825,7 +834,8 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static int trace_print_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -844,7 +854,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { struct print_entry *field; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 3aeb31f6506b..551a25a72217 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -3,7 +3,8 @@ #include "trace.h" -typedef int (*trace_print_func)(struct trace_iterator *iter, int flags); +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); struct trace_event { struct hlist_node node; @@ -39,7 +40,7 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -int trace_nop_print(struct trace_iterator *iter, int flags); +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3-59-g8ed1b From 268ccda0cb4d1292029d07ee3dbd07117baf6ecb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Feb 2009 20:16:39 -0200 Subject: trace: assign defaults at register_ftrace_event Impact: simplification of tracers As all tracers are doing this we might as well do it in register_ftrace_event and save one branch each time we call these callbacks. 
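For readers skimming the diff that follows, here is a minimal, self-contained sketch of the pattern in user-space C. The type and function names (sample_event, register_sample_event, nop_print) are simplified stand-ins for illustration only, not the real kernel structures: the registration helper substitutes a no-op handler for every callback left NULL, so call sites can invoke the callbacks unconditionally instead of testing each pointer first.

#include <stdio.h>

struct sample_event;
typedef int (*print_func)(struct sample_event *ev);

struct sample_event {
	int type;
	print_func trace;
	print_func raw;
	print_func hex;
	print_func binary;
};

/* Default callback: print nothing, report success. */
static int nop_print(struct sample_event *ev)
{
	(void)ev;
	return 1;
}

/* Fill in defaults once, at registration time. */
static void register_sample_event(struct sample_event *ev)
{
	if (!ev->trace)
		ev->trace = nop_print;
	if (!ev->raw)
		ev->raw = nop_print;
	if (!ev->hex)
		ev->hex = nop_print;
	if (!ev->binary)
		ev->binary = nop_print;
}

static int my_trace(struct sample_event *ev)
{
	return printf("event type %d\n", ev->type) > 0;
}

int main(void)
{
	struct sample_event ev = { .type = 1, .trace = my_trace };

	register_sample_event(&ev);

	/* No "if (ev.raw)" style checks are needed at the call sites. */
	ev.trace(&ev);
	ev.raw(&ev);
	return 0;
}

The actual patch applies the same idea to the trace, latency_trace, raw, hex and binary members in register_ftrace_event(), which is why the per-callback NULL checks disappear from print_trace_fmt() and friends in the diff below.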
Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 -- kernel/trace/trace.c | 13 +++++-------- kernel/trace/trace_branch.c | 3 --- kernel/trace/trace_output.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index c7698d1617a1..1ebd068061ec 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1243,8 +1243,6 @@ static struct trace_event trace_blk_event = { .type = TRACE_BLK, .trace = blk_trace_event_print, .latency_trace = blk_trace_event_print, - .raw = trace_nop_print, - .hex = trace_nop_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd51cf0b94c7..a5e4c0af9bb0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1412,7 +1412,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) goto partial; } - if (event && event->latency_trace) + if (event) return event->latency_trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) @@ -1441,7 +1441,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) goto partial; } - if (event && event->trace) + if (event) return event->trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) @@ -1467,7 +1467,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->raw) + if (event) return event->raw(iter, 0); if (!trace_seq_printf(s, "%d ?\n", entry->type)) @@ -1494,7 +1494,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->hex) { + if (event) { enum print_line_t ret = event->hex(iter, 0); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1536,10 +1536,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->binary) - return event->binary(iter, 0); - - return TRACE_TYPE_HANDLED; + return event ? 
event->binary(iter, 0) : TRACE_TYPE_HANDLED; } static int trace_empty(struct trace_iterator *iter) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7ac72a44b2d3..297deb202b68 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -182,9 +182,6 @@ static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, .latency_trace = trace_branch_print, - .raw = trace_nop_print, - .hex = trace_nop_print, - .binary = trace_nop_print, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b7380eee9fa1..b6e99af79214 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -435,6 +435,17 @@ int register_ftrace_event(struct trace_event *event) if (ftrace_find_event(event->type)) goto out; + if (event->trace == NULL) + event->trace = trace_nop_print; + if (event->latency_trace == NULL) + event->latency_trace = trace_nop_print; + if (event->raw == NULL) + event->raw = trace_nop_print; + if (event->hex == NULL) + event->hex = trace_nop_print; + if (event->binary == NULL) + event->binary = trace_nop_print; + key = event->type & (EVENT_HASHSIZE - 1); hlist_add_head_rcu(&event->node, &event_hash[key]); @@ -874,8 +885,6 @@ static struct trace_event trace_print_event = { .trace = trace_print_print, .latency_trace = trace_print_print, .raw = trace_print_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, }; static struct trace_event *events[] __initdata = { -- cgit v1.2.3-59-g8ed1b From 97e5b191ae7dc0f4f5b82b9db29782928b103b4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 01:13:36 -0500 Subject: trace_branch: Remove unused function Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 297deb202b68..027e83690615 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -143,23 +143,6 @@ static void branch_trace_reset(struct trace_array *tr) stop_branch_trace(tr); } -static int -trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, entry); - - if (seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t trace_branch_print(struct trace_iterator *iter, int flags) { -- cgit v1.2.3-59-g8ed1b From 7be421510b91491d5aa5a29fa1005712039b95af Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 01:13:37 -0500 Subject: trace: Remove unused trace_array_cpu parameter Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 +- kernel/trace/trace.c | 47 +++++++++++++++------------------------ kernel/trace/trace.h | 4 ---- kernel/trace/trace_functions.c | 8 +++---- kernel/trace/trace_irqsoff.c | 10 ++++----- kernel/trace/trace_sched_switch.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 12 +++++----- 7 files changed, 35 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 1ebd068061ec..d9d7146ee023 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -245,7 +245,7 @@ 
record_it: if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, NULL, flags, 5, pc); + __trace_stack(blk_tr, flags, 5, pc); trace_wake_up(); return; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a5e4c0af9bb0..1d4ff568cc4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -776,7 +776,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } void -trace_function(struct trace_array *tr, struct trace_array_cpu *data, +trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { @@ -802,7 +802,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static void __trace_graph_entry(struct trace_array *tr, - struct trace_array_cpu *data, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -826,7 +825,6 @@ static void __trace_graph_entry(struct trace_array *tr, } static void __trace_graph_return(struct trace_array *tr, - struct trace_array_cpu *data, struct ftrace_graph_ret *trace, unsigned long flags, int pc) @@ -856,11 +854,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, int pc) { if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); } static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { @@ -891,27 +888,24 @@ static void __ftrace_trace_stack(struct trace_array *tr, } static void ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(tr, data, flags, skip, pc); + __ftrace_trace_stack(tr, flags, skip, pc); } void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr, data, flags, skip, pc); + __ftrace_trace_stack(tr, flags, skip, pc); } static void ftrace_trace_userstack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, int pc) + unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; @@ -942,20 +936,17 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif } -void __trace_userstack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags) +void __trace_userstack(struct trace_array *tr, unsigned long flags) { - ftrace_trace_userstack(tr, data, flags, preempt_count()); + ftrace_trace_userstack(tr, flags, preempt_count()); } static void -ftrace_trace_special(void *__tr, void *__data, +ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { struct ring_buffer_event *event; - struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; struct special_entry *entry; unsigned long irq_flags; @@ -971,8 +962,8 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg2 = arg2; entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, irq_flags, 4, pc); - ftrace_trace_userstack(tr, data, irq_flags, pc); + ftrace_trace_stack(tr, irq_flags, 4, pc); + ftrace_trace_userstack(tr, irq_flags, pc); trace_wake_up(); } @@ -981,12 +972,11 @@ void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { - 
ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); + ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); } void tracing_sched_switch_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc) @@ -1010,13 +1000,12 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, flags, 5, pc); - ftrace_trace_userstack(tr, data, flags, pc); + ftrace_trace_stack(tr, flags, 5, pc); + ftrace_trace_userstack(tr, flags, pc); } void tracing_sched_wakeup_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, unsigned long flags, int pc) @@ -1040,8 +1029,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, flags, 6, pc); - ftrace_trace_userstack(tr, data, flags, pc); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); trace_wake_up(); } @@ -1064,7 +1053,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) data = tr->data[cpu]; if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); + ftrace_trace_special(tr, arg1, arg2, arg3, pc); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -1092,7 +1081,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, data, trace, flags, pc); + __trace_graph_entry(tr, trace, flags, pc); } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) @@ -1118,7 +1107,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_return(tr, data, trace, flags, pc); + __trace_graph_return(tr, trace, flags, pc); } if (!trace->depth) clear_tsk_trace_graph(current); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f0c7a0f08cac..df627a948694 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,14 +419,12 @@ void ftrace(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); @@ -436,7 +434,6 @@ void trace_special(struct trace_array *tr, unsigned long arg2, unsigned long arg3, int pc); void trace_function(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); @@ -462,7 +459,6 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3a320f8aba7..d067cea2ccc3 100644 --- a/kernel/trace/trace_functions.c 
+++ b/kernel/trace/trace_functions.c @@ -78,7 +78,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); ftrace_preempt_enable(resched); @@ -108,7 +108,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); } atomic_dec(&data->disabled); @@ -139,7 +139,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); /* * skip over 5 funcs: * __ftrace_trace_stack, @@ -148,7 +148,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) * ftrace_list_func * ftrace_call */ - __trace_stack(tr, data, flags, 5, pc); + __trace_stack(tr, flags, 5, pc); } atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ed344b022a14..c6b442d88de8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -153,7 +153,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); + trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); latency = nsecs_to_usecs(delta); @@ -177,7 +177,7 @@ out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); tracing_reset(tr, cpu); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); + trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -210,7 +210,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -244,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index df175cb4564f..c4f9add5ec90 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -43,7 +43,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, data = ctx_trace->data[cpu]; if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); + tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); local_irq_restore(flags); } @@ -66,7 +66,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) data = ctx_trace->data[cpu]; if (likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, + tracing_sched_wakeup_trace(ctx_trace, wakee, current, flags, pc); local_irq_restore(flags); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a48c9b4b0c85..96d716485898 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -72,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (task_cpu(wakeup_task) != cpu) goto unlock; - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); unlock: __raw_spin_unlock(&wakeup_lock); @@ -152,8 +152,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); - tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -254,10 +254,8 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) data = wakeup_trace->data[wakeup_cpu]; data->preempt_timestamp = ftrace_now(cpu); - tracing_sched_wakeup_trace(wakeup_trace, data, p, current, - flags, pc); - trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, - flags, pc); + tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); -- cgit v1.2.3-59-g8ed1b From dac74940289f350c2590bec92737833bad608541 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 01:13:38 -0500 Subject: trace: code style clean up Ingo Molnar suggested using goto logic to keep the indentation down and to be able to remove the nasty line breaks. This actually makes the code a bit more readable. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1d4ff568cc4d..3536ef41575d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -522,23 +522,24 @@ int register_tracer(struct tracer *type) tracing_selftest_running = false; mutex_unlock(&trace_types_lock); - if (!ret && default_bootup_tracer) { - if (!strncmp(default_bootup_tracer, type->name, - BOOTUP_TRACER_SIZE)) { - printk(KERN_INFO "Starting tracer '%s'\n", - type->name); - /* Do we want this tracer to start on bootup? 
*/ - tracing_set_tracer(type->name); - default_bootup_tracer = NULL; - /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = 1; + if (ret || !default_bootup_tracer) + goto out_unlock; + + if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + goto out_unlock; + + printk(KERN_INFO "Starting tracer '%s'\n", type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. */ + tracing_selftest_disabled = 1; #ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due" - " to running tracer '%s'\n", type->name); + printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", + type->name); #endif - } - } + out_unlock: lock_kernel(); return ret; } -- cgit v1.2.3-59-g8ed1b From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 4 Feb 2009 20:35:48 -0800 Subject: softlockup: check all tasks in hung_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: extend the scope of hung-task checks Changed the default value of hung_task_check_count to PID_MAX_LIMIT. hung_task_batch_count added to put an upper bound on the critical section. Every hung_task_batch_count checks, the rcu lock is never held for a too long time. Keeping the critical section small minimizes time preemption is disabled and keeps rcu grace periods small. To prevent following a stale pointer, get_task_struct is called on g and t. To verify that g and t have not been unhashed while outside the critical section, the task states are checked. The design was proposed by Frédéric Weisbecker. Signed-off-by: Mandeep Singh Baines Suggested-by: Frédéric Weisbecker Acked-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba8ccd432963..481ca8b5c2bc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -17,9 +17,18 @@ #include /* - * Have a reasonable limit on the number of tasks checked: + * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 /* * Zero means infinite timeout - no checking done: @@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now, panic("hung_task: blocked tasks"); } +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * exit the grace period. For classic RCU, a reschedule is required. + */ +static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + put_task_struct(t); + put_task_struct(g); +} + /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). 
If that happens, print out @@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now, static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; unsigned long now = get_timestamp(); struct task_struct *g, *t; @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) do_each_thread(g, t) { if (!--max_count) goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + rcu_lock_break(g, t); + /* Exit if t or g was unhashed during refresh. */ + if (t->state == TASK_DEAD || g->state == TASK_DEAD) + goto unlock; + } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now, timeout); -- cgit v1.2.3-59-g8ed1b From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 5 Feb 2009 09:56:08 -0800 Subject: softlockup: convert read_lock in hung_task to rcu_read_lock Since the tasklist is protected by rcu list operations, it is safe to convert the read_lock()s to rcu_read_lock(). Suggested-by: Peter Zijlstra Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 481ca8b5c2bc..3951a80e7cbe 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, t) { if (!--max_count) goto unlock; @@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); } static void update_poll_jiffies(void) -- cgit v1.2.3-59-g8ed1b From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:12:56 -0200 Subject: ring_buffer: remove unused flags parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: API change, cleanup >From ring_buffer_{lock_reserve,unlock_commit}. 
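In caller terms the change boils down to the following pattern, condensed from the kmemtrace_mark_alloc_node() hunk further down; it is an illustrative excerpt, not a compilable standalone example. Before:

	struct ring_buffer_event *event;
	unsigned long irq_flags;

	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
	if (!event)
		return;
	/* fill in ring_buffer_event_data(event) here */
	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);

After:

	struct ring_buffer_event *event;

	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
	if (!event)
		return;
	/* fill in ring_buffer_event_data(event) here */
	ring_buffer_unlock_commit(tr->buffer, event);

Dropping the extra argument at every caller is presumably where the byte savings reported by codiff below come from.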
$ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -14 trace_graph_return | -14 trace_graph_entry | -10 trace_function | -8 __ftrace_trace_stack | -8 ftrace_trace_userstack | -8 tracing_sched_switch_trace | -8 ftrace_trace_special | -12 tracing_sched_wakeup_trace | -8 9 functions changed, 90 bytes removed, diff: -90 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -1 1 function changed, 1 bytes removed, diff: -1 /tmp/vmlinux.after: 10 functions changed, 91 bytes removed, diff: -91 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +++--- include/linux/ring_buffer.h | 9 +++---- kernel/trace/kmemtrace.c | 12 +++------ kernel/trace/ring_buffer.c | 9 ++----- kernel/trace/trace.c | 56 ++++++++++++++-------------------------- kernel/trace/trace_boot.c | 12 +++------ kernel/trace/trace_branch.c | 7 +++-- kernel/trace/trace_hw_branches.c | 6 ++--- kernel/trace/trace_mmiotrace.c | 12 +++------ kernel/trace/trace_power.c | 12 +++------ 10 files changed, 51 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index d9d7146ee023..8e52f24cc8f9 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -165,7 +165,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct blk_io_trace *t; - unsigned long flags; + unsigned long flags = 0; unsigned long *sequence; pid_t pid; int cpu, pc = 0; @@ -191,7 +191,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current); event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len, &flags); + sizeof(*t) + pdu_len); if (!event) return; @@ -241,11 +241,11 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + ring_buffer_unlock_commit(blk_tr->buffer, event); if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, flags, 5, pc); + __trace_stack(blk_tr, 0, 5, pc); trace_wake_up(); return; } diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..3110d92e7d81 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags); +struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags); + struct ring_buffer_event *event); int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index f04c0625f1cd..256749d1032a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_alloc_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + 
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_free_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..aee76b3eeed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched); * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) - * @flags: a pointer to save the interrupt flags * * Returns a reseverd event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into @@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched); * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags) +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. - * @flags: the interrupt flags received from ring_buffer_lock_reserve. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. 
*/ int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags) + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3536ef41575d..eb453a238a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -783,14 +783,12 @@ trace_function(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long irq_flags; /* If we are reading the ring buffer, don't trace */ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -798,7 +796,7 @@ trace_function(struct trace_array *tr, entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } static void __trace_graph_return(struct trace_array *tr, @@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = 
ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr, struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, irq_flags, 4, pc); - ftrace_trace_userstack(tr, irq_flags, pc); + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, 0, 4, pc); + ftrace_trace_userstack(tr, 0, pc); trace_wake_up(); } @@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 5, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, size); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: spin_unlock_irqrestore(&trace_buf_lock, irq_flags); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1f07895977a0..4e08debf662d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -132,7 +132,6 @@ void 
trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_call *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_ret *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 027e83690615..770e52acfc10 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; - unsigned long flags, irq_flags; + unsigned long flags; int cpu, pc; const char *p; @@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; @@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..e720c001db2b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to) struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq1, irq2; + unsigned long irq1; int cpu; if (unlikely(!tr)) @@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.cpu = cpu; 
entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq2); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ec78e244242e..104ddebc11d1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index faa6ab7a1f5c..3b1a292d12d2 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it) struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type, struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); 
trace_wake_up(); -- cgit v1.2.3-59-g8ed1b From 51a763dd84253bab1d0a1e68e11a7753d1b702ca Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:14:13 -0200 Subject: tracing: Introduce trace_buffer_{lock_reserve,unlock_commit} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: new API These new functions do what previously was being open coded, reducing the number of details ftrace plugin writers have to worry about. It also standardizes the handling of stacktrace, userstacktrace and other trace options we may introduce in the future. With this patch, for instance, the blk tracer (and some others already in the tree) can use the "userstacktrace" /d/tracing/trace_options facility. $ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -5 trace_graph_return | -22 trace_graph_entry | -26 trace_function | -45 __ftrace_trace_stack | -27 ftrace_trace_userstack | -29 tracing_sched_switch_trace | -66 tracing_stop | +1 trace_seq_to_user | -1 ftrace_trace_special | -63 ftrace_special | +1 tracing_sched_wakeup_trace | -70 tracing_reset_online_cpus | -1 13 functions changed, 2 bytes added, 355 bytes removed, diff: -353 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -58 1 function changed, 58 bytes removed, diff: -58 linux-2.6-tip/kernel/trace/trace.c: trace_buffer_lock_reserve | +88 trace_buffer_unlock_commit | +86 2 functions changed, 174 bytes added, diff: +174 /tmp/vmlinux.after: 16 functions changed, 176 bytes added, 413 bytes removed, diff: -237 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 21 +++------ kernel/trace/kmemtrace.c | 19 +++----- kernel/trace/trace.c | 94 ++++++++++++++++++++++------------------ kernel/trace/trace.h | 11 +++++ kernel/trace/trace_boot.c | 20 +++------ kernel/trace/trace_branch.c | 7 ++- kernel/trace/trace_hw_branches.c | 7 ++- kernel/trace/trace_mmiotrace.c | 20 ++++----- kernel/trace/trace_power.c | 20 +++------ 9 files changed, 102 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 8e52f24cc8f9..834cd84037b2 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -187,19 +187,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, cpu = raw_smp_processor_id(); if (blk_tr) { - struct trace_entry *ent; tracing_record_cmdline(current); - event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len); + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); if (!event) return; - - ent = ring_buffer_event_data(event); - t = (struct blk_io_trace *)ent; - pc = preempt_count(); - tracing_generic_entry_update(ent, 0, pc); - ent->type = TRACE_BLK; + t = ring_buffer_event_data(event); goto record_it; } @@ -241,12 +237,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event); - if (pid != 0 && - !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && - (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, 0, 5, pc); - trace_wake_up(); + trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } } diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 256749d1032a..ae201b3eda89 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -276,13 +276,12 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, if 
(!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, + sizeof(*entry), 0, 0); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_KMEM_ALLOC; entry->call_site = call_site; entry->ptr = ptr; entry->bytes_req = bytes_req; @@ -290,9 +289,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, 0); } EXPORT_SYMBOL(kmemtrace_mark_alloc_node); @@ -307,20 +304,16 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, + sizeof(*entry), 0, 0); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; entry->type_id = type_id; entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, 0); } EXPORT_SYMBOL(kmemtrace_mark_free); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eb453a238a6f..8fad3776e843 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -776,6 +776,39 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, + unsigned char type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, len); + if (event != NULL) { + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; + } + + return event; +} +static void ftrace_trace_stack(struct trace_array *tr, + unsigned long flags, int skip, int pc); +static void ftrace_trace_userstack(struct trace_array *tr, + unsigned long flags, int pc); + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + ring_buffer_unlock_commit(tr->buffer, event); + + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); + trace_wake_up(); +} + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -788,12 +821,11 @@ trace_function(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), + flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; ring_buffer_unlock_commit(tr->buffer, event); @@ -811,12 +843,11 @@ static void __trace_graph_entry(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - 
tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); } @@ -832,12 +863,11 @@ static void __trace_graph_return(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); } @@ -861,13 +891,11 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct stack_entry *entry; struct stack_trace trace; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_STACK, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_STACK; - memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -908,12 +936,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_USER_STACK; memset(&entry->caller, 0, sizeof(entry->caller)); @@ -941,20 +968,15 @@ ftrace_trace_special(void *__tr, struct trace_array *tr = __tr; struct special_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, + sizeof(*entry), 0, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, pc); - entry->ent.type = TRACE_SPECIAL; entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, 0, 4, pc); - ftrace_trace_userstack(tr, 0, pc); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void @@ -973,12 +995,11 @@ tracing_sched_switch_trace(struct trace_array *tr, struct ring_buffer_event *event; struct ctx_switch_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_CTX, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_CTX; entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; entry->prev_state = prev->state; @@ -986,9 +1007,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 5, pc); - ftrace_trace_userstack(tr, flags, pc); + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1000,12 +1019,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ring_buffer_event *event; struct ctx_switch_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + sizeof(*entry), flags, pc); if 
(!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_WAKE; entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; entry->prev_state = curr->state; @@ -1013,11 +1031,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -2825,12 +2839,10 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size); + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, irq_flags, pc); - entry->ent.type = TRACE_PRINT; entry->ip = ip; entry->depth = depth; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index df627a948694..e03f157c772e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -403,6 +403,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct ring_buffer_event; + +struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, + unsigned char type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 4e08debf662d..7a30fc4c3642 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -143,17 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -170,17 +166,13 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 770e52acfc10..48b2196abe37 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -52,14 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if 
(atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + pc = preempt_count(); + event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + sizeof(*entry), flags, pc); if (!event) goto out; - pc = preempt_count(); entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_BRANCH; /* Strip off the path, only save the file */ p = f->file + strlen(f->file); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e720c001db2b..2aa1c9f4c7d8 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -189,16 +189,15 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event); + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 104ddebc11d1..c401b908e805 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,19 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; + int pc = preempt_count(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, + sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); return; } entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, preempt_count()); - entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -335,19 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; + int pc = preempt_count(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, + sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); return; } entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, preempt_count()); - entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 3b1a292d12d2..bfc21f8079ab 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -124,17 +124,13 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, 
event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -159,17 +155,13 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } -- cgit v1.2.3-59-g8ed1b From b6f11df26fdc28324cf9c9e3b77f2dc985c1bb13 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 18:02:00 -0200 Subject: trace: Call tracing_reset_online_cpus before tracer->init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup To make it easy for ftrace plugin writers, as this was open coded in the existing plugins Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 -- kernel/trace/trace.c | 8 +++++++- kernel/trace/trace.h | 1 + kernel/trace/trace_branch.c | 1 - kernel/trace/trace_functions.c | 17 +++-------------- kernel/trace/trace_functions_graph.c | 1 - kernel/trace/trace_hw_branches.c | 1 - kernel/trace/trace_nop.c | 1 - kernel/trace/trace_sched_switch.c | 8 +------- kernel/trace/trace_selftest.c | 18 +++++++++--------- kernel/trace/trace_sysprof.c | 14 ++++---------- 11 files changed, 25 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/block/blktrace.c b/block/blktrace.c index 834cd84037b2..ca6d32061e4f 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1086,8 +1086,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); - mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) if (blk_register_tracepoints()) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8fad3776e843..ef4dbac95568 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2171,6 +2171,12 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +int tracer_init(struct tracer *t, struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); + return t->init(tr); +} + static int tracing_set_tracer(const char *buf) { struct trace_array *tr = &global_trace; @@ -2195,7 +2201,7 @@ static int tracing_set_tracer(const char *buf) current_trace = t; if (t->init) { - ret = t->init(tr); + ret = tracer_init(t, tr); if (ret) goto out; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e03f157c772e..f2742fb1575a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,7 @@ struct trace_iterator { cpumask_var_t started; }; +int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 48b2196abe37..f8ae2c50e01d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -131,7 +131,6 @@ static void stop_branch_trace(struct trace_array *tr) static int branch_trace_init(struct trace_array *tr) { - 
tracing_reset_online_cpus(tr); start_branch_trace(tr); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d067cea2ccc3..36bf9568ccd9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -24,32 +24,21 @@ static struct trace_array *func_trace; static void tracing_start_function_trace(void); static void tracing_stop_function_trace(void); -static void start_function_trace(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { func_trace = tr; tr->cpu = get_cpu(); - tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); tracing_start_function_trace(); -} - -static void stop_function_trace(struct trace_array *tr) -{ - tracing_stop_function_trace(); - tracing_stop_cmdline_record(); -} - -static int function_trace_init(struct trace_array *tr) -{ - start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - stop_function_trace(tr); + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); } static void function_trace_start(struct trace_array *tr) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c97594d826bc..222f97d336a6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -56,7 +56,6 @@ static int graph_trace_init(struct trace_array *tr) &trace_graph_entry); if (ret) return ret; - tracing_reset_online_cpus(tr); tracing_start_cmdline_record(); return 0; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 2aa1c9f4c7d8..ca4bbcfb9e2c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -132,7 +132,6 @@ static int bts_trace_init(struct trace_array *tr) hw_branch_trace = tr; register_hotcpu_notifier(&bts_hotcpu_notifier); - tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 087b6cbf4ea5..9aa84bde23cd 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -48,7 +48,6 @@ static void stop_nop_trace(struct trace_array *tr) static int nop_trace_init(struct trace_array *tr) { ctx_trace = tr; - tracing_reset_online_cpus(tr); start_nop_trace(tr); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c4f9add5ec90..30e14fe85896 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -185,12 +185,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void start_sched_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); -} - static void stop_sched_trace(struct trace_array *tr) { tracing_stop_sched_switch_record(); @@ -199,7 +193,7 @@ static void stop_sched_trace(struct trace_array *tr) static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - start_sched_trace(tr); + tracing_start_sched_switch_record(); return 0; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5013812578b1..445700e51f6d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -115,7 +115,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -189,7 +189,7 @@ 
trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -236,7 +236,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -290,7 +290,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -344,7 +344,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -476,7 +476,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -537,7 +537,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -569,7 +569,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return 0; @@ -596,7 +596,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index eaca5ad803ff..84ca9d81e74d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -226,15 +226,6 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void start_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - tracing_reset_online_cpus(tr); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); -} - static void stop_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); @@ -247,7 +238,10 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - start_stack_trace(tr); + mutex_lock(&sample_timer_lock); + start_stack_timers(); + tracer_enabled = 1; + mutex_unlock(&sample_timer_lock); return 0; } -- cgit v1.2.3-59-g8ed1b From 1830b52d0de8c60c4f5dfbac134aa8f69d815801 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 7 Feb 2009 19:38:43 -0500 Subject: trace: remove deprecated entry->cpu Impact: fix to prevent developers from using entry->cpu With the new ring buffer infrastructure, the cpu for the entry is implicit with which CPU buffer it is on. The original code use to record the current cpu into the generic entry header, which can be retrieved by entry->cpu. When the ring buffer was introduced, the users were convert to use the the cpu number of which cpu ring buffer was in use (this was passed to the tracers by the iterator: iter->cpu). Unfortunately, the cpu item in the entry structure was never removed. 
This allowed for developers to use it instead of the proper iter->cpu, unknowingly, using an uninitialized variable. This was not the fault of the developers, since it would seem like the logical place to retrieve the cpu identifier. This patch removes the cpu item from the entry structure and fixes all the users that should have been using iter->cpu. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 3 +-- kernel/trace/trace_output.c | 6 +++--- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd51cf0b94c7..bd4d9f8818fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1531,7 +1531,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f0c7a0f08cac..5efc4c707f7e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,7 +45,6 @@ enum trace_type { */ struct trace_entry { unsigned char type; - unsigned char cpu; unsigned char flags; unsigned char preempt_count; int pid; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..549238a9b133 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -159,7 +159,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", entry->cpu) && + if (trace_seq_printf(seq, "%4d ", iter->cpu) && seq_print_ip_sym(seq, it->to, symflags) && trace_seq_printf(seq, "\t <- ") && seq_print_ip_sym(seq, it->from, symflags) && @@ -195,7 +195,6 @@ void trace_hw_branch(u64 from, u64 to) entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = cpu; entry->from = from; entry->to = to; ring_buffer_unlock_commit(tr->buffer, event, irq2); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b7380eee9fa1..463a310b1d3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter) unsigned long secs = (unsigned long)t; return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, entry->cpu, secs, usec_rem); + comm, entry->pid, iter->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) @@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter) char *comm = trace_find_cmdline(entry->pid); ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, entry->cpu, entry->flags, + entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, ns2usecs(iter->ts), abs_usecs / USEC_PER_MSEC, @@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs / USEC_PER_MSEC, rel_usecs % USEC_PER_MSEC); } else { - ret = lat_print_generic(s, entry, entry->cpu); + ret = lat_print_generic(s, entry, iter->cpu); if (ret) ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } -- cgit v1.2.3-59-g8ed1b From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 
18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..a6be725cb049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..4c683587055b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..29de6779a963 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define 
_LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..a60a6a852f42 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. 
+ */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3-59-g8ed1b From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++-- include/linux/ftrace_irq.h | 8 -------- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 3 files changed, 15 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 918073c6681b..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void arch_ftrace_nmi_enter(void) +void ftrace_nmi_enter(void) { atomic_inc(&nmi_running); /* Must have nmi_running seen before reading write flag */ @@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void) } } -void arch_ftrace_nmi_exit(void) +void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ smp_wmb(); diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 29de6779a963..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -3,14 +3,6 @@ #ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - -#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f42..5ee344417cd5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. 
Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. */ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit v1.2.3-59-g8ed1b From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Fri, 6 Feb 2009 17:33:27 +0800 Subject: trace: trivial fixes in comment typos. Impact: clean up Fixed several typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7840e718c6c7..5e302d636fc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** - * ftrace_make_nop - convert code into top + * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization * @rec: the mcount call site record * @addr: the address that the call site should be calling diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780b..1796e018fbff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5efc4c707f7e..f92aba52a894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,12 +616,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. 
* - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. -- cgit v1.2.3-59-g8ed1b From 2db270a80b8f2238e536876cfb3987af02684df8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 20:46:45 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace Impact: cleanup Move blktrace.c to kernel/trace, also move its config entry. Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- block/Kconfig | 24 - block/Makefile | 1 - block/blktrace.c | 1538 ----------------------------------------------- kernel/trace/Kconfig | 23 + kernel/trace/Makefile | 1 + kernel/trace/blktrace.c | 1538 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1562 insertions(+), 1563 deletions(-) delete mode 100644 block/blktrace.c create mode 100644 kernel/trace/blktrace.c (limited to 'kernel') diff --git a/block/Kconfig b/block/Kconfig index 7cdaa1d72252..e7d12782bcfb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -44,30 +44,6 @@ config LBD If unsure, say N. -config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" - depends on SYSFS - select RELAY - select DEBUG_FS - select TRACEPOINTS - select TRACING - select STACKTRACE - help - Say Y here if you want to be able to trace the block layer actions - on a given queue. Tracing allows you to see any traffic happening - on a block device queue. For more information (and the userspace - support tools needed), fetch the blktrace tools from: - - git://git.kernel.dk/blktrace.git - - Tracing also is possible using the ftrace interface, e.g.: - - echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe - - If unsure, say N. - config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/block/Makefile b/block/Makefile index bfe73049f939..e9fa4dd690f2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blktrace.c b/block/blktrace.c deleted file mode 100644 index ca6d32061e4f..000000000000 --- a/block/blktrace.c +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Copyright (C) 2006 Jens Axboe - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <../kernel/trace/trace_output.h> - -static unsigned int blktrace_seq __read_mostly = 1; - -static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 - -static struct tracer_opt blk_tracer_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, - { } -}; - -static struct tracer_flags blk_tracer_flags = { - .val = 0, - .opts = blk_tracer_opts, -}; - -/* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); -static atomic_t blk_probes_ref = ATOMIC_INIT(0); - -static int blk_register_tracepoints(void); -static void blk_unregister_tracepoints(void); - -/* - * Send out a notify message. - */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) -{ - struct blk_io_trace *t; - - if (!bt->rchan) - return; - - t = relay_reserve(bt->rchan, sizeof(*t) + len); - if (t) { - const int cpu = smp_processor_id(); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - t->device = bt->dev; - t->action = action; - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); - } -} - -/* - * Send out a notify for this process, if we haven't done so since a trace - * started - */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) -{ - tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); -} - -static void trace_note_time(struct blk_trace *bt) -{ - struct timespec now; - unsigned long flags; - u32 words[2]; - - getnstimeofday(&now); - words[0] = now.tv_sec; - words[1] = now.tv_nsec; - - local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); - local_irq_restore(flags); -} - -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
-{ - int n; - va_list args; - unsigned long flags; - char *buf; - - if (blk_tr) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) - return; - - local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); - va_start(args, fmt); - n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); - va_end(args); - - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(__trace_note_message); - -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, - pid_t pid) -{ - if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) - return 1; - if (sector < bt->start_lba || sector > bt->end_lba) - return 1; - if (bt->pid && pid != bt->pid) - return 1; - - return 0; -} - -/* - * Data direction bit lookup - */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; - -/* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) - -/* - * The worker for the various blk_add_trace*() types. Fills out a - * blk_io_trace structure and places it in a per-cpu subbuffer. - */ -static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) -{ - struct task_struct *tsk = current; - struct ring_buffer_event *event = NULL; - struct blk_io_trace *t; - unsigned long flags = 0; - unsigned long *sequence; - pid_t pid; - int cpu, pc = 0; - - if (unlikely(bt->trace_state != Blktrace_running || - !blk_tracer_enabled)) - return; - - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, AHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - - pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) - return; - cpu = raw_smp_processor_id(); - - if (blk_tr) { - tracing_record_cmdline(current); - - pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, - sizeof(*t) + pdu_len, - 0, pc); - if (!event) - return; - t = ring_buffer_event_data(event); - goto record_it; - } - - /* - * A word about the locking here - we disable interrupts to reserve - * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. - */ - local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. 
- */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len; - - if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - - if (blk_tr) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); - return; - } - } - - local_irq_restore(flags); -} - -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - -static void blk_trace_cleanup(struct blk_trace *bt) -{ - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); - relay_close(bt->rchan); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - kfree(bt); - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -int blk_trace_remove(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (!bt) - return -EINVAL; - - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) - blk_trace_cleanup(bt); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_remove); - -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct blk_trace *bt = filp->private_data; - char buf[16]; - - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); - - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -static const struct file_operations blk_dropped_fops = { - .owner = THIS_MODULE, - .open = blk_dropped_open, - .read = blk_dropped_read, -}; - -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *msg; - struct blk_trace *bt; - - if (count > BLK_TN_MAX_MSG) - return -EINVAL; - - msg = kmalloc(count, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } - - bt = filp->private_data; - __trace_note_message(bt, "%s", msg); - kfree(msg); - - return count; -} - -static const struct file_operations blk_msg_fops = { - .owner = THIS_MODULE, - .open = blk_msg_open, - .write = blk_msg_write, -}; - -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - -static int blk_remove_buf_file_callback(struct dentry *dentry) -{ - struct dentry *parent = dentry->d_parent; - debugfs_remove(dentry); - - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. 
- */ - - debugfs_remove(parent); - return 0; -} - -static struct dentry *blk_create_buf_file_callback(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, - .create_buf_file = blk_create_buf_file_callback, - .remove_buf_file = blk_remove_buf_file_callback, -}; - -/* - * Setup everything required to start tracing - */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) -{ - struct blk_trace *old_bt, *bt = NULL; - struct dentry *dir = NULL; - int ret, i; - - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->sequence = alloc_percpu(unsigned long); - if (!bt->sequence) - goto err; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); - if (!bt->msg_data) - goto err; - - ret = -ENOENT; - - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - return -ENOMEM; - } - - dir = debugfs_create_dir(buts->name, blk_tree_root); - - if (!dir) - goto err; - - bt->dir = dir; - bt->dev = dev; - atomic_set(&bt->dropped, 0); - - ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - if (!bt->dropped_file) - goto err; - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; - - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); - if (!bt->rchan) - goto err; - - bt->act_mask = buts->act_mask; - if (!bt->act_mask) - bt->act_mask = (u16) -1; - - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; - - bt->pid = buts->pid; - bt->trace_state = Blktrace_setup; - - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); - - ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); - goto err; - } - - return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); -err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } - return ret; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - int ret; - - ret = copy_from_user(&buts, arg, sizeof(buts)); - if (ret) - return -EFAULT; - - ret = do_blk_trace_setup(q, name, dev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts, sizeof(buts))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_setup); - -int blk_trace_startstop(struct request_queue *q, int start) -{ - int ret; - struct blk_trace *bt = q->blk_trace; - - if (bt == NULL) - return -EINVAL; - - /* - 
* For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(blk_trace_startstop); - -/** - * blk_trace_ioctl: - handle the ioctls associated with tracing - * @bdev: the block device - * @cmd: the ioctl cmd - * @arg: the argument data, if any - * - **/ -int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) -{ - struct request_queue *q; - int ret, start = 0; - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - mutex_lock(&bdev->bd_mutex); - - switch (cmd) { - case BLKTRACESETUP: - bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); - break; - case BLKTRACESTART: - start = 1; - case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); - break; - case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); - break; - default: - ret = -ENOTTY; - break; - } - - mutex_unlock(&bdev->bd_mutex); - return ret; -} - -/** - * blk_trace_shutdown: - stop and cleanup trace structures - * @q: the request queue associated with the device - * - **/ -void blk_trace_shutdown(struct request_queue *q) -{ - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); - } -} - -/* - * blktrace probes - */ - -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - rw, what, rq->errors, 0, NULL); - } -} - -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ABORT); -} - -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_INSERT); -} - -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); -} - -static void blk_add_trace_rq_requeue(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -} - -static void blk_add_trace_rq_complete(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. 
- * - **/ -static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); -} - -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); -} - -static void blk_add_trace_bio_backmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); -} - -static void blk_add_trace_bio_frontmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); -} - -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); -} - -static void blk_add_trace_getrq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); - } -} - - -static void blk_add_trace_sleeprq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); - } -} - -static void blk_add_trace_plug(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); -} - -static void blk_add_trace_unplug_io(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_unplug_timer(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, - unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); - } -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. 
- * - **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} -EXPORT_SYMBOL_GPL(blk_add_driver_data); - -static int blk_register_tracepoints(void) -{ - int ret; - - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); - WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); - WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); - WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); - WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); - WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); - WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); - WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); - WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); - WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); - WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); - WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); - WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); - WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); - WARN_ON(ret); - return 0; -} - -static void blk_unregister_tracepoints(void) -{ - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - 
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); - - tracepoint_synchronize_unregister(); -} - -/* - * struct blk_io_tracer formatting routines - */ - -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) -{ - int i = 0; - - if (t->action & BLK_TC_DISCARD) - rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) - rwbs[i++] = 'W'; - else if (t->bytes) - rwbs[i++] = 'R'; - else - rwbs[i++] = 'N'; - - if (t->action & BLK_TC_AHEAD) - rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) - rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) - rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) - rwbs[i++] = 'M'; - - rwbs[i] = '\0'; -} - -static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) -{ - return (const struct blk_io_trace *)ent; -} - -static inline const void *pdu_start(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent) + 1; -} - -static inline u32 t_sec(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->bytes >> 9; -} - -static inline unsigned long long t_sector(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static inline __u16 t_error(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static __u64 get_pdu_int(const struct trace_entry *ent) -{ - const __u64 *val = pdu_start(ent); - return be64_to_cpu(*val); -} - -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; - - r->device = be32_to_cpu(__r->device); - r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); -} - -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) -{ - char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); - unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; - - fill_rwbs(rwbs, t); - - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); -} - -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) -{ - char rwbs[6]; - fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) -{ - const char *cmd = trace_find_cmdline(ent->pid); - - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); -} - -static int blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) -{ - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); -} - -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -{ - struct blk_io_trace_remap r = { .device = 0, }; - - get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); -} - -static int 
blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); -} - -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); -} - -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); -} - -/* - * struct tracer operations - */ - -static void blk_tracer_print_header(struct seq_file *m) -{ - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return; - seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" - "# | | | | | |\n"); -} - -static void blk_tracer_start(struct trace_array *tr) -{ - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; -} - -static int blk_tracer_init(struct trace_array *tr) -{ - blk_tr = tr; - blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); - return 0; -} - -static void blk_tracer_stop(struct trace_array *tr) -{ - trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -static void blk_tracer_reset(struct trace_array *tr) -{ - if (!atomic_read(&blk_probes_ref)) - return; - - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - - blk_tracer_stop(tr); -} - -static struct { - const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, - [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, - [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, - [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, - [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, - [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, - [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, - [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, - [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, - [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, - [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, - [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, - [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, -}; - -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) -{ - struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); - int ret; - - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); - } - - return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); - struct blk_io_trace old = { - .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), - }; - - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); -} - -static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) -{ - return blk_trace_synthesize_old_trace(iter) ? - TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) -{ - const struct blk_io_trace *t; - u16 what; - int ret; - - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return TRACE_TYPE_UNHANDLED; - - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static struct tracer blk_tracer __read_mostly = { - .name = "blk", - .init = blk_tracer_init, - .reset = blk_tracer_reset, - .start = blk_tracer_start, - .stop = blk_tracer_stop, - .print_header = blk_tracer_print_header, - .print_line = blk_tracer_print_line, - .flags = &blk_tracer_flags, -}; - -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, - .trace = blk_trace_event_print, - .latency_trace = blk_trace_event_print, - .binary = blk_trace_event_print_binary, -}; - -static int __init init_blk_tracer(void) -{ - if (!register_ftrace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); - return 1; - } - - if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); - return 1; - } - - return 0; -} - -device_initcall(init_blk_tracer); - -static int blk_trace_remove_queue(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (bt == NULL) - return -EINVAL; - - kfree(bt); - return 0; -} - -/* - * Setup everything required to start tracing - */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) -{ - struct blk_trace *old_bt, *bt = NULL; - int ret; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->dev = dev; - bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; - - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - kfree(bt); - ret = -EBUSY; - } - return 0; -err: - return ret; -} - -/* - * sysfs interface to enable and configure tracing - */ - -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", 
!!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf); -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#define BLK_TRACE_DEVICE_ATTR(_name) \ - DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ - sysfs_blk_trace_attr_show, \ - sysfs_blk_trace_attr_store) - -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); -static BLK_TRACE_DEVICE_ATTR(act_mask); -static BLK_TRACE_DEVICE_ATTR(pid); -static BLK_TRACE_DEVICE_ATTR(start_lba); -static BLK_TRACE_DEVICE_ATTR(end_lba); - -static struct attribute *blk_trace_attrs[] = { - &dev_attr_enable.attr, - &dev_attr_act_mask.attr, - &dev_attr_pid.attr, - &dev_attr_start_lba.attr, - &dev_attr_end_lba.attr, - NULL -}; - -struct attribute_group blk_trace_attr_group = { - .name = "trace", - .attrs = blk_trace_attrs, -}; - -static int blk_str2act_mask(const char *str) -{ - int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; - - if (copy == NULL) - return -ENOMEM; - - s = strstrip(copy); - - while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) - break; - - s = sep + 1; - } - kfree(copy); - - return mask; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q; - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&bdev->bd_mutex); - if (q->blk_trace == NULL) - ret = sprintf(buf, "disabled\n"); - else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); - else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); - else if (attr == &dev_attr_start_lba) - ret = 
sprintf(buf, "%llu\n", q->blk_trace->start_lba); - else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - u64 value; - ssize_t ret = -ENXIO; - - if (count == 0) - goto out; - - if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { - /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) - goto out; - } - } else if (sscanf(buf, "%llu", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - ret = 0; - if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - - if (ret == 0) { - if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; - else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; - else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; - else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; - ret = count; - } - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 25131a5d5e4f..4fee43c01942 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -302,6 +302,29 @@ config WORKQUEUE_TRACER For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. +config BLK_DEV_IO_TRACE + bool "Support for tracing block io actions" + depends on SYSFS + select RELAY + select DEBUG_FS + select TRACEPOINTS + select TRACING + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + + If unsure, say N. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f76d48f3527d..627090bc262d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 000000000000..3b91da064820 --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1538 @@ +/* + * Copyright (C) 2006 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_output.h" + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static int blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. + */ +static void trace_note(struct blk_trace *bt, pid_t pid, int action, + const void *data, size_t len) +{ + struct blk_io_trace *t; + + if (!bt->rchan) + return; + + t = relay_reserve(bt->rchan, sizeof(*t) + len); + if (t) { + const int cpu = smp_processor_id(); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->time = ktime_to_ns(ktime_get()); + t->device = bt->dev; + t->action = action; + t->pid = pid; + t->cpu = cpu; + t->pdu_len = len; + memcpy((void *) t + sizeof(*t), data, len); + } +} + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +{ + tsk->btrace_seq = blktrace_seq; + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); +} + +static void trace_note_time(struct blk_trace *bt) +{ + struct timespec now; + unsigned long flags; + u32 words[2]; + + getnstimeofday(&now); + words[0] = now.tv_sec; + words[1] = now.tv_nsec; + + local_irq_save(flags); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + local_irq_restore(flags); +} + +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
+{ + int n; + va_list args; + unsigned long flags; + char *buf; + + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + + local_irq_save(flags); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector < bt->start_lba || sector > bt->end_lba) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; + pid_t pid; + int cpu, pc = 0; + + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) + return; + + what |= ddir_act[rw & WRITE]; + what |= MASK_TC_BIT(rw, BARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, META); + what |= MASK_TC_BIT(rw, DISCARD); + + pid = tsk->pid; + if (unlikely(act_log_check(bt, what, sector, pid))) + return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + tracing_record_cmdline(current); + + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. + */ + local_irq_save(flags); + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(bt, tsk); + + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. 
+ */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = bt->dev; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + trace_buffer_unlock_commit(blk_tr, event, 0, pc); + return; + } + } + + local_irq_restore(flags); +} + +static struct dentry *blk_tree_root; +static DEFINE_MUTEX(blk_tree_mutex); + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + debugfs_remove(bt->msg_file); + debugfs_remove(bt->dropped_file); + relay_close(bt->rchan); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + kfree(bt); + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +int blk_trace_remove(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) + blk_trace_cleanup(bt); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_remove); + +static int blk_dropped_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static const struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = blk_dropped_open, + .read = blk_dropped_read, +}; + +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. + */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + debugfs_remove(dentry); + + /* + * this will fail for all but the last file, but that is ok. what we + * care about is the top level buts->name directory going away, when + * the last trace file is gone. Then we don't have to rmdir() that + * manually on trace stop, so it nicely solves the issue with + * force killing of running traces. 
+ */ + + debugfs_remove(parent); + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +/* + * Setup everything required to start tracing + */ +int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct blk_user_trace_setup *buts) +{ + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + int ret, i; + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts->name); i++) + if (buts->name[i] == '/') + buts->name[i] = '_'; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + + ret = -ENOENT; + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + return -ENOMEM; + } + + dir = debugfs_create_dir(buts->name, blk_tree_root); + + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + bt->start_lba = buts->start_lba; + bt->end_lba = buts->end_lba; + if (!bt->end_lba) + bt->end_lba = -1ULL; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) { + ret = blk_register_tracepoints(); + if (ret) + goto probe_err; + } + mutex_unlock(&blk_probe_mutex); + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + return 0; +probe_err: + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); +err: + if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); + if (bt->dropped_file) + debugfs_remove(bt->dropped_file); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + if (bt->rchan) + relay_close(bt->rchan); + kfree(bt); + } + return ret; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + struct blk_trace *bt = q->blk_trace; + + if (bt == NULL) + return -EINVAL; + + /* + 
* For starting a trace, we can transition from a setup or stopped + * trace. For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + + trace_note_time(bt); + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_startstop); + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + struct request_queue *q; + int ret, start = 0; + char b[BDEVNAME_SIZE]; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + bdevname(bdev, b); + ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + break; + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(struct request_queue *q) +{ + if (q->blk_trace) { + blk_trace_startstop(q, 0); + blk_trace_remove(q); + } +} + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->cmd_flags & 0x03; + + if (likely(!bt)) + return; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, + sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + rw, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. 
+ * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +} + +static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +} + +static void blk_add_trace_bio_backmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +} + +static void blk_add_trace_bio_frontmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +} + +static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +} + +static void blk_add_trace_getrq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); + } +} + +static void blk_add_trace_plug(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug_io(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_unplug_timer(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. 
+ * + **/ +static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static int blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug); + WARN_ON(ret); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + WARN_ON(ret); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split); + WARN_ON(ret); + ret = register_trace_block_remap(blk_add_trace_remap); + WARN_ON(ret); + return 0; +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_remap(blk_add_trace_remap); + unregister_trace_block_split(blk_add_trace_split); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); + unregister_trace_block_plug(blk_add_trace_plug); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq); + unregister_trace_block_getrq(blk_add_trace_getrq); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete); + 
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int 
blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + trace_flags |= TRACE_ITER_CONTEXT_INFO; + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, iter->ent); + } + + return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = ns2usecs(iter->ts), + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +{ + return blk_trace_synthesize_old_trace(iter) ? + TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", 
!!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = 
sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} -- cgit v1.2.3-59-g8ed1b From 7447dce96f2233d250bc39a4a10a42f7c3dd46fc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 21:33:57 +0100 Subject: tracing/function-graph-tracer: provide a selftest for the function graph tracer Making it more easy to do a basic regression test for this tracer. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_functions_graph.c | 3 +++ kernel/trace/trace_selftest.c | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b9838f4a6929..a011ec062225 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -500,6 +500,8 @@ extern int DYN_FTRACE_TEST_NAME(void); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr); extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptoff(struct tracer *trace, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 222f97d336a6..88f8d9d80a93 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -750,6 +750,9 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function_graph, +#endif }; static __init int init_graph_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 445700e51f6d..0c9aa1457e51 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,8 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_PRINT: case TRACE_SPECIAL: case TRACE_BRANCH: + case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RET: return 1; } return 0; @@ -227,6 +229,54 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * Pretty much the same than for the function tracer from which the selftest + * has been borrowed. + */ +int +trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr) +{ + int ret; + unsigned long count; + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + + tracing_stop(); + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* Don't test dynamic tracing, the function tracer already did */ + +out: + /* Stop it if we failed */ + if (ret) + ftrace_graph_stop(); + + return ret; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + #ifdef CONFIG_IRQSOFF_TRACER int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 22:16:12 +0100 Subject: tracing/power: move the power trace headers to a dedicated file Impact: cleanup Move the power tracer headers to trace/power.h to keep ftrace.h and power bits more easy to maintain as separated topics. 
Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/process.c | 2 +- include/linux/ftrace.h | 30 ------------------------- include/trace/power.h | 35 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_power.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 include/trace/power.h (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..7ed925edf4d2 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..026819ffcb0c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5e302d636fc2..106b7909d500 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { -#ifdef CONFIG_POWER_TRACER - ktime_t stamp; - ktime_t end; - int type; - int state; -#endif -}; - -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif - - /* * Structure that defines an entry function trace. 
 */
diff --git a/include/trace/power.h b/include/trace/power.h
new file mode 100644
index 000000000000..c7cefbcdaea4
--- /dev/null
+++ b/include/trace/power.h
@@ -0,0 +1,35 @@
+#ifndef _TRACE_POWER_H
+#define _TRACE_POWER_H
+
+#include
+
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t stamp;
+	ktime_t end;
+	int type;
+	int state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+				unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+				unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+				unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+				unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
+#endif /* _TRACE_POWER_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a011ec062225..1ecfb9d2b365 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bfc21f8079ab..b1d0d087d3a6 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,7 +11,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
-- cgit v1.2.3-59-g8ed1b


From 3861a17bcc0af815f684c6178bc9ec2d790c350e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sun, 8 Feb 2009 00:04:02 +0100
Subject: tracing/function-graph-tracer: drop the kernel_text_address check

When the function graph tracer picks a return address, it ensures this
address is really a kernel text one by calling __kernel_text_address().

Actually, this path has never been taken. Its role was most likely to help
debug the tracer at the beginning of its development, but the check is
wasteful since __kernel_text_address() is called for every traced function.
The fault check is already sufficient.
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 7 ------- kernel/extable.c | 4 ++-- kernel/module.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d74d75e0952d..18828aee8781 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -491,13 +491,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (unlikely(!__kernel_text_address(old))) { - ftrace_graph_stop(); - *parent = old; - WARN_ON(1); - return; - } - calltime = cpu_clock(raw_smp_processor_id()); if (push_return_trace(old, calltime, diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..0df6253730be 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -__notrace_funcgraph int core_kernel_text(unsigned long addr) +int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 0; } -__notrace_funcgraph int __kernel_text_address(unsigned long addr) +int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/module.c b/kernel/module.c index ba22484a987e..22d7379709da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2735,7 +2735,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +struct module *__module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3-59-g8ed1b From b5db03c4355e568f1567758287b30a6a262d5057 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 7 Feb 2009 18:52:59 -0200 Subject: tracing: handle unregistering the current tracer Impact: simplification Instead of requiring that plugins have the sequence: my_tracer_stop(my_trace_array); unregister_tracer(my_tracer); it should be possible just do a: unregister_tracer(my_tracer); Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 03fbd4c20bc2..93040f1bef13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -559,6 +559,15 @@ void unregister_tracer(struct tracer *type) found: *t = (*t)->next; + + if (type == current_trace && tracer_enabled) { + tracer_enabled = 0; + tracing_stop(); + if (current_trace->stop) + current_trace->stop(&global_trace); + current_trace = &nop_trace; + } + if (strlen(type->name) != max_tracer_type_len) goto out; -- cgit v1.2.3-59-g8ed1b From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. 
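To make the reasoning concrete, here is a condensed editor's sketch of the
invariant this relies on; it is an illustration, not code from this patch,
and task_sketch / task_looks_hung() are hypothetical stand-ins for
task_struct and the real check:

	/*
	 * Sketch only: if the watchdog scans the task list at most once
	 * every sysctl_hung_task_timeout_secs seconds, then any task whose
	 * context-switch count is unchanged between two consecutive scans
	 * has been blocked for at least that long, so no per-task
	 * timestamp is needed to prove it.
	 */
	struct task_sketch {			/* stands in for task_struct */
		unsigned long nvcsw, nivcsw;	/* voluntary/involuntary switches */
		unsigned long last_switch_count;
	};

	static int task_looks_hung(struct task_sketch *t)
	{
		unsigned long switch_count = t->nvcsw + t->nivcsw;

		if (switch_count != t->last_switch_count) {
			t->last_switch_count = switch_count;	/* made progress */
			return 0;
		}
		return 1;	/* unchanged across a full timeout interval */
	}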
Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 3 files changed, 12 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239d..e0d723fea9f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ diff --git a/kernel/fork.c b/kernel/fork.c index fb9444282836..bf582f75014b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cbe..0c924de58cb2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. 
- */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-59-g8ed1b From 1dfba05d0f1a9b4245bb242a7c17fe448811a520 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Feb 2009 12:06:54 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace, fix Impact: build fix The BLK_DEV_IO_TRACE entry used to be in block/Kconfig - which file itself was dependent on CONFIG_BLOCK. But now the entry is in kernel/trace/Kconfig - which is present even on !CONFIG_BLOCK. So add a 'depends on BLOCK' to BLK_DEV_IO_TRACE. 
Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4fee43c01942..3a331289457a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -305,6 +305,7 @@ config WORKQUEUE_TRACER config BLK_DEV_IO_TRACE bool "Support for tracing block io actions" depends on SYSFS + depends on BLOCK select RELAY select DEBUG_FS select TRACEPOINTS -- cgit v1.2.3-59-g8ed1b From b91facc367366b3f71375f337eb5997ec9ab4e69 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Feb 2009 18:30:44 +0100 Subject: tracing/function-graph-tracer: handle the leaf functions from trace_pipe When one cats the trace file, the leaf functions are printed without brackets: function(); whereas in the trace_pipe file we'll see the following: function() { } This is because the ring_buffer handling is not the same between those two files. On the trace file, when an entry is printed, the iterator advanced and then we can check the next entry. There is no iterator with trace_pipe, the current entry to print has been peeked and not consumed. So checking the next entry will still return the current one while we don't consume it. This patch introduces a new value for the output callbacks to ask the tracing core to not consume the current entry after printing it. We need it because we will have to consume the current entry ourself to check the next one. Now the trace_pipe is able to handle well the leaf functions. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 +-- kernel/trace/trace.h | 7 +++--- kernel/trace/trace_functions_graph.c | 48 ++++++++++++++++++++++-------------- 3 files changed, 36 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 93040f1bef13..5b1e9a9e9906 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2468,8 +2468,8 @@ waitagain: iter->seq.len = len; break; } - - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); if (iter->seq.len >= cnt) break; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ecfb9d2b365..7b0518adf6d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -63,13 +63,13 @@ struct ftrace_entry { /* Function call entry */ struct ftrace_graph_ent_entry { - struct trace_entry ent; + struct trace_entry ent; struct ftrace_graph_ent graph_ent; }; /* Function return entry */ struct ftrace_graph_ret_entry { - struct trace_entry ent; + struct trace_entry ent; struct ftrace_graph_ret ret; }; extern struct tracer boot_tracer; @@ -309,7 +309,8 @@ extern void __ftrace_bad_type(void); enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88f8d9d80a93..782ec0fdf453 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -212,8 +212,8 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) return ret; } -static bool -trace_branch_is_leaf(struct trace_iterator *iter, +static struct ftrace_graph_ret_entry * +get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) 
{ struct ring_buffer_iter *ring_iter; @@ -222,24 +222,33 @@ trace_branch_is_leaf(struct trace_iterator *iter, ring_iter = iter->buffer_iter[iter->cpu]; - if (!ring_iter) - return false; - - event = ring_buffer_iter_peek(ring_iter, NULL); + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* We need to consume the current entry to see the next one */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } if (!event) - return false; + return NULL; next = ring_buffer_event_data(event); if (next->ent.type != TRACE_GRAPH_RET) - return false; + return NULL; if (curr->ent.pid != next->ent.pid || curr->graph_ent.func != next->ret.func) - return false; + return NULL; - return true; + /* this is a leaf, now advance the iterator */ + if (ring_iter) + ring_buffer_read(ring_iter, NULL); + + return next; } /* Signal a overhead of time execution to the output */ @@ -376,18 +385,15 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, - struct ftrace_graph_ent_entry *entry, struct trace_seq *s) + struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) { - struct ftrace_graph_ret_entry *ret_entry; struct ftrace_graph_ret *graph_ret; - struct ring_buffer_event *event; struct ftrace_graph_ent *call; unsigned long long duration; int ret; int i; - event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - ret_entry = ring_buffer_event_data(event); graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; @@ -457,7 +463,11 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + /* + * we already consumed the current entry to check the next one + * and see if this is a leaf. + */ + return TRACE_TYPE_NO_CONSUME; } static enum print_line_t @@ -469,6 +479,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; /* Pid */ if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) @@ -504,8 +515,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - if (trace_branch_is_leaf(iter, field)) - return print_graph_entry_leaf(iter, field, s); + leaf_ret = get_return_for_leaf(iter, field); + if (leaf_ret) + return print_graph_entry_leaf(iter, field, leaf_ret, s); else return print_graph_entry_nested(field, s, iter->ent->pid, cpu); -- cgit v1.2.3-59-g8ed1b From 3c56819b14b00dd449bd776303e61f8532fad09f Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 9 Feb 2009 08:15:56 +0200 Subject: tracing: splice support for tracing_pipe Added and implemented tracing_pipe_fops->splice_read(). This allows userspace programs to get tracing data more efficiently. 
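As a rough illustration of how user space might consume this (an editor's
sketch, not part of the patch), the snippet below splices trace_pipe through
an intermediate pipe into a file; it assumes debugfs is mounted at
/sys/kernel/debug, a kernel that includes this splice_read support, and it
ignores short writes for brevity:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int trace_fd, out_fd, pipefd[2];
		ssize_t n;

		/* path assumes debugfs is mounted at /sys/kernel/debug */
		trace_fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY);
		out_fd = open("trace.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (trace_fd < 0 || out_fd < 0 || pipe(pipefd) < 0) {
			perror("setup");
			return 1;
		}

		for (;;) {
			/* move trace data into the pipe without copying through user space */
			n = splice(trace_fd, NULL, pipefd[1], NULL, 4096, 0);
			if (n <= 0)
				break;
			/* drain the pipe into the output file */
			if (splice(pipefd[0], NULL, out_fd, NULL, n, 0) < 0)
				break;
		}

		return 0;
	}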
Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 6 +++ 2 files changed, 142 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b1e9a9e9906..9e29fdb0dfe5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -364,6 +365,25 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } +ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +{ + int len; + void *ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = memcpy(buf, s->buffer + s->readpos, cnt); + if (!ret) + return -EFAULT; + + s->readpos += len; + return cnt; +} + static void trace_print_seq(struct seq_file *m, struct trace_seq *s) { @@ -2493,6 +2513,121 @@ out: return sret; } +static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + __free_page(buf->page); +} + +static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, + unsigned int idx) +{ + __free_page(spd->pages[idx]); +} + +static struct pipe_buf_operations tracing_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = tracing_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t tracing_splice_read_pipe(struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct trace_iterator *iter = filp->private_data; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages = 0, /* This gets updated below. */ + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, + }; + ssize_t ret; + size_t count, rem; + unsigned int i; + + mutex_lock(&trace_types_lock); + + if (iter->trace->splice_read) { + ret = iter->trace->splice_read(iter, filp, + ppos, pipe, len, flags); + if (ret) + goto out; + } + + ret = tracing_wait_pipe(filp); + if (ret <= 0) + goto out; + + if (!iter->ent && !find_next_entry_inc(iter)) { + ret = -EFAULT; + goto out; + } + + /* Fill as many pages as possible. */ + for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { + pages[i] = alloc_page(GFP_KERNEL); + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + count = iter->seq.len; + ret = print_trace_line(iter); + count = iter->seq.len - count; + if (rem < count) { + rem = 0; + iter->seq.len -= count; + break; + } + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.len -= count; + break; + } + + trace_consume(iter); + rem -= count; + if (!find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + /* Copy the data into the page, so we can start over. 
*/ + ret = trace_seq_to_buffer(&iter->seq, + page_address(pages[i]), + iter->seq.len); + if (ret < 0) { + __free_page(pages[i]); + break; + } + partial[i].offset = 0; + partial[i].len = iter->seq.len; + + trace_seq_reset(&iter->seq); + } + + mutex_unlock(&trace_types_lock); + + spd.nr_pages = i; + + return splice_to_pipe(pipe, &spd); + +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2656,6 +2791,7 @@ static struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, + .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7b0518adf6d7..dbff0207b213 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -353,6 +353,12 @@ struct tracer { ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); + ssize_t (*splice_read)(struct trace_iterator *iter, + struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); -- cgit v1.2.3-59-g8ed1b From ff98781bab2735e6c89793034173e0cb5007a7e5 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 9 Feb 2009 08:15:55 +0200 Subject: tracing: Move pipe waiting code out of tracing_read_pipe(). This moves the pipe waiting code from tracing_read_pipe() into tracing_wait_pipe(), which is useful to implement other fops, like splice_read. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 69 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e29fdb0dfe5..11fde0ad9cb6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2388,37 +2388,15 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } -/* - * Consumer reader. - */ -static ssize_t -tracing_read_pipe(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +/* Must be called with trace_types_lock mutex held. */ +static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; - ssize_t sret; - - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_reset(&iter->seq); - mutex_lock(&trace_types_lock); - if (iter->trace->read) { - sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (sret) - goto out; - } - -waitagain: - sret = 0; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { - sret = -EAGAIN; - goto out; + return -EAGAIN; } /* @@ -2443,12 +2421,11 @@ waitagain: iter->tr->waiter = NULL; if (signal_pending(current)) { - sret = -EINTR; - goto out; + return -EINTR; } if (iter->trace != current_trace) - goto out; + return 0; /* * We block until we read something and tracing is disabled. @@ -2465,9 +2442,43 @@ waitagain: continue; } + return 1; +} + +/* + * Consumer reader. 
+ */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + + trace_seq_reset(&iter->seq); + + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + +waitagain: + sret = tracing_wait_pipe(filp); + if (sret <= 0) + goto out; + /* stop when tracing is finished */ - if (trace_empty(iter)) + if (trace_empty(iter)) { + sret = 0; goto out; + } if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; -- cgit v1.2.3-59-g8ed1b From 34cd4998d38f9bd04f34b78a7cb0c7f1bee00bd9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Feb 2009 12:06:29 -0500 Subject: tracing: clean up splice code Ingo Molnar suggested a series of clean ups for the splice code. This patch implements those suggestions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 96 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 11fde0ad9cb6..d89821283b47 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2537,15 +2537,49 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = tracing_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, }; +static size_t +tracing_fill_pipe_page(struct page *pages, size_t rem, + struct trace_iterator *iter) +{ + size_t count; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + count = iter->seq.len; + ret = print_trace_line(iter); + count = iter->seq.len - count; + if (rem < count) { + rem = 0; + iter->seq.len -= count; + break; + } + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.len -= count; + break; + } + + trace_consume(iter); + rem -= count; + if (!find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + return rem; +} + static ssize_t tracing_splice_read_pipe(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, @@ -2556,15 +2590,15 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, struct partial_page partial[PIPE_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages = 0, /* This gets updated below. */ - .flags = flags, - .ops = &tracing_pipe_buf_ops, - .spd_release = tracing_spd_release_pipe, + .pages = pages, + .partial = partial, + .nr_pages = 0, /* This gets updated below. 
*/ + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, }; ssize_t ret; - size_t count, rem; + size_t rem; unsigned int i; mutex_lock(&trace_types_lock); @@ -2573,45 +2607,25 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = iter->trace->splice_read(iter, filp, ppos, pipe, len, flags); if (ret) - goto out; + goto out_err; } ret = tracing_wait_pipe(filp); if (ret <= 0) - goto out; + goto out_err; if (!iter->ent && !find_next_entry_inc(iter)) { ret = -EFAULT; - goto out; + goto out_err; } /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + break; - /* Seq buffer is page-sized, exactly what we need. */ - for (;;) { - count = iter->seq.len; - ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; - break; - } - if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; - break; - } - - trace_consume(iter); - rem -= count; - if (!find_next_entry_inc(iter)) { - rem = 0; - iter->ent = NULL; - break; - } - } + rem = tracing_fill_pipe_page(pages[i], rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, @@ -2633,7 +2647,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, return splice_to_pipe(pipe, &spd); -out: +out_err: mutex_unlock(&trace_types_lock); return ret; -- cgit v1.2.3-59-g8ed1b From b85fa01ed958ca59523a2db3c2ee647b98745d6a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Feb 2009 14:21:14 +0800 Subject: ring_buffer: fix typing mistake Impact: Fix bug I found several very very curious line. It's so curious that it may be brought by typing mistake. When (cpu_buffer->reader_page == cpu_buffer->commit_page): 1) We haven't copied it for bpage is changed: bpage = cpu_buffer->reader_page->page; memcpy(bpage->data, cpu_buffer->reader_page->page->data + read ... ) 2) We need update cpu_buffer->reader_page->read, but "cpu_buffer->reader_page += read;" is not right. [ This bug was a typo. The commit->reader_page is a page pointer and not an index into the page. The line should have been commit->reader_page->read += read. The other changes by Lai are nice clean ups to the code. - SDR ] Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 53ba3a6d16d0..eca282720838 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2406,7 +2406,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * to swap with a page in the ring buffer. 
* * for example: - * rpage = ring_buffer_alloc_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); @@ -2461,18 +2461,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, */ if (cpu_buffer->reader_page == cpu_buffer->commit_page) { unsigned int read = cpu_buffer->reader_page->read; + unsigned int commit = rb_page_commit(cpu_buffer->reader_page); if (full) goto out; /* The writer is still on the reader page, we must copy */ - bpage = cpu_buffer->reader_page->page; memcpy(bpage->data, cpu_buffer->reader_page->page->data + read, - local_read(&bpage->commit) - read); + commit - read); /* consume what was read */ - cpu_buffer->reader_page += read; - + cpu_buffer->reader_page->read = commit; } else { /* swap the pages */ rb_init_page(bpage); -- cgit v1.2.3-59-g8ed1b From 667d24125839b6f3363d8177d7ed9fab8a40e45f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Feb 2009 14:21:17 +0800 Subject: ring_buffer: fix ring_buffer_read_page() Impact: change API and init bpage when copy ring_buffer_read_page()/rb_remove_entries() may be called for a partially consumed page. Add a parameter for rb_remove_entries() and make it update cpu_buffer->entries correctly for partially consumed pages. ring_buffer_read_page() now returns the offset to the next event. Init the bpage's time_stamp when return value is 0. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eca282720838..10d202ea06f3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2332,13 +2332,14 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage) + struct buffer_data_page *bpage, + unsigned int offset) { struct ring_buffer_event *event; unsigned long head; __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&bpage->commit); + for (head = offset; head < local_read(&bpage->commit); head += rb_event_length(event)) { event = __rb_data_page_index(bpage, head); @@ -2410,8 +2411,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); - * if (ret) - * process_page(rpage); + * if (ret >= 0) + * process_page(rpage, ret); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -2422,8 +2423,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * responsible for that. * * Returns: - * 1 if data has been transferred - * 0 if no data has been transferred. + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. 
*/ int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, int cpu, int full) @@ -2432,7 +2433,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; unsigned long flags; - int ret = 0; + unsigned int read; + int ret = -1; if (!data_page) return 0; @@ -2454,24 +2456,29 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* check for data */ if (!local_read(&cpu_buffer->reader_page->page->commit)) goto out; + + read = cpu_buffer->reader_page->read; /* * If the writer is already off of the read page, then simply * switch the read page with the given page. Otherwise * we need to copy the data from the reader to the writer. */ if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int read = cpu_buffer->reader_page->read; unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; if (full) goto out; /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data, - cpu_buffer->reader_page->page->data + read, - commit - read); + memcpy(bpage->data + read, rpage->data + read, commit - read); /* consume what was read */ cpu_buffer->reader_page->read = commit; + + /* update bpage */ + local_set(&bpage->commit, commit); + if (!read) + bpage->time_stamp = rpage->time_stamp; } else { /* swap the pages */ rb_init_page(bpage); @@ -2480,10 +2487,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer, cpu_buffer->reader_page->read = 0; *data_page = bpage; } - ret = 1; + ret = read; /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage); + rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-59-g8ed1b From 4543ae7ce1cb8b5ff27a59009e7991ea63791a71 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 9 Feb 2009 23:09:32 +0100 Subject: tracing: storage class should be before const qualifier The C99 specification states in section 6.11.5: The placement of a storage-class specifier other than at the beginning of the declaration specifiers in a declaration is an obsolescent feature. Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt --- kernel/trace/trace_sysprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74d..4e2de4de1d65 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } } -const static struct stacktrace_ops backtrace_ops = { +static const struct stacktrace_ops backtrace_ops = { .warning = backtrace_warning, .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, -- cgit v1.2.3-59-g8ed1b From f54fc98aa656f334c1571df6e3ca9178ea223847 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:02:46 -0500 Subject: tracing: remove unneeded variable Impact: clean up. Remove the unnecessary variable ret. 
Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index f8ae2c50e01d..c2e68d440c4d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -91,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) int enable_branch_tracing(struct trace_array *tr) { - int ret = 0; - mutex_lock(&branch_tracing_mutex); branch_tracer = tr; /* @@ -103,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr) branch_tracing_enabled++; mutex_unlock(&branch_tracing_mutex); - return ret; + return 0; } void disable_branch_tracing(void) -- cgit v1.2.3-59-g8ed1b From 810dc73265cd690b2bc6010489b4317bba2cda39 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:05 -0500 Subject: tracing: provide correct return value after outputting the event This patch is to make the function return early on failure, and give correct return value on success. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 782ec0fdf453..519a0cab1530 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -186,30 +186,30 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) ret = trace_seq_printf(s, " ------------------------------------------\n"); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_proc(s, prev_pid); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, " => "); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "\n ------------------------------------------\n\n"); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; - return ret; + return TRACE_TYPE_HANDLED; } static struct ftrace_graph_ret_entry * -- cgit v1.2.3-59-g8ed1b From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:18 -0500 Subject: tracing: fix typos in comments Impact: clean up. Fix typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 2 +- kernel/trace/trace_hw_branches.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c103d636da3..8e6646a54acf 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -8,7 +8,7 @@ struct ring_buffer; struct ring_buffer_iter; /* - * Don't reference this struct directly, use functions below. + * Don't refer to this struct directly, use functions below. 
*/ struct ring_buffer_event { u32 type:2, len:3, time_delta:27; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 10d202ea06f3..fa64e1f003eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off); * tracing_off_permanent - permanently disable ring buffers * * This function, once called, will disable all ring buffers - * permanenty. + * permanently. */ void tracing_off_permanent(void) { @@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t commit; /* write commited index */ + local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; @@ -260,7 +260,7 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -303,7 +303,7 @@ struct ring_buffer_iter { * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * - * As a safty measure we check to make sure the data pages have not + * As a safety measure we check to make sure the data pages have not * been corrupted. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d89821283b47..d7c175a442df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, struct tracer_opt *trace_opts = current_trace->flags->opts; - /* calulate max size */ + /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); len += 3; /* "no" and space */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e3e7db61c067..0794dd33f27b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr) } /* - * Start tracing on the current cpu. + * Stop tracing on the current cpu. * The argument is ignored. * * pre: bts_tracer_mutex must be locked. -- cgit v1.2.3-59-g8ed1b From 4fd2735881bf4d8bf5e30979f31fc2f1b1d505fa Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:12 +0100 Subject: tracing: fix sparse warnings: make symbols static Impact: make global variables and a global function static The function '__trace_userstack' does not seem to have a caller, so it is commented out. Fix this sparse warnings: kernel/trace/trace.c:82:5: warning: symbol 'tracing_disabled' was not declared. Should it be static? kernel/trace/trace.c:600:10: warning: symbol 'trace_record_cmdline_disabled' was not declared. Should it be static? kernel/trace/trace.c:957:6: warning: symbol '__trace_userstack' was not declared. Should it be static? kernel/trace/trace.c:1694:5: warning: symbol 'tracing_release' was not declared. Should it be static? 
Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7c175a442df..c157ba70f30c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -80,7 +80,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) * of the tracer is successful. But that is the only place that sets * this back to zero. */ -int tracing_disabled = 1; +static int tracing_disabled = 1; static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); @@ -626,7 +626,7 @@ static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); /* temporary disable recording */ -atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { @@ -983,10 +983,12 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif } -void __trace_userstack(struct trace_array *tr, unsigned long flags) +#ifdef UNUSED +static void __trace_userstack(struct trace_array *tr, unsigned long flags) { ftrace_trace_userstack(tr, flags, preempt_count()); } +#endif /* UNUSED */ static void ftrace_trace_special(void *__tr, @@ -1720,7 +1722,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } -int tracing_release(struct inode *inode, struct file *file) +static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct trace_iterator *iter = m->private; -- cgit v1.2.3-59-g8ed1b From 5e39841c45cf5e6ea930ede1b0303309e03037a2 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:34 +0100 Subject: tracing: fix sparse warnings: fix (un-)signedness Fix these sparse warnings: kernel/trace/ring_buffer.c:70:37: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:84:39: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:96:43: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2500:40: warning: incorrect type in argument 3 (different signedness) kernel/trace/ring_buffer.c:2505:44: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2507:46: warning: incorrect type in argument 2 (different signedness) kernel/trace/trace.c:2130:40: warning: incorrect type in argument 3 (different signedness) kernel/trace/trace.c:2280:40: warning: incorrect type in argument 3 (different signedness) Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fa64e1f003eb..dc18b5b9ccb4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -59,7 +59,7 @@ enum { RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, }; -static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 
/** * tracing_on - enable all tracing buffers @@ -2501,7 +2501,7 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; int r; @@ -2517,9 +2517,9 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c157ba70f30c..e2434990277c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2147,7 +2147,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) @@ -2295,9 +2295,9 @@ static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - long *ptr = filp->private_data; + unsigned long *ptr = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) -- cgit v1.2.3-59-g8ed1b From e7669b8e329255bbcb40af65b38e342825d97a46 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:45 +0100 Subject: tracing: fix sparse warning: attribute function with __acquires/__releases Fix this sparse warning: kernel/trace/trace.c:458:9: warning: context imbalance in 'register_tracer' - unexpected unlock Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e2434990277c..95f99a7abf2f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -459,6 +459,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * Register a new plugin tracer. */ int register_tracer(struct tracer *type) +__releases(kernel_lock) +__acquires(kernel_lock) { struct tracer *t; int len; -- cgit v1.2.3-59-g8ed1b From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 16:52:37 +0100 Subject: softlockup: ensure the task has been switched out once When we check if a task has been switched out since the last scan, we might have a race condition on the following scenario: - the task is freshly created and scheduled - it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out - check_hung_task() scans this task and will report a false positive because t->nvcsw + t->nivcsw == t->last_switch_count == 0 Add a check for such cases. Signed-off-by: Frederic Weisbecker Acked-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c924de58cb2..022a4927b785 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; - if (t->flags & PF_FROZEN) + /* + * Ensure the task is not frozen. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. 
+ */ + if (unlikely(t->flags & PF_FROZEN || !switch_count)) return; if (switch_count != t->last_switch_count) { -- cgit v1.2.3-59-g8ed1b From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 15:49:11 +0100 Subject: tracing/sysprof: add missing tracing_{start,stop}_record_cmdline() Add the missing pair tracing_{start,stop}_record_cmdline() to record well the cmdline associated with pid. Changes in v2: - fix a build error, the sched_switch tracer is needed to record the cmdline. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + kernel/trace/trace_sysprof.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3a331289457a..6ff928acd453 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,6 +134,7 @@ config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 select TRACING + select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace tool. diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74d..9902c15997ad 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; + tracing_start_cmdline_record(); + mutex_lock(&sample_timer_lock); start_stack_timers(); tracer_enabled = 1; @@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr) static void stack_trace_reset(struct trace_array *tr) { + tracing_stop_cmdline_record(); stop_stack_trace(tr); } -- cgit v1.2.3-59-g8ed1b From 00f62f614bb713027b9296068d1879fbca511eb7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Feb 2009 17:04:06 -0200 Subject: ring_buffer: pahole struct ring_buffer While fixing some bugs in pahole (built-in.o files were not being processed due to relocation problems) I found out about these packable structures: $ pahole --packable kernel/trace/ring_buffer.o | grep ring ring_buffer 72 64 8 ring_buffer_per_cpu 112 104 8 If we take a look at the current layout of struct ring_buffer we can see that we have two 4 bytes holes. $ pahole -C ring_buffer kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ cpumask_var_t cpumask; /* 16 8 */ atomic_t record_disabled; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 32 32 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ring_buffer_per_cpu * * buffers; /* 64 8 */ /* size: 72, cachelines: 2, members: 7 */ /* sum members: 64, holes: 2, sum holes: 8 */ /* last cacheline: 8 bytes */ }; So, if I ask pahole to reorganize it: $ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ atomic_t record_disabled; /* 12 4 */ cpumask_var_t cpumask; /* 16 8 */ struct mutex mutex; /* 24 32 */ struct ring_buffer_per_cpu * * buffers; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 7 */ }; /* saved 8 bytes and 1 cacheline! */ We get it using just one 64 bytes cacheline. 
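
The same packing principle can be shown on a small, hypothetical struct
(this illustration is not taken from the patch; the offsets in the
comments assume a common 64-bit ABI with 8-byte, 8-byte-aligned pointers):

struct unpacked {
	int	a;	/* offset  0, size 4 -- 4 byte hole follows */
	void	*p;	/* offset  8, size 8 */
	int	b;	/* offset 16, size 4 -- 4 byte hole follows */
	void	*q;	/* offset 24, size 8 */
};			/* size: 32 */

struct packed {
	int	a;	/* offset  0, size 4 */
	int	b;	/* offset  4, size 4 */
	void	*p;	/* offset  8, size 8 */
	void	*q;	/* offset 16, size 8 */
};			/* size: 24 */

Grouping the two ints removes both holes, which is exactly the kind of
move pahole suggests for struct ring_buffer here.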
To see what it did: $ pahole -C ring_buffer --reorganize --show_reorg_steps \ kernel/trace/ring_buffer.o | grep \/ /* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */ Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 53ba3a6d16d0..27ef3bf13ed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -273,8 +273,8 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_var_t cpumask; atomic_t record_disabled; + cpumask_var_t cpumask; struct mutex mutex; -- cgit v1.2.3-59-g8ed1b From 9f8d979f082a3ee1b27f32b7e0811b51c3ad1d15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:10:17 +0100 Subject: softlockup: move 'one' to the softlockup section in sysctl.c CONFIG_SOFTLOCKUP=y || CONFIG_DETECT_HUNG_TASKS=y is now the only user of the 'one' constant in kernel/sysctl.c. Move it to the softlockup block of constants. This fixes a GCC warning. Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b6b54c8ac0d..6d2aeff92b3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -90,6 +90,9 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ +#if defined(CONFIG_DETECT_HUNG_TASK) || defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +static int one = 1; +#endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -100,7 +103,6 @@ static int two = 2; #endif static int zero; -static int one = 1; static unsigned long one_ul = 1; static int one_hundred = 100; -- cgit v1.2.3-59-g8ed1b From 45141d4667d208421ca787a3301542b6a5e0b112 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 13:19:48 -0500 Subject: ring-buffer: rename label out_unlock to out_reset Impact: clean up While reviewing the ring buffer code, I thougth I saw a bug with if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; But I forgot that we use a variable "lock_taken" that is set if the spinlock is taken, and only unlock it if that variable is set. To avoid further confusion from other reviewers, this patch renames the label out_unlock with out_reset, which is the more appropriate name. 
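
A condensed sketch of the pattern being described (simplified from the
explanation above rather than copied from __rb_reserve_next(), so treat
it as illustrative only):

	bool lock_taken = false;

	if (unlikely(in_nmi())) {
		/* never spin on the lock from NMI context */
		if (!__raw_spin_trylock(&cpu_buffer->lock))
			goto out_reset;
	} else
		__raw_spin_lock(&cpu_buffer->lock);

	lock_taken = true;

	/* ... move the tail page, set up and return the reserved event ... */

 out_reset:
	/* undo the speculative write reservation */
	local_set(&tail_page->write, tail);

	/* only unlock if we really took the lock */
	if (likely(lock_taken))
		__raw_spin_unlock(&cpu_buffer->lock);
	return NULL;

Because the error path can be reached before the lock is acquired, the
label's job is resetting the write, not unlocking -- hence out_reset.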
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc18b5b9ccb4..f39d7e9a4305 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1017,7 +1017,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) - goto out_unlock; + goto out_reset; } else __raw_spin_lock(&cpu_buffer->lock); @@ -1030,7 +1030,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* we grabbed the lock before incrementing */ if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_unlock; + goto out_reset; /* * If for some reason, we had an interrupt storm that made @@ -1039,12 +1039,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); - goto out_unlock; + goto out_reset; } if (next_page == head_page) { if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_unlock; + goto out_reset; /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1118,7 +1118,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; - out_unlock: + out_reset: /* reset write */ if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); -- cgit v1.2.3-59-g8ed1b From b5f9fd0f8a05c9bafb91a9a85b9110938d8e585b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 11 Feb 2009 13:57:25 -0500 Subject: tracing: convert c/p state power tracer to use tracepoints Convert the c/p state "power" tracer to use tracepoints. Avoids a function call when the tracer is disabled. Signed-off-by: Jason Baron Acked-by: Ingo Molnar Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 + arch/x86/kernel/process.c | 3 + include/trace/power.h | 25 ++--- kernel/trace/trace_power.c | 173 +++++++++++++++++------------ 4 files changed, 119 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7ed925edf4d2..c5d737cdb365 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -70,6 +70,8 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +DEFINE_TRACE(power_mark); + /* acpi_perf_data is a pointer to percpu data. 
*/ static struct acpi_processor_performance *acpi_perf_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 026819ffcb0c..e0d0fd7ab514 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -19,6 +19,9 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +DEFINE_TRACE(power_start); +DEFINE_TRACE(power_end); + int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/include/trace/power.h b/include/trace/power.h index c7cefbcdaea4..2c733e58e89c 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -2,6 +2,7 @@ #define _TRACE_POWER_H #include +#include enum { POWER_NONE = 0, @@ -18,18 +19,16 @@ struct power_trace { #endif }; -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif +DECLARE_TRACE(power_start, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_mark, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_end, + TPPROTO(struct power_trace *it), + TPARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index b1d0d087d3a6..91ce672fb037 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -21,15 +21,116 @@ static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; +static void probe_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} + + +static void probe_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static void probe_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static int 
tracing_power_register(void) +{ + int ret; + + ret = register_trace_power_start(probe_power_start); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_start\n"); + return ret; + } + ret = register_trace_power_end(probe_power_end); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_end\n"); + goto fail_start; + } + ret = register_trace_power_mark(probe_power_mark); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_mark\n"); + goto fail_end; + } + return ret; +fail_end: + unregister_trace_power_end(probe_power_end); +fail_start: + unregister_trace_power_start(probe_power_start); + return ret; +} static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; + tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) { trace_power_enabled = 0; + unregister_trace_power_start(probe_power_start); + unregister_trace_power_end(probe_power_end); + unregister_trace_power_mark(probe_power_mark); } @@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr) power_trace = tr; trace_power_enabled = 1; + tracing_power_register(); for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -95,74 +197,3 @@ static int init_power_trace(void) return register_tracer(&power_tracer); } device_initcall(init_power_trace); - -void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} -EXPORT_SYMBOL_GPL(trace_power_start); - - -void trace_power_end(struct power_trace *it) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_end); - -void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_mark); -- cgit v1.2.3-59-g8ed1b From a234aa9ecdf47a5461573a21dc0b154278df5ba8 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sat, 14 Feb 2009 09:36:00 +0600 Subject: tracing: fix section mismatch in trace_hw_branches.c The function bts_trace_init() references a variable bts_hotcpu_notifier which is marked as __cpuinitdata. Thus causes section mismatch. This patch fixes it. 
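
The rule involved can be illustrated with a hypothetical notifier (example
code only, not part of this patch): __cpuinitdata objects may be discarded
after boot when CPU hotplug is not configured, so functions that reference
them need the matching __cpuinit annotation in order to be discarded along
with them; otherwise modpost emits warnings like the ones below.

static int __cpuinit foo_cpu_handler(struct notifier_block *nb,
				     unsigned long action, void *hcpu)
{
	return NOTIFY_OK;
}

static struct notifier_block foo_cpu_notifier __cpuinitdata = {
	.notifier_call	= foo_cpu_handler,
};

/* must be __cpuinit because it references foo_cpu_notifier */
static int __cpuinit foo_trace_init(void)
{
	register_hotcpu_notifier(&foo_cpu_notifier);
	return 0;
}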
LD kernel/trace/built-in.o WARNING: kernel/trace/built-in.o(.text+0xc90c): Section mismatch in reference from the function bts_trace_init() to the variable .cpuinit.data:bts_hotcpu_notifier The function bts_trace_init() references the variable __cpuinitdata bts_hotcpu_notifier. This is often because bts_trace_init lacks a __cpuinitdata annotation or the annotation of bts_hotcpu_notifier is wrong. WARNING: kernel/trace/built-in.o(.text+0xc92a): Section mismatch in reference from the function bts_trace_reset() to the variable .cpuinit.data:bts_hotcpu_notifier The function bts_trace_reset() references the variable __cpuinitdata bts_hotcpu_notifier. This is often because bts_trace_reset lacks a __cpuinitdata annotation or the annotation of bts_hotcpu_notifier is wrong. Signed-off-by: Rakib Mullick Cc: markus.t.metzger@gmail.com Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 0794dd33f27b..3561aace075c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -127,7 +127,7 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) +static int __cpuinit bts_trace_init(struct trace_array *tr) { hw_branch_trace = tr; @@ -137,7 +137,7 @@ static int bts_trace_init(struct trace_array *tr) return 0; } -static void bts_trace_reset(struct trace_array *tr) +static void __cpuinit bts_trace_reset(struct trace_array *tr) { bts_trace_stop(tr); unregister_hotcpu_notifier(&bts_hotcpu_notifier); -- cgit v1.2.3-59-g8ed1b From 0c75a3ed633419d75d823d5dcb05d42924c6ae61 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 11:21:52 -0500 Subject: ftrace: state that all functions are enabled in set_ftrace_filter Impact: clean up, make set_ftrace_filter less confusing The set_ftrace_filter shows only the functions that will be traced. But when it is empty, it will trace all functions. This can be a bit confusing. This patch makes set_ftrace_filter show: #### all functions enabled #### When all functions will be traced, and we do not filter only a select few. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1796e018fbff..369fb78bd4ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -773,6 +773,7 @@ enum { FTRACE_ITER_CONT = (1 << 1), FTRACE_ITER_NOTRACE = (1 << 2), FTRACE_ITER_FAILURES = (1 << 3), + FTRACE_ITER_PRINTALL = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -794,6 +795,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + if (iter->flags & FTRACE_ITER_PRINTALL) + return NULL; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); retry: @@ -834,6 +838,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) struct ftrace_iterator *iter = m->private; void *p = NULL; + /* + * For set_ftrace_filter reading, if we have the filter + * off, we can short cut and just print out that all + * functions are enabled. 
+ */ + if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { + if (*pos > 0) + return NULL; + iter->flags |= FTRACE_ITER_PRINTALL; + (*pos)++; + return iter; + } + if (*pos > 0) { if (iter->idx < 0) return p; @@ -852,9 +869,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + if (iter->flags & FTRACE_ITER_PRINTALL) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + if (!rec) return 0; -- cgit v1.2.3-59-g8ed1b From 265c831cb03d533cbe159af45798ac9fef534260 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 12:43:56 -0500 Subject: ftrace: add do_for_each_ftrace_rec and while_for_each_ftrace_rec Impact: clean up To iterate over all the functions that dynamic trace knows about it requires two for loops. One to iterate over the pages and the other to iterate over the records within the page. There are several duplications of these loops in ftrace.c. This patch creates the macros do_for_each_ftrace_rec and while_for_each_ftrace_rec to handle this logic, and removes the duplicate code. While making this change, I also discovered and fixed a small bug that one of the iterations should exit the loop after it found the record it was searching for. This used a break when it should have used a goto, since there were two loops it needed to break out from. No real harm was done by this bug since it would only continue to search the other records, and the code was in a slow path anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 208 ++++++++++++++++++++++++-------------------------- 1 file changed, 101 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 369fb78bd4ab..fed1ebc3a133 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -297,6 +297,19 @@ static struct ftrace_page *ftrace_pages; static struct dyn_ftrace *ftrace_free_records; +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } #ifdef CONFIG_KPROBES @@ -341,7 +354,6 @@ void ftrace_release(void *start, unsigned long size) struct ftrace_page *pg; unsigned long s = (unsigned long)start; unsigned long e = s + size; - int i; if (ftrace_disabled || !start) return; @@ -349,14 +361,11 @@ void ftrace_release(void *start, unsigned long size) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } while_for_each_ftrace_rec(); - if ((rec->ip >= s) && (rec->ip < e)) - ftrace_free_rec(rec); - } - } spin_unlock(&ftrace_lock); } @@ -523,41 +532,37 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) static void ftrace_replace_code(int enable) { - int i, failed; + int failed; struct dyn_ftrace *rec; struct ftrace_page *pg; - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - - /* - * Skip over free records and records that have - * failed. 
- */ - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED) - continue; - - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } + do_for_each_ftrace_rec(pg, rec) { + /* + * Skip over free records and records that have + * failed. + */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) + continue; - failed = __ftrace_replace_code(rec, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { - rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else - ftrace_bug(failed, rec->ip); - } + /* ignore updates to this record's mcount site */ + if (get_kprobe((void *)rec->ip)) { + freeze_record(rec); + continue; + } else { + unfreeze_record(rec); } - } + + failed = __ftrace_replace_code(rec, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !core_kernel_text(rec->ip)) { + ftrace_free_rec(rec); + } else + ftrace_bug(failed, rec->ip); + } + } while_for_each_ftrace_rec(); } static int @@ -956,22 +961,17 @@ static void ftrace_filter_reset(int enable) struct ftrace_page *pg; struct dyn_ftrace *rec; unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i; /* should not be called from interrupt context */ spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - rec->flags &= ~type; - } - pg = pg->next; - } + do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~type; + } while_for_each_ftrace_rec(); + spin_unlock(&ftrace_lock); } @@ -1094,44 +1094,39 @@ ftrace_match(unsigned char *buff, int len, int enable) spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - int matched = 0; - char *ptr; - - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - switch (type) { - case MATCH_FULL: - if (strcmp(str, buff) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (memcmp(str, buff, match) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, search)) - matched = 1; - break; - case MATCH_END_ONLY: - ptr = strstr(str, search); - if (ptr && (ptr[search_len] == 0)) - matched = 1; - break; - } - if (matched) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; - } + do_for_each_ftrace_rec(pg, rec) { + int matched = 0; + char *ptr; + + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; } - pg = pg->next; - } + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } + } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } @@ -1452,7 +1447,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int 
found = 0; - int i, j; + int j; if (ftrace_disabled) return -ENODEV; @@ -1460,27 +1455,26 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) - continue; - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - if (strcmp(str, buffer) == 0) { - found = 1; - for (j = 0; j < idx; j++) - if (array[j] == rec->ip) { - found = 0; - break; - } - if (found) - array[idx] = rec->ip; - break; - } + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (strcmp(str, buffer) == 0) { + /* Return 1 if we add it to the array */ + found = 1; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; + goto out; } - } + } while_for_each_ftrace_rec(); + out: spin_unlock(&ftrace_lock); return found ? 0 : -EINVAL; -- cgit v1.2.3-59-g8ed1b From 7f24b31b01a271b62346d9df084b029e48612163 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 14:37:33 -0500 Subject: ftrace: rename ftrace_match to ftrace_match_records Impact: clean up ftrace_match is too generic of a name. What it really does is search all records and matches the records with the given string, and either sets or unsets the functions to be traced depending on if the parameter 'enable' is set or not. This allows us to make another function called ftrace_match that can be used to test a single record. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fed1ebc3a133..f397d7adb62e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1054,7 +1054,7 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len, int enable) +ftrace_match_records(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; @@ -1197,7 +1197,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); + ftrace_match_records(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1236,7 +1236,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) if (reset) ftrace_filter_reset(enable); if (buf) - ftrace_match(buf, len, enable); + ftrace_match_records(buf, len, enable); mutex_unlock(&ftrace_regex_lock); } @@ -1286,7 +1286,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); + ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); -- cgit v1.2.3-59-g8ed1b From 9f4801e30ad291e27284e873696da1ead92d68fa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 15:56:43 -0500 Subject: ftrace: break up ftrace_match_records into smaller components Impact: clean up ftrace_match_records does a lot of things that other features can use. This patch breaks up ftrace_match_records and pulls out ftrace_setup_glob and ftrace_match_record. 
ftrace_setup_glob prepares a simple glob expression for use with ftrace_match_record. ftrace_match_record compares a single record with a glob type. Breaking this up will allow for more features to run on individual records. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 115 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f397d7adb62e..fcec31323a10 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1053,79 +1053,114 @@ enum { MATCH_END_ONLY, }; -static void -ftrace_match_records(unsigned char *buff, int len, int enable) +/* + * (static function - no need for kernel doc) + * + * Pass in a buffer containing a glob and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +static int +ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) { - char str[KSYM_SYMBOL_LEN]; - char *search = NULL; - struct ftrace_page *pg; - struct dyn_ftrace *rec; int type = MATCH_FULL; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i, match = 0, search_len = 0; - int not = 0; + int i; if (buff[0] == '!') { - not = 1; + *not = 1; buff++; len--; - } + } else + *not = 0; + + *search = buff; for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { - search = buff + i + 1; + *search = buff + 1; type = MATCH_END_ONLY; - search_len = len - (i + 1); } else { - if (type == MATCH_END_ONLY) { + if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; - } else { - match = i; + else type = MATCH_FRONT_ONLY; - } buff[i] = 0; break; } } } + return type; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + int matched = 0; + char *ptr; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, regex) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (strncmp(str, regex, len) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, regex)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, regex); + if (ptr && (ptr[len] == 0)) + matched = 1; + break; + } + + return matched; +} + +static void ftrace_match_records(char *buff, int len, int enable) +{ + char *search; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type; + unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned search_len; + int not; + + type = ftrace_setup_glob(buff, len, &search, ¬); + + search_len = strlen(search); + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; do_for_each_ftrace_rec(pg, rec) { - int matched = 0; - char *ptr; if (rec->flags & FTRACE_FL_FAILED) continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - switch (type) { - case MATCH_FULL: - if (strcmp(str, buff) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (memcmp(str, buff, match) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, search)) - matched = 1; - break; - case MATCH_END_ONLY: - ptr = strstr(str, search); - if (ptr && (ptr[search_len] == 0)) - matched = 1; - break; - } - if (matched) { + + if (ftrace_match_record(rec, search, search_len, type)) { if (not) rec->flags &= ~flag; else rec->flags |= flag; } + } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } -- cgit v1.2.3-59-g8ed1b From 64e7c440618998fd69eee6ab490b042d12248021 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 17:08:48 -0500 Subject: ftrace: add module command function filter selection This patch adds a "command" syntax to the function filtering files: /debugfs/tracing/set_ftrace_filter /debugfs/tracing/set_ftrace_notrace Of the format: :: The command is optional, and dependent on the command, so are the parameters. echo do_fork > set_ftrace_filter Will only trace 'do_fork'. echo 'sched_*' > set_ftrace_filter Will only trace functions starting with the letters 'sched_'. echo '*:mod:ext3' > set_ftrace_filter Will trace only the ext3 module functions. echo '*write*:mod:ext3' > set_ftrace_notrace Will prevent the ext3 functions with the letters 'write' in the name from being traced. echo '!*_allocate:mod:ext3' > set_ftrace_filter Will remove the functions in ext3 that end with the letters '_allocate' from the ftrace filter. Although this patch implements the 'command' format, only the 'mod' command is supported. More commands to follow. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fcec31323a10..9e60ae423af9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1067,7 +1067,7 @@ enum { * 0 otherwise. 
*/ static int -ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) +ftrace_setup_glob(char *buff, int len, char **search, int *not) { int type = MATCH_FULL; int i; @@ -1100,14 +1100,11 @@ ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) return type; } -static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +static int ftrace_match(char *str, char *regex, int len, int type) { - char str[KSYM_SYMBOL_LEN]; int matched = 0; char *ptr; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); switch (type) { case MATCH_FULL: if (strcmp(str, regex) == 0) @@ -1131,6 +1128,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return matched; } +static int +ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + return ftrace_match(str, regex, len, type); +} + static void ftrace_match_records(char *buff, int len, int enable) { char *search; @@ -1165,6 +1171,100 @@ static void ftrace_match_records(char *buff, int len, int enable) spin_unlock(&ftrace_lock); } +static int +ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (len) + return ftrace_match(str, regex, len, type); + else + return 1; +} + +static void ftrace_match_module_records(char *buff, char *mod, int enable) +{ + char *search = buff; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned long flag = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned search_len = 0; + int not = 0; + + /* blank or '*' mean the same */ + if (strcmp(buff, "*") == 0) + buff[0] = 0; + + /* handle the case of 'dont filter this module' */ + if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { + buff[0] = 0; + not = 1; + } + + if (strlen(buff)) { + type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + search_len = strlen(search); + } + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + if (enable) + ftrace_filtered = 1; + + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (ftrace_match_module_record(rec, mod, + search, search_len, type)) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } + + } while_for_each_ftrace_rec(); + spin_unlock(&ftrace_lock); +} + +static int ftrace_process_regex(char *buff, int len, int enable) +{ + char *func, *mod, *command, *next = buff; + + func = strsep(&next, ":"); + + if (!next) { + ftrace_match_records(func, len, enable); + return 0; + } + + /* command fonud */ + + command = strsep(&next, ":"); + + if (strcmp(command, "mod") == 0) { + /* only match modules */ + if (!next) + return -EINVAL; + + mod = strsep(&next, ":"); + ftrace_match_module_records(func, mod, enable); + return 0; + } + + return -EINVAL; +} + static ssize_t ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) @@ -1232,7 +1332,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + ret = ftrace_process_regex(iter->buffer, + iter->buffer_idx, enable); + if (ret) + goto out; iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; -- cgit v1.2.3-59-g8ed1b From e68746a271eb3393a2183840be9e903caddf765b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 20:53:42 -0500 Subject: ftrace: enable filtering only when a function is filtered on Impact: fix to prevent empty set_ftrace_filter and no ftrace output The function filter is used to only trace a given set of functions. The filter is enabled when a function name is echoed into the set_ftrace_filter file. But if the name has a typo and the function is not found, the filter is enabled, but no function is listed. This makes a confusing situation where set_ftrace_filter is empty but no functions ever get enabled for tracing. For example: # cat /debug/tracing/set_ftrace_filter #### all functions enabled #### # echo bad_name > set_ftrace_filter # cat /debug/tracing/set_ftrace_filter # echo function > current_tracer # cat trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | This patch changes that to only enable filtering if a function is set to be filtered on. Now, the filter is not enabled if a bad name is echoed into set_ftrace_filter. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e60ae423af9..340f88b68d9e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1153,8 +1153,6 @@ static void ftrace_match_records(char *buff, int len, int enable) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - if (enable) - ftrace_filtered = 1; do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1166,7 +1164,12 @@ static void ftrace_match_records(char *buff, int len, int enable) else rec->flags |= flag; } - + /* + * Only enable filtering if we have a function that + * is filtered on. + */ + if (enable && (rec->flags & FTRACE_FL_FILTER)) + ftrace_filtered = 1; } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } @@ -1217,9 +1220,6 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - if (enable) - ftrace_filtered = 1; - do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1232,6 +1232,8 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) else rec->flags |= flag; } + if (enable && (rec->flags & FTRACE_FL_FILTER)) + ftrace_filtered = 1; } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); -- cgit v1.2.3-59-g8ed1b From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 00:40:25 -0500 Subject: ftrace: add command interface for function selection Allow for other tracers to add their own commands for function selection. This interface gives a trace the ability to name a command for function selection. Right now it is pretty limited in what it offers, but this is a building step for more features. The :mod: command is converted to this interface and also serves as a template for other implementations. 
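[ Editorial sketch, not part of the patch: a hypothetical tracer could plug into this interface roughly as below. The "dump" name and the callback body are invented for illustration; struct ftrace_func_command and register_ftrace_command() are the interfaces added by this patch, modeled on the ftrace_mod_cmd template further down. ]

static int my_dump_callback(char *func, char *cmd, char *param, int enable)
{
	/*
	 * func   - the glob written before the first ':'
	 * cmd    - the command name that matched ("dump" here)
	 * param  - whatever followed the second ':', may be NULL
	 * enable - nonzero when written via set_ftrace_filter
	 */
	if (!param)
		return -EINVAL;

	/* act on the selected functions here */
	return 0;
}

static struct ftrace_func_command my_dump_cmd = {
	.name = "dump",
	.func = my_dump_callback,
};

static int __init my_dump_cmd_init(void)
{
	return register_ftrace_command(&my_dump_cmd);
}
device_initcall(my_dump_cmd_init);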
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 16 ++++++++ kernel/trace/ftrace.c | 106 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 106b7909d500..f0a0ecc63b5c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -95,6 +95,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, loff_t *ppos); #endif +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(char *func, char *cmd, + char *params, int enable); +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include @@ -119,6 +126,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -202,6 +212,12 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline int register_ftrace_command(struct ftrace_func_command *cmd) +{ +} +static inline int unregister_ftrace_command(char *cmd_name) +{ +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 340f88b68d9e..45a44c402566 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) spin_unlock(&ftrace_lock); } +/* + * We register the module command as a template to show others how + * to register the a command as well. + */ + +static int +ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +{ + char *mod; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. 
+ */ + + /* we must have a module name */ + if (!param) + return -EINVAL; + + mod = strsep(¶m, ":"); + if (!strlen(mod)) + return -EINVAL; + + ftrace_match_module_records(func, mod, enable); + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +device_initcall(ftrace_mod_cmd_init); + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + static int ftrace_process_regex(char *buff, int len, int enable) { - char *func, *mod, *command, *next = buff; + struct ftrace_func_command *p; + char *func, *command, *next = buff; + int ret = -EINVAL; func = strsep(&next, ":"); @@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable) return 0; } - /* command fonud */ + /* command found */ command = strsep(&next, ":"); - if (strcmp(command, "mod") == 0) { - /* only match modules */ - if (!next) - return -EINVAL; - - mod = strsep(&next, ":"); - ftrace_match_module_records(func, mod, enable); - return 0; + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(func, command, next, enable); + goto out_unlock; + } } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); - return -EINVAL; + return ret; } static ssize_t -- cgit v1.2.3-59-g8ed1b From 52baf11922db7377b580dd5448a07f71c6a35611 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 01:15:39 -0500 Subject: ftrace: convert ftrace_lock from a spinlock to mutex Impact: clean up The older versions of ftrace required doing the ftrace list search under atomic context. Now all the calls are in non-atomic context. There is no reason to keep the ftrace_lock as a spinlock. This patch converts it to a mutex. 
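[ Editorial sketch, not from the patch: the conversion boils down to the pattern below. Because ftrace_lock is now a sleeping lock, later code in this series can, for example, allocate with GFP_KERNEL while holding it during a record walk. ]

	/* before: the walk had to be safe in atomic context */
	spin_lock(&ftrace_lock);
	do_for_each_ftrace_rec(pg, rec) {
		/* must not sleep here */
	} while_for_each_ftrace_rec();
	spin_unlock(&ftrace_lock);

	/* after: every caller runs in process context, so sleeping is allowed */
	mutex_lock(&ftrace_lock);
	do_for_each_ftrace_rec(pg, rec) {
		/* may sleep, e.g. kmalloc(..., GFP_KERNEL) */
	} while_for_each_ftrace_rec();
	mutex_unlock(&ftrace_lock);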
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45a44c402566..4771732037ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -61,7 +61,7 @@ int function_trace_stop; */ static int ftrace_disabled __read_mostly; -static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); static DEFINE_MUTEX(ftrace_start_lock); @@ -134,8 +134,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); ops->next = ftrace_list; /* @@ -172,7 +171,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) #endif } - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return 0; } @@ -182,8 +181,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); /* * If we are removing the last function, then simply point @@ -224,7 +222,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -233,8 +231,7 @@ static void ftrace_update_pid_func(void) { ftrace_func_t func; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); if (ftrace_trace_function == ftrace_stub) goto out; @@ -256,7 +253,7 @@ static void ftrace_update_pid_func(void) #endif out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } #ifdef CONFIG_DYNAMIC_FTRACE @@ -358,15 +355,12 @@ void ftrace_release(void *start, unsigned long size) if (ftrace_disabled || !start) return; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); - + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e)) ftrace_free_rec(rec); } while_for_each_ftrace_rec(); - - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) @@ -803,8 +797,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -833,7 +826,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return rec; } @@ -962,8 +955,7 @@ static void ftrace_filter_reset(int enable) struct dyn_ftrace *rec; unsigned long type = enable ? 
FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; do_for_each_ftrace_rec(pg, rec) { @@ -971,8 +963,7 @@ static void ftrace_filter_reset(int enable) continue; rec->flags &= ~type; } while_for_each_ftrace_rec(); - - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static int @@ -1151,8 +1142,7 @@ static void ftrace_match_records(char *buff, int len, int enable) search_len = strlen(search); - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1171,7 +1161,7 @@ static void ftrace_match_records(char *buff, int len, int enable) if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static int @@ -1218,8 +1208,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) search_len = strlen(search); } - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1236,7 +1225,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } /* @@ -1676,9 +1665,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) if (ftrace_disabled) return -ENODEV; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); - + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) @@ -1699,7 +1686,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) } } while_for_each_ftrace_rec(); out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return found ? 0 : -EINVAL; } -- cgit v1.2.3-59-g8ed1b From e6ea44e9b4c12325337cd1c06103cd515a1c02b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 01:42:44 -0500 Subject: ftrace: consolidate mutexes Impact: clean up Now that ftrace_lock is a mutex, there is no reason to have three different mutexes protecting similar data. All the mutex paths are not in hot paths, so having a mutex to cover more data is not a problem. This patch removes the ftrace_sysctl_lock and ftrace_start_lock and uses the ftrace_lock to protect the locations that were protected by these locks. By doing so, this change also removes some of the lock nesting that was taking place. There are still more mutexes in ftrace.c that can probably be consolidated, but they can be dealt with later. We need to be careful about the way the locks are nested, and by consolidating, we can cause a recursive deadlock. 
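[ Editorial sketch of the hazard mentioned above, not taken from the patch: kernel mutexes are not recursive, so if two locks are merged carelessly, a path that used to nest two different locks now takes the same mutex twice and deadlocks. The example_lock and helper() names are invented. ]

static DEFINE_MUTEX(example_lock);	/* stand-in for a consolidated lock */

static void helper(void)
{
	mutex_lock(&example_lock);	/* second acquisition */
	/* ... */
	mutex_unlock(&example_lock);
}

static void caller(void)
{
	mutex_lock(&example_lock);	/* first acquisition */
	helper();			/* deadlock: the task blocks on itself */
	mutex_unlock(&example_lock);
}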
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 68 ++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4771732037ee..157d4f68b0e0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ int function_trace_stop; static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static DEFINE_MUTEX(ftrace_sysctl_lock); -static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -134,8 +132,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) static int __register_ftrace_function(struct ftrace_ops *ops) { - mutex_lock(&ftrace_lock); - ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -171,17 +167,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) #endif } - mutex_unlock(&ftrace_lock); - return 0; } static int __unregister_ftrace_function(struct ftrace_ops *ops) { struct ftrace_ops **p; - int ret = 0; - - mutex_lock(&ftrace_lock); /* * If we are removing the last function, then simply point @@ -190,17 +181,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; ftrace_list = &ftrace_list_end; - goto out; + return 0; } for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) if (*p == ops) break; - if (*p != ops) { - ret = -1; - goto out; - } + if (*p != ops) + return -1; *p = (*p)->next; @@ -221,10 +210,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } } - out: - mutex_unlock(&ftrace_lock); - - return ret; + return 0; } static void ftrace_update_pid_func(void) @@ -622,13 +608,10 @@ static void ftrace_startup(int command) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; ftrace_startup_enable(command); - - mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown(int command) @@ -636,7 +619,6 @@ static void ftrace_shutdown(int command) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); ftrace_start_up--; if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; @@ -647,11 +629,9 @@ static void ftrace_shutdown(int command) } if (!command || !ftrace_enabled) - goto out; + return; ftrace_run_update_code(command); - out: - mutex_unlock(&ftrace_start_lock); } static void ftrace_startup_sysctl(void) @@ -661,7 +641,6 @@ static void ftrace_startup_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ @@ -669,7 +648,6 @@ static void ftrace_startup_sysctl(void) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown_sysctl(void) @@ -679,13 +657,11 @@ static void ftrace_shutdown_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftrace_start_lock); } static cycle_t ftrace_update_time; @@ -1502,12 +1478,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } - 
mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftrace_start_lock); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); kfree(iter); mutex_unlock(&ftrace_regex_lock); @@ -1824,7 +1798,7 @@ static int ftrace_convert_nops(struct module *mod, unsigned long addr; unsigned long flags; - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -1843,7 +1817,7 @@ static int ftrace_convert_nops(struct module *mod, local_irq_save(flags); ftrace_update_code(mod); local_irq_restore(flags); - mutex_unlock(&ftrace_start_lock); + mutex_unlock(&ftrace_lock); return 0; } @@ -2016,7 +1990,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); if (val < 0) { /* disable pid tracing */ if (!ftrace_pid_trace) @@ -2055,7 +2029,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, ftrace_startup_enable(0); out: - mutex_unlock(&ftrace_start_lock); + mutex_unlock(&ftrace_lock); return cnt; } @@ -2118,12 +2092,12 @@ int register_ftrace_function(struct ftrace_ops *ops) if (unlikely(ftrace_disabled)) return -1; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = __register_ftrace_function(ops); ftrace_startup(0); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2137,10 +2111,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); ftrace_shutdown(0); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2155,7 +2129,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); @@ -2184,7 +2158,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } out: - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2296,7 +2270,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, { int ret = 0; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); @@ -2314,13 +2288,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_startup(FTRACE_START_FUNC_RET); out: - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } void unregister_ftrace_graph(void) { - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; @@ -2328,7 +2302,7 @@ void unregister_ftrace_graph(void) ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); } /* Allocate a return stack for newly created task */ -- cgit v1.2.3-59-g8ed1b From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 15:29:06 -0500 Subject: ftrace: trace different functions with a different tracer Impact: new feature Currently, the function tracer only gives you an ability to hook a tracer to all functions being traced. 
The dynamic function trace allows you to pick and choose which of those functions will be traced, but all functions being traced will call all tracers that registered with the function tracer. This patch adds a new feature that allows a tracer to hook to specific functions, even when all functions are being traced. It allows for different functions to call different tracer hooks. The way this is accomplished is by a special function that will hook to the function tracer and will set up a hash table knowing which tracer hook to call with which function. This is the most general and easiest method to accomplish this. Later, an arch may choose to supply their own method in changing the mcount call of a function to call a different tracer. But that will be an exercise for the future. To register a function: struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); }; int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data); glob is a simple glob to search for the functions to hook. ops is a pointer to the operations (listed below) data is the default data to be passed to the hook functions when traced ops: func is the hook function to call when the functions are traced callback is a callback function that is called when setting up the hash. That is, if the tracer needs to do something special for each function, that is being traced, and wants to give each function its own data. The address of the entry data is passed to this callback, so that the callback may wish to update the entry to whatever it would like. free is a callback for when the entry is freed. In case the tracer allocated any data, it is give the chance to free it. To unregister we have three functions: void unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data) This will unregister all hooks that match glob, point to ops, and have its data matching data. (note, if glob is NULL, blank or '*', all functions will be tested). void unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) This will unregister all functions matching glob that has an entry pointing to ops. void unregister_ftrace_function_hook_all(char *glob) This simply unregisters all funcs. 
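[ Editorial sketch: a hypothetical user of the interface described above. The my_count_* names are invented for illustration; the ftrace_hook_ops layout, the registration calls and the meaning of the data slot are the ones this patch introduces. ]

/* called by the function tracer for every matched function */
static void my_count_func(unsigned long ip, unsigned long parent_ip,
			  void **data)
{
	long *count = (long *)data;

	(*count)++;		/* use the per-function data slot as a counter */
}

/* called once per matched function while the hash is being set up */
static int my_count_callback(unsigned long ip, void **data)
{
	*data = (void *)0;	/* start each function's counter at zero */
	return 0;		/* a negative return would skip this function */
}

static struct ftrace_hook_ops my_count_ops = {
	.func		= my_count_func,
	.callback	= my_count_callback,
};

static int __init my_count_init(void)
{
	int ret;

	/* hook every function whose name starts with "sched_" */
	ret = register_ftrace_function_hook("sched_*", &my_count_ops, NULL);

	/* the return value is the number of functions hooked, or -errno */
	return ret < 0 ? ret : 0;
}

static void my_count_exit(void)
{
	/* drop every hook that points at my_count_ops */
	unregister_ftrace_function_hook_func("sched_*", &my_count_ops);
}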
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 18 ++++ kernel/trace/ftrace.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0a0ecc63b5c..13918c4400ad 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,6 +106,24 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct ftrace_hook_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*callback)(unsigned long ip, void **data); + void (*free)(void **data); +}; + +extern int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); +extern void unregister_ftrace_function_hook_all(char *glob); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 157d4f68b0e0..0b80e325f296 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +static void +function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_func_hook *entry; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + int resched; + + key = hash_long(ip, FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + resched = ftrace_preempt_disable(); + hlist_for_each_entry_rcu(entry, n, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_hook_ops __read_mostly = +{ + .func = function_trace_hook_call, +}; + +static int ftrace_hook_registered; + +static void __enable_ftrace_function_hook(void) +{ + int i; + + if (ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? 
*/ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + __register_ftrace_function(&trace_hook_ops); + ftrace_startup(0); + ftrace_hook_registered = 1; +} + +static void __disable_ftrace_function_hook(void) +{ + int i; + + if (!ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + __unregister_ftrace_function(&trace_hook_ops); + ftrace_shutdown(0); + ftrace_hook_registered = 0; +} + + +static void ftrace_free_entry_rcu(struct rcu_head *rhp) +{ + struct ftrace_func_hook *entry = + container_of(rhp, struct ftrace_func_hook, rcu); + + if (entry->ops->free) + entry->ops->free(&entry->data); + kfree(entry); +} + + +int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + struct ftrace_func_hook *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long key; + int type, len, not; + int count = 0; + char *search; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (!ftrace_match_record(rec, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not hook to any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (ops->callback) { + if (ops->callback(rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + __enable_ftrace_function_hook(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + return count; +} + +enum { + HOOK_TEST_FUNC = 1, + HOOK_TEST_DATA = 2 +}; + +static void +__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data, int flags) +{ + struct ftrace_func_hook *entry; + struct hlist_node *n, *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + + if (glob && (strcmp(glob, "*") || !strlen(glob))) + glob = NULL; + else { + int not; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' 
for function hooks */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&ftrace_lock); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & HOOK_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, glob, len, type)) + continue; + } + + hlist_del(&entry->node); + call_rcu(&entry->rcu, ftrace_free_entry_rcu); + } + } + __disable_ftrace_function_hook(); + mutex_unlock(&ftrace_lock); +} + +void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + __unregister_ftrace_function_hook(glob, ops, data, + HOOK_TEST_FUNC | HOOK_TEST_DATA); +} + +void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +{ + __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); +} + +void unregister_ftrace_function_hook_all(char *glob) +{ + __unregister_ftrace_function_hook(glob, NULL, NULL, 0); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -- cgit v1.2.3-59-g8ed1b From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:17:02 -0500 Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled This patch adds the tracing_is_on() interface to tell if the ring buffer is turned on or not. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8e6646a54acf..f5e793d69bd3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -128,10 +128,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +int tracing_is_on(void); #else static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } #endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2b4626ce95d6..8f19f1aa42b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -98,6 +98,15 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + #include "trace.h" /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit v1.2.3-59-g8ed1b From 23b4ff3aa479c9e3bb23cb6b2d0a97878399784a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:04:24 -0500 Subject: ftrace: add traceon traceoff commands to enable/disable the buffers This patch adds the new function selection commands traceon and traceoff. traceon sets the function to enable the ring buffers while traceoff disables the ring buffers. You can pass in the number of times you want the command to be executed when the function is hit. It will only execute if the state of the buffers are not already in that state. 
Example: # echo do_fork:traceon:4 Will enable the ring buffers if they are disabled every time it hits do_fork, up to 4 times. # echo sys_close:traceoff This will disable the ring buffers every time (unlimited) when sys_close is called. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 135 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 36bf9568ccd9..5c95708b9dc3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -9,6 +9,7 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ +#include #include #include #include @@ -231,9 +232,143 @@ static struct tracer function_trace __read_mostly = #endif }; +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (!tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_off(); +} + +static struct ftrace_hook_ops traceon_hook_ops = { + .func = ftrace_traceon, +}; + +static struct ftrace_hook_ops traceoff_hook_ops = { + .func = ftrace_traceoff, +}; + +static int +ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +{ + struct ftrace_hook_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_hook_ops; + else + ops = &traceoff_hook_ops; + + unregister_ftrace_function_hook_func(glob, ops); + + return 0; +} + +static int +ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_hook_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + if (glob[0] == '!') + return ftrace_trace_onoff_unreg(glob+1, cmd, param); + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_hook_ops; + else + ops = &traceoff_hook_ops; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. 
+ */ + ret = strict_strtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_hook(glob, ops, count); + + return ret; +} + +static struct ftrace_func_command ftrace_traceon_cmd = { + .name = "traceon", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_traceoff_cmd = { + .name = "traceoff", + .func = ftrace_trace_onoff_callback, +}; + +static int __init init_func_cmd_traceon(void) +{ + int ret; + + ret = register_ftrace_command(&ftrace_traceoff_cmd); + if (ret) + return ret; + + ret = register_ftrace_command(&ftrace_traceon_cmd); + if (ret) + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; +} +#else +static inline int init_func_cmd_traceon(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + static __init int init_function_trace(void) { + init_func_cmd_traceon(); return register_tracer(&function_trace); } device_initcall(init_function_trace); + -- cgit v1.2.3-59-g8ed1b From 8fc0c701c5b6c0c3e242758c3acef6f9047940a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 15:28:00 -0500 Subject: ftrace: show selected functions in set_ftrace_filter This patch adds output to show what functions have tracer hooks attached to them. # echo 'sys_open:traceon:4' > /debug/tracing/set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:ftrace_traceon:0000000000000004 # echo 'do_fork:traceoff:' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:ftrace_traceon:0000000000000002 do_fork:ftrace_traceoff:ffffffffffffffff Note the 4 changed to a 2. This is because The code was executed twice since the traceoff was added. If a cat is done again: #### all functions enabled #### sys_open:ftrace_traceon do_fork:ftrace_traceoff:ffffffffffffffff The number disappears. That is because it will not print a NULL. Callbacks to allow the tracer to pretty print will be implemented soon. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 123 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0b80e325f296..1e058848cddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,14 +45,14 @@ ftrace_kill(); \ } while (0) +/* hash bits for specific function selection */ +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - /* Quick disabling of function tracer. 
*/ int function_trace_stop; @@ -248,6 +248,21 @@ static void ftrace_update_pid_func(void) # error Dynamic ftrace depends on MCOUNT_RECORD #endif +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -750,12 +765,14 @@ enum { FTRACE_ITER_NOTRACE = (1 << 2), FTRACE_ITER_FAILURES = (1 << 3), FTRACE_ITER_PRINTALL = (1 << 4), + FTRACE_ITER_HASH = (1 << 5), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { struct ftrace_page *pg; + int hidx; int idx; unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; @@ -763,18 +780,87 @@ struct ftrace_iterator { unsigned filtered; }; +static void * +t_hash_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct hlist_node *hnd = v; + struct hlist_head *hhd; + + WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); + + (*pos)++; + + retry: + if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + return NULL; + + hhd = &ftrace_func_hash[iter->hidx]; + + if (hlist_empty(hhd)) { + iter->hidx++; + hnd = NULL; + goto retry; + } + + if (!hnd) + hnd = hhd->first; + else { + hnd = hnd->next; + if (!hnd) { + iter->hidx++; + goto retry; + } + } + + return hnd; +} + +static void *t_hash_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + + iter->flags |= FTRACE_ITER_HASH; + + return t_hash_next(m, p, pos); +} + +static int t_hash_show(struct seq_file *m, void *v) +{ + struct ftrace_func_hook *rec; + struct hlist_node *hnd = v; + char str[KSYM_SYMBOL_LEN]; + + rec = hlist_entry(hnd, struct ftrace_func_hook, node); + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, "%s:", str); + + kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); + seq_printf(m, "%s", str); + + if (rec->data) + seq_printf(m, ":%p", rec->data); + seq_putc(m, '\n'); + + return 0; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = NULL; + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_next(m, v, pos); + (*pos)++; if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; - mutex_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -803,7 +889,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } - mutex_unlock(&ftrace_lock); return rec; } @@ -813,6 +898,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) struct ftrace_iterator *iter = m->private; void *p = NULL; + mutex_lock(&ftrace_lock); /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all @@ -820,12 +906,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) */ if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { if (*pos > 0) - return NULL; + return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; (*pos)++; return iter; } + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_start(m, pos); + if (*pos > 0) { if (iter->idx < 0) return p; @@ -835,11 +924,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) p = t_next(m, p, pos); + if (!p) + return t_hash_start(m, 
pos); + return p; } static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&ftrace_lock); } static int t_show(struct seq_file *m, void *v) @@ -848,6 +941,9 @@ static int t_show(struct seq_file *m, void *v) struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_show(m, v); + if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; @@ -1246,19 +1342,6 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); -#define FTRACE_HASH_BITS 7 -#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_hook { - struct hlist_node node; - struct ftrace_hook_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; -}; - static void function_trace_hook_call(unsigned long ip, unsigned long parent_ip) { -- cgit v1.2.3-59-g8ed1b From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:06:01 -0500 Subject: ftrace: add pretty print to selected fuction traces This patch adds a call back for the tracers that have hooks to selected functions. This allows the tracer to show better output in the set_ftrace_filter file. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 13918c4400ad..b331e216d8a1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,12 +106,18 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct seq_file; + struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_hook_ops *ops, + void *data); }; extern int diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e058848cddb..6533c1d20155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v) rec = hlist_entry(hnd, struct ftrace_func_hook, node); + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); -- cgit v1.2.3-59-g8ed1b From e110e3d1eaa0f9628918be67ddd32e8ad65a2871 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:38:13 -0500 Subject: ftrace: add pretty print function for traceon and traceoff hooks This patch adds a pretty print version of traceon and traceoff output for set_ftrace_filter. 
# echo 'sys_open:traceon:4' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:count=4 Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5c95708b9dc3..f520aa419dff 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,14 +267,42 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) tracing_off(); } +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_hook_ops *ops, void *data); + static struct ftrace_hook_ops traceon_hook_ops = { .func = ftrace_traceon, + .print = ftrace_trace_onoff_print, }; static struct ftrace_hook_ops traceoff_hook_ops = { .func = ftrace_traceoff, + .print = ftrace_trace_onoff_print, }; +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_hook_ops *ops, void *data) +{ + char str[KSYM_SYMBOL_LEN]; + long count = (long)data; + + kallsyms_lookup(ip, NULL, NULL, NULL, str); + seq_printf(m, "%s:", str); + + if (ops == &traceon_hook_ops) + seq_printf(m, "traceon"); + else + seq_printf(m, "traceoff"); + + if (count != -1) + seq_printf(m, ":count=%ld", count); + seq_putc(m, '\n'); + + return 0; +} + static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { -- cgit v1.2.3-59-g8ed1b From 73d3fd96e77745742f3750b7b19ee42204adc210 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 11:48:18 +0100 Subject: ftrace: fix !CONFIG_DYNAMIC_FTRACE ftrace_swapper_pid definition Impact: build fix Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6533c1d20155..4e6c87ecf1bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,14 +243,16 @@ static void ftrace_update_pid_func(void) mutex_unlock(&ftrace_lock); } +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + #ifdef CONFIG_DYNAMIC_FTRACE + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; struct ftrace_func_hook { -- cgit v1.2.3-59-g8ed1b From 6a24a244cd3a02d5b290293c32fcf2c6e92b4235 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 11:20:26 -0500 Subject: ftrace: clean up coding style Ingo Molnar pointed out some coding style issues with the recent ftrace updates. This patch cleans them up. 
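[ Editorial note: most of the hunks below simply reorder local variable declarations into the roughly longest-line-first order commonly preferred in the kernel, e.g. ftrace_match_records() ends up with: ]

	unsigned int search_len;
	struct ftrace_page *pg;
	struct dyn_ftrace *rec;
	unsigned long flag;
	char *search;
	int type;
	int not;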
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++----------- kernel/trace/trace_functions.c | 1 - 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4e6c87ecf1bf..af9d95c0e4de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -460,8 +460,8 @@ static void ftrace_bug(int failed, unsigned long ip) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { - unsigned long ip, fl; unsigned long ftrace_addr; + unsigned long ip, fl; ftrace_addr = (unsigned long)FTRACE_ADDR; @@ -530,9 +530,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) static void ftrace_replace_code(int enable) { - int failed; struct dyn_ftrace *rec; struct ftrace_page *pg; + int failed; do_for_each_ftrace_rec(pg, rec) { /* @@ -1208,14 +1208,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) static void ftrace_match_records(char *buff, int len, int enable) { - char *search; + unsigned int search_len; struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long flag; + char *search; int type; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned search_len; int not; + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; type = ftrace_setup_glob(buff, len, &search, ¬); search_len = strlen(search); @@ -1263,14 +1264,16 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, static void ftrace_match_module_records(char *buff, char *mod, int enable) { - char *search = buff; + unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned search_len = 0; + char *search = buff; + unsigned long flag; int not = 0; + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + /* blank or '*' mean the same */ if (strcmp(buff, "*") == 0) buff[0] = 0; @@ -1442,8 +1445,8 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, struct ftrace_func_hook *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; - unsigned long key; int type, len, not; + unsigned long key; int count = 0; char *search; @@ -1623,8 +1626,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) static int ftrace_process_regex(char *buff, int len, int enable) { - struct ftrace_func_command *p; char *func, *command, *next = buff; + struct ftrace_func_command *p; int ret = -EINVAL; func = strsep(&next, ":"); @@ -2392,7 +2395,6 @@ static __init int ftrace_init_debugfs(void) "'set_ftrace_pid' entry\n"); return 0; } - fs_initcall(ftrace_init_debugfs); /** diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f520aa419dff..021a574c5988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -397,6 +397,5 @@ static __init int init_function_trace(void) init_func_cmd_traceon(); return register_tracer(&function_trace); } - device_initcall(init_function_trace); -- cgit v1.2.3-59-g8ed1b From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 12:32:04 -0500 Subject: ftrace: rename _hook to _probe Impact: clean up Ingo Molnar did not like the _hook naming convention used by the select function tracer. Luis Claudio R. Goncalves suggested using the "_probe" extension. This patch implements the change of calling the functions and variables "_hook" and replacing them with "_probe". 
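[ Editorial note: in terms of the earlier sketch, the hypothetical my_count user would, after this rename, read as follows (same semantics, only the identifiers change): ]

static struct ftrace_probe_ops my_count_ops = {
	.func		= my_count_func,
	.callback	= my_count_callback,
};

/* from process context, e.g. an initcall, exactly as before: */
register_ftrace_function_probe("sched_*", &my_count_ops, NULL);
unregister_ftrace_function_probe_func("sched_*", &my_count_ops);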
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 +++---- kernel/trace/ftrace.c | 78 +++++++++++++++++++++--------------------- kernel/trace/trace_functions.c | 26 +++++++------- 3 files changed, 58 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63281228ce3e..9d224c43e634 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -108,7 +108,7 @@ struct ftrace_func_command { struct seq_file; -struct ftrace_hook_ops { +struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); @@ -116,19 +116,19 @@ struct ftrace_hook_ops { void (*free)(void **data); int (*print)(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, + struct ftrace_probe_ops *ops, void *data); }; extern int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); -extern void unregister_ftrace_function_hook_all(char *glob); +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); enum { FTRACE_FL_FREE = (1 << 0), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af9d95c0e4de..330a059f6ed7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; -struct ftrace_func_hook { +struct ftrace_func_probe { struct hlist_node node; - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; unsigned long flags; unsigned long ip; void *data; @@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) static int t_hash_show(struct seq_file *m, void *v) { - struct ftrace_func_hook *rec; + struct ftrace_func_probe *rec; struct hlist_node *hnd = v; char str[KSYM_SYMBOL_LEN]; - rec = hlist_entry(hnd, struct ftrace_func_hook, node); + rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void -function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +function_trace_probe_call(unsigned long ip, unsigned long parent_ip) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_head *hhd; struct hlist_node *n; unsigned long key; @@ -1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } -static struct ftrace_ops trace_hook_ops __read_mostly = +static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_hook_call, + .func = function_trace_probe_call, }; -static int ftrace_hook_registered; +static int ftrace_probe_registered; -static void __enable_ftrace_function_hook(void) +static void __enable_ftrace_function_probe(void) { int i; - if (ftrace_hook_registered) + if (ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void) if (i == 
FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_hook_ops); + __register_ftrace_function(&trace_probe_ops); ftrace_startup(0); - ftrace_hook_registered = 1; + ftrace_probe_registered = 1; } -static void __disable_ftrace_function_hook(void) +static void __disable_ftrace_function_probe(void) { int i; - if (!ftrace_hook_registered) + if (!ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_hook_ops); + __unregister_ftrace_function(&trace_probe_ops); ftrace_shutdown(0); - ftrace_hook_registered = 0; + ftrace_probe_registered = 0; } static void ftrace_free_entry_rcu(struct rcu_head *rhp) { - struct ftrace_func_hook *entry = - container_of(rhp, struct ftrace_func_hook, rcu); + struct ftrace_func_probe *entry = + container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) entry->ops->free(&entry->data); @@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; @@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' for function probes */ if (WARN_ON(not)) return -EINVAL; @@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { - /* If we did not hook to any, then return error */ + /* If we did not process any, then return error */ if (!count) count = -ENOMEM; goto out_unlock; @@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); - __enable_ftrace_function_hook(); + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, } enum { - HOOK_TEST_FUNC = 1, - HOOK_TEST_DATA = 2 + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 }; static void -__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' 
for function probes */ if (WARN_ON(not)) return; } @@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { /* break up if statements for readability */ - if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) continue; - if ((flags & HOOK_TEST_DATA) && entry->data != data) + if ((flags & PROBE_TEST_DATA) && entry->data != data) continue; /* do this last, since it is the most expensive */ @@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, call_rcu(&entry->rcu, ftrace_free_entry_rcu); } } - __disable_ftrace_function_hook(); + __disable_ftrace_function_probe(); mutex_unlock(&ftrace_lock); } void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - __unregister_ftrace_function_hook(glob, ops, data, - HOOK_TEST_FUNC | HOOK_TEST_DATA); + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); } void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { - __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); } -void unregister_ftrace_function_hook_all(char *glob) +void unregister_ftrace_function_probe_all(char *glob) { - __unregister_ftrace_function_hook(glob, NULL, NULL, 0); + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 021a574c5988..6ea73ed03bfa 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data); + struct ftrace_probe_ops *ops, void *data); -static struct ftrace_hook_ops traceon_hook_ops = { +static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_trace_onoff_print, }; -static struct ftrace_hook_ops traceoff_hook_ops = { +static struct ftrace_probe_ops traceoff_probe_ops = { .func = ftrace_traceoff, .print = ftrace_trace_onoff_print, }; static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data) + struct ftrace_probe_ops *ops, void *data) { char str[KSYM_SYMBOL_LEN]; long count = (long)data; @@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, kallsyms_lookup(ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); - if (ops == &traceon_hook_ops) + if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); else seq_printf(m, "traceoff"); @@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; - unregister_ftrace_function_hook_func(glob, ops); + unregister_ftrace_function_probe_func(glob, ops); return 0; } @@ -322,7 
+322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) static int ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; if (!param) goto out_reg; @@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) return ret; out_reg: - ret = register_ftrace_function_hook(glob, ops, count); + ret = register_ftrace_function_probe(glob, ops, count); return ret; } -- cgit v1.2.3-59-g8ed1b From af513098452b8887d7c0e15a39d7cb74479501bd Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:07:28 -0500 Subject: tracing: use the more proper parameter Pass tsk to tracing_record_cmdline instead of current. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95f99a7abf2f..dc61e82faad9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -336,7 +336,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) data->rt_priority = tsk->rt_priority; /* record this tasks comm */ - tracing_record_cmdline(current); + tracing_record_cmdline(tsk); } static void -- cgit v1.2.3-59-g8ed1b From d2ef7c2f0f9ab48c25eafc0ebad0df5f7930420b Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:09:47 -0500 Subject: tracing: fix the return value of trace selftest This patch is to fix the return value of trace_selftest_startup_sysprof and trace_selftest_startup_branch on failure. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0c9aa1457e51..c72e749bcbef 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -622,7 +622,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); - return 0; + return ret; } /* Sleep for a 1/10 of a second */ @@ -634,6 +634,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + return ret; } #endif /* CONFIG_SYSPROF_TRACER */ @@ -661,6 +666,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + return ret; } #endif /* CONFIG_BRANCH_TRACER */ -- cgit v1.2.3-59-g8ed1b From 73d8b8bc4f24a97a406d09c8268ac019f4ac661e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:10:02 -0500 Subject: tracing: fix typing mistake in hint message and comments Impact: cleanup Fix incorrect hint message in code and typos in comments. 
Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_sysprof.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c6b442d88de8..9e5ebd844158 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -1,5 +1,5 @@ /* - * trace irqs off criticall timings + * trace irqs off critical timings * * Copyright (C) 2007-2008 Steven Rostedt * Copyright (C) 2008 Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 30e14fe85896..82fbb5a2df89 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -93,7 +93,7 @@ static int tracing_sched_register(void) ret = register_trace_sched_switch(probe_sched_switch); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_schedule\n"); + " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 96d716485898..276c51aaf314 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -284,7 +284,7 @@ static void start_wakeup_tracer(struct trace_array *tr) ret = register_trace_sched_switch(probe_wakeup_sched_switch); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_schedule\n"); + " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c72e749bcbef..01415f4edaa5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,9 +107,9 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, func(); /* - * Some archs *cough*PowerPC*cough* add charachters to the + * Some archs *cough*PowerPC*cough* add characters to the * start of the function names. We simply put a '*' to - * accomodate them. + * accommodate them. */ func_name = "*" STR(DYN_FTRACE_TEST_NAME); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index eae9cef39291..39310e3434ee 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -30,7 +30,7 @@ struct tracer_stat_session { struct dentry *file; }; -/* All of the sessions currently in use. Each stat file embeed one session */ +/* All of the sessions currently in use. 
Each stat file embed one session */ static LIST_HEAD(all_stat_sessions); static DEFINE_MUTEX(all_stat_sessions_mutex); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 7c9a2d82a7d8..c771af4e8f1a 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -327,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer) d_tracer, NULL, &sysprof_sample_fops); if (entry) return; - pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); + pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } -- cgit v1.2.3-59-g8ed1b From 35ebf1caa4854ad5ba25f3a72967acc064147994 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:12:12 -0500 Subject: ftrace: show unlimited when traceon or traceoff has no counter Impact: clean up The traceon and traceoff function probes are confusing to developers to what happens when a counter is not specified. This should help clear things up. # echo "*:traceoff" > set_ftrace_filter # cat /debug/tracing/set_ftrace_filter #### all functions enabled #### do_fork:traceoff:unlimited Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 6ea73ed03bfa..4c113a8c466f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -296,7 +296,9 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, else seq_printf(m, "traceoff"); - if (count != -1) + if (count == -1) + seq_printf(m, ":unlimited\n"); + else seq_printf(m, ":count=%ld", count); seq_putc(m, '\n'); -- cgit v1.2.3-59-g8ed1b From 6eaaa5d57e76c454479833fc8594cd7c3b75c789 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Feb 2009 02:25:00 +0100 Subject: tracing/core: use appropriate waiting on trace_pipe Impact: api and pipe waiting change Currently, the waiting used in tracing_read_pipe() is done through a 100 msecs schedule_timeout() loop which periodically check if there are traces on the buffer. This can cause small latencies for programs which are reading the incoming events. This patch makes the reader waiting for the trace_wait waitqueue except for few tracers such as the sched and functions tracers which might be already hold the runqueue lock while waking up the reader. This is performed through a new callback wait_pipe() on struct tracer. If none is implemented on a specific tracer, the default waiting for trace_wait queue is attached. 
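For illustration only (the tracer name and its init/reset callbacks below are invented), this is what opting into the new behaviour looks like from a tracer's point of view, based on the register_tracer() and poll_wait_pipe() changes in this patch:

	/*
	 * Sketch only: "example" and its callbacks are hypothetical.
	 * Leaving .wait_pipe NULL means register_tracer() wires up
	 * default_wait_pipe(), which sleeps on the trace_wait waitqueue.
	 * A tracer that may wake the reader while holding the runqueue
	 * lock would instead set .wait_pipe = poll_wait_pipe to keep the
	 * old 100 msecs polling behaviour.
	 */
	static struct tracer example_tracer __read_mostly =
	{
		.name	= "example",
		.init	= example_trace_init,
		.reset	= example_trace_reset,
	};
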
Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 62 ++++++++++++++++++++++++------------ kernel/trace/trace.h | 25 +++++++++++++-- kernel/trace/trace_functions.c | 1 + kernel/trace/trace_functions_graph.c | 1 + kernel/trace/trace_sched_switch.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 6 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc61e82faad9..881a94474d79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -499,6 +499,9 @@ __acquires(kernel_lock) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + if (!type->wait_pipe) + type->wait_pipe = default_wait_pipe; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest && !tracing_selftest_disabled) { @@ -1064,7 +1067,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - trace_buffer_unlock_commit(tr, event, flags, pc); + + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); } void @@ -2392,6 +2398,38 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } + +void default_wait_pipe(struct trace_iterator *iter) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + + if (trace_empty(iter)) + schedule(); + + finish_wait(&trace_wait, &wait); +} + +/* + * This is a make-shift waitqueue. + * A tracer might use this callback on some rare cases: + * + * 1) the current tracer might hold the runqueue lock when it wakes up + * a reader, hence a deadlock (sched, function, and function graph tracers) + * 2) the function tracers, trace all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * + * Anyway, this is really very primitive wakeup. + */ +void poll_wait_pipe(struct trace_iterator *iter) +{ + set_current_state(TASK_INTERRUPTIBLE); + /* sleep for 100 msecs, and try again. */ + schedule_timeout(HZ / 10); +} + /* Must be called with trace_types_lock mutex held. */ static int tracing_wait_pipe(struct file *filp) { @@ -2403,30 +2441,14 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - /* - * This is a make-shift waitqueue. The reason we don't use - * an actual wait queue is because: - * 1) we only ever have one waiter - * 2) the tracing, traces all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * Anyway, this is really very primitive wakeup. - */ - set_current_state(TASK_INTERRUPTIBLE); - iter->tr->waiter = current; - mutex_unlock(&trace_types_lock); - /* sleep for 100 msecs, and try again. 
*/ - schedule_timeout(HZ/10); + iter->trace->wait_pipe(iter); mutex_lock(&trace_types_lock); - iter->tr->waiter = NULL; - - if (signal_pending(current)) { + if (signal_pending(current)) return -EINTR; - } if (iter->trace != current_trace) return 0; @@ -2442,8 +2464,6 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracer_enabled && iter->pos) break; - - continue; } return 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dbff0207b213..eed732c151fc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -337,18 +337,34 @@ struct tracer_flags { #define TRACER_OPT(s, b) .name = #s, .bit = b -/* - * A specific tracer, represented by methods that operate on a trace array: +/** + * struct tracer - a specific tracer and its callbacks to interact with debugfs + * @name: the name chosen to select it on the available_tracers file + * @init: called when one switches to this tracer (echo name > current_tracer) + * @reset: called when one switches to another tracer + * @start: called when tracing is unpaused (echo 1 > tracing_enabled) + * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @open: called when the trace file is opened + * @pipe_open: called when the trace_pipe file is opened + * @wait_pipe: override how the user waits for traces on trace_pipe + * @close: called when the trace file is released + * @read: override the default read callback on trace_pipe + * @splice_read: override the default splice_read callback on trace_pipe + * @selftest: selftest to run on boot (see trace_selftest.c) + * @print_headers: override the first lines that describe your columns + * @print_line: callback that prints a trace + * @set_flag: signals one of your private flags changed (trace_options file) + * @flags: your private flags */ struct tracer { const char *name; - /* Your tracer should raise a warning if init fails */ int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); + void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, @@ -432,6 +448,9 @@ void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); +void default_wait_pipe(struct trace_iterator *iter); +void poll_wait_pipe(struct trace_iterator *iter); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4c113a8c466f..c9a0b7df44ff 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -225,6 +225,7 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, + .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 519a0cab1530..0ff5cb661900 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -757,6 +757,7 @@ static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, + .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = 
print_graph_function, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 82fbb5a2df89..77132c2cf3d9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -221,6 +221,7 @@ static struct tracer sched_switch_trace __read_mostly = .reset = sched_switch_trace_reset, .start = sched_switch_trace_start, .stop = sched_switch_trace_stop, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 276c51aaf314..db55f7aaa640 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -380,6 +380,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, -- cgit v1.2.3-59-g8ed1b From fa7c7f6e11f70d62505074a8b30a776236850dec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Feb 2009 02:51:30 +0100 Subject: tracing/core: remove unused parameter in tracing_fill_pipe_page() Impact: cleanup The struct page *pages parameter is unused. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 881a94474d79..e1f3b99a2e52 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2571,8 +2571,7 @@ static struct pipe_buf_operations tracing_pipe_buf_ops = { }; static size_t -tracing_fill_pipe_page(struct page *pages, size_t rem, - struct trace_iterator *iter) +tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; int ret; @@ -2649,7 +2648,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (!pages[i]) break; - rem = tracing_fill_pipe_page(pages[i], rem, iter); + rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, -- cgit v1.2.3-59-g8ed1b From d1f9cbd78841f1a797c77e9117e4882f932c2ef6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 18 Feb 2009 04:25:25 +0100 Subject: tracing/function-graph-tracer: fix traces weirdness while absolute time printing Impact: trace output cleanup/reordering When an interrupt occurs and and the abstime option is selected: echo funcgraph-abstime > /debug/tracing/trace_options then we observe broken traces: 30581.025422 | 0) Xorg-4291 | 0.503 us | idle_cpu(); 30581.025424 | 0) Xorg-4291 | 2.576 us | } 30581.025424 | 0) Xorg-4291 | + 75.771 us | } 0) Xorg-4291 | <========== | 30581.025425 | 0) Xorg-4291 | | schedule() { 30581.025426 | 0) Xorg-4291 | | __schedule() { 30581.025426 | 0) Xorg-4291 | 0.705 us | _spin_lock_irq(); With this patch, the interrupts output better adapts to absolute time printing: 414.856543 | 1) Xorg-4279 | 8.816 us | } 414.856544 | 1) Xorg-4279 | 0.525 us | rcu_irq_exit(); 414.856545 | 1) Xorg-4279 | 0.526 us | idle_cpu(); 414.856546 | 1) Xorg-4279 | + 12.157 us | } 414.856549 | 1) Xorg-4279 | ! 104.114 us | } 414.856549 | 1) Xorg-4279 | <========== | 414.856549 | 1) Xorg-4279 | ! 107.944 us | } 414.856550 | 1) Xorg-4279 | ! 137.010 us | } 414.856551 | 1) Xorg-4279 | 0.624 us | _read_unlock(); 414.856552 | 1) Xorg-4279 | ! 140.930 us | } 414.856552 | 1) Xorg-4279 | ! 
166.159 us | } Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 50 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6c7738e4f98b..8f4004a00b4e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -351,16 +351,35 @@ print_graph_overhead(unsigned long long duration, struct trace_seq *s) return trace_seq_printf(s, " "); } +static int print_graph_abs_time(u64 t, struct trace_seq *s) +{ + unsigned long usecs_rem; + + usecs_rem = do_div(t, NSEC_PER_SEC); + usecs_rem /= 1000; + + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); +} + static enum print_line_t -print_graph_irq(struct trace_seq *s, unsigned long addr, +print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid) { int ret; + struct trace_seq *s = &iter->seq; if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -446,17 +465,6 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) } -static int print_graph_abs_time(u64 t, struct trace_seq *s) -{ - unsigned long usecs_rem; - - usecs_rem = do_div(t, 1000000000); - usecs_rem /= 1000; - - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); -} - /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -561,7 +569,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; /* Interrupt */ - ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); + ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -581,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); + ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -605,11 +613,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int i; int ret; int cpu = iter->cpu; - pid_t *last_pid = iter->private; + pid_t *last_pid = iter->private, pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Absolute time */ @@ -668,7 +676,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -684,6 +692,10 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, int cpu = iter->cpu; pid_t *last_pid = iter->private; + /* Pid */ + if (verif_pid(s, ent->pid, 
cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -691,10 +703,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); -- cgit v1.2.3-59-g8ed1b From 00a8bf859331e349713274825e6fbf20bf2ac15a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Feb 2009 13:01:37 +0100 Subject: tracing/function-graph-tracer: fix merge Merge artifact: pid got changed to ent->pid meanwhile. Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8f4004a00b4e..c009553a8e81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -589,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); + ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From f9349a8f978929a0c71d2c42ae299f7d462c239d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Feb 2009 21:13:12 +0100 Subject: tracing/function-graph-tracer: make set_graph_function file support ftrace regex Impact: trace only functions matching a pattern The set_graph_function file let one to trace only one or several chosen functions and follow all their code flow. 
Currently, only a constant function name is allowed so this patch allows the ftrace_regex functions: - matches all functions that end with "name": echo *name > set_graph_function - matches all functions that begin with "name": echo name* > set_graph_function - matches all functions that contains "name": echo *name* > set_graph_function Example: echo mutex* > set_graph_function 0) | mutex_lock_nested() { 0) 0.563 us | __might_sleep(); 0) 2.072 us | } 0) | mutex_unlock() { 0) 1.036 us | __mutex_unlock_slowpath(); 0) 2.433 us | } 0) | mutex_unlock() { 0) 0.691 us | __mutex_unlock_slowpath(); 0) 1.787 us | } 0) | mutex_lock_interruptible_nested() { 0) 0.548 us | __might_sleep(); 0) 1.945 us | } Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 56 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7dd5a2bef9cd..cf59f4c54745 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1895,6 +1895,10 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + /* Nothing, tell g_show to print all functions are enabled */ + if (!ftrace_graph_count && !*pos) + return (void *)1; + p = g_next(m, p, pos); return p; @@ -1913,6 +1917,11 @@ static int g_show(struct seq_file *m, void *v) if (!ptr) return 0; + if (ptr == (unsigned long *)1) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); seq_printf(m, "%s\n", str); @@ -1966,38 +1975,51 @@ ftrace_graph_read(struct file *file, char __user *ubuf, } static int -ftrace_set_func(unsigned long *array, int idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, char *buffer) { - char str[KSYM_SYMBOL_LEN]; struct dyn_ftrace *rec; struct ftrace_page *pg; + int search_len; int found = 0; - int j; + int type, not; + char *search; + bool exists; + int i; if (ftrace_disabled) return -ENODEV; + /* decode regex */ + type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + if (not) + return -EINVAL; + + search_len = strlen(search); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + break; + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - if (strcmp(str, buffer) == 0) { - /* Return 1 if we add it to the array */ - found = 1; - for (j = 0; j < idx; j++) - if (array[j] == rec->ip) { - found = 0; + if (ftrace_match_record(rec, search, search_len, type)) { + /* ensure it is not already in the array */ + exists = false; + for (i = 0; i < *idx; i++) + if (array[i] == rec->ip) { + exists = true; break; } - if (found) - array[idx] = rec->ip; - goto out; + if (!exists) { + array[(*idx)++] = rec->ip; + found = 1; + } } } while_for_each_ftrace_rec(); - out: + mutex_unlock(&ftrace_lock); return found ? 
0 : -EINVAL; @@ -2066,13 +2088,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } buffer[index] = 0; - /* we allow only one at a time */ - ret = ftrace_set_func(array, ftrace_graph_count, buffer); + /* we allow only one expression at a time */ + ret = ftrace_set_func(array, &ftrace_graph_count, buffer); if (ret) goto out; - ftrace_graph_count++; - file->f_pos += read; ret = read; -- cgit v1.2.3-59-g8ed1b From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:35:06 -0500 Subject: ftrace: allow archs to preform pre and post process for code modification This patch creates the weak functions: ftrace_arch_code_modify_prepare and ftrace_arch_code_modify_post_process that are called before and after the stop machine is called to modify the kernel text. If the arch needs to do pre or post processing, it only needs to define these functions. [ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_* over using ftrace_arch_modify_* ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..fdb2a89ae543 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write, /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +int ftrace_arch_code_modify_prepare(void); +int ftrace_arch_code_modify_post_process(void); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e8..72316d9647bd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) return 1; } +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + static int __ftrace_modify_code(void *data) { int *command = data; @@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; -- cgit v1.2.3-59-g8ed1b From 4377245aa93b65b6597e4b7bb460fb9abc48b56b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:41:27 -0500 Subject: ftrace: break out modify loop immediately on detection of error Impact: added precaution on failure detection Break out of the modifying loop as soon as a failure is detected. This is just an added precaution found by code review and was not found by any bug chasing. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72316d9647bd..11ad796ca049 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -561,8 +561,11 @@ static void ftrace_replace_code(int enable) if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } else + } else { ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; + } } } } -- cgit v1.2.3-59-g8ed1b From 5e01cb695d29619dd551bac7d6aa4ef1dc8ebc95 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 24 Feb 2009 13:55:18 +0100 Subject: x86, ftrace: fix section mismatch in hw-branch-tracer Fix an invalid memory reference problem when cpu hotplug support is disabled and the hw-branch-tracer is set as current tracer. Initializing the tracer calls bts_trace_init() which has already been freed at this time. Reported-by: Frederic Weisbecker Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 3561aace075c..3335e807144b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -127,20 +127,18 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int __cpuinit bts_trace_init(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { hw_branch_trace = tr; - register_hotcpu_notifier(&bts_hotcpu_notifier); bts_trace_start(tr); return 0; } -static void __cpuinit bts_trace_reset(struct trace_array *tr) +static void bts_trace_reset(struct trace_array *tr) { bts_trace_stop(tr); - unregister_hotcpu_notifier(&bts_hotcpu_notifier); } static void bts_trace_print_header(struct seq_file *m) @@ -299,6 +297,7 @@ struct tracer bts_tracer __read_mostly = __init static int init_bts_trace(void) { + register_hotcpu_notifier(&bts_hotcpu_notifier); return register_tracer(&bts_tracer); } device_initcall(init_bts_trace); -- cgit v1.2.3-59-g8ed1b From b77e38aa240c3bd9c55c98b9f7c81541e042eae5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 10:21:36 -0500 Subject: tracing: add event trace infrastructure This patch creates the event tracing infrastructure of ftrace. It will create the files: /debug/tracing/available_events /debug/tracing/set_event The available_events will list the trace points that have been registered with the event tracer. set_events will allow the user to enable or disable an event hook. example: # echo sched_wakeup > /debug/tracing/set_event Will enable the sched_wakeup event (if it is registered). # echo "!sched_wakeup" >> /debug/tracing/set_event Will disable the sched_wakeup event (and only that event). # echo > /debug/tracing/set_event Will disable all events (notice the '>') # cat /debug/tracing/available_events > /debug/tracing/set_event Will enable all registered event hooks. 
Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 11 +- kernel/trace/Kconfig | 9 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_events.c | 280 ++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.h | 52 +++++++ 5 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_events.c create mode 100644 kernel/trace/trace_events.h (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c61fab1dd2f8..0add6b28c366 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,6 +61,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_EVENT_TRACER +#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ + *(_ftrace_events) \ + VMLINUX_SYMBOL(__stop_ftrace_events) = .; +#else +#define FTRACE_EVENTS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -81,7 +89,8 @@ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ LIKELY_PROFILE() \ - BRANCH_PROFILE() + BRANCH_PROFILE() \ + FTRACE_EVENTS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 07877f4b5233..999c6a2485df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,6 +159,15 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config EVENT_TRACER + bool "Trace various events in the kernel" + depends on DEBUG_KERNEL + select TRACING + help + This tracer hooks to various trace points in the kernel + allowing the user to pick and choose which trace point they + want to trace. + config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 627090bc262d..c7363568b1cf 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -38,5 +38,6 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_EVENT_TRACER) += trace_events.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 000000000000..05bc80ec8d2c --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,280 @@ +/* + * event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include +#include + +#include "trace_events.h" + +void event_trace_printk(unsigned long ip, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + tracing_record_cmdline(current); + trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); +} + +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + call++; + } +} + +static int ftrace_set_clr_event(char *buf, int set) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (strcmp(buf, call->name) != 0) { + call++; + continue; + } + + if (set) { + /* Already set? */ + if (call->enabled) + return 0; + call->enabled = 1; + call->regfunc(); + } else { + /* Already cleared? 
*/ + if (!call->enabled) + return 0; + call->enabled = 0; + call->unregfunc(); + } + return 0; + } + return -EINVAL; +} + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + size_t read = 0; + int i, set = 1; + ssize_t ret; + char *buf; + char ch; + + if (!cnt || cnt < 0) + return 0; + + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + } + + /* Only white space found? */ + if (isspace(ch)) { + file->f_pos += read; + ret = read; + return ret; + } + + buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (cnt > EVENT_BUF_SIZE) + cnt = EVENT_BUF_SIZE; + + i = 0; + while (cnt && !isspace(ch)) { + if (!i && ch == '!') + set = 0; + else + buf[i++] = ch; + + ret = get_user(ch, ubuf++); + if (ret) + goto out_free; + read++; + cnt--; + } + buf[i] = 0; + + file->f_pos += read; + + ret = ftrace_set_clr_event(buf, set); + if (ret) + goto out_free; + + ret = read; + + out_free: + kfree(buf); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next = call; + + (*pos)++; + + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + m->private = ++next; + + return call; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + return t_next(m, NULL, pos); +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next; + + (*pos)++; + + retry: + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + if (!call->enabled) { + call++; + goto retry; + } + + next = call; + m->private = ++next; + + return call; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + return s_next(m, NULL, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = v; + + seq_printf(m, "%s\n", call->name); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int +ftrace_event_seq_open(struct inode *inode, struct file *file) +{ + int ret; + const struct seq_operations *seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_clear_events(); + + seq_ops = inode->i_private; + ret = seq_open(file, seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = __start_ftrace_events; + } + return ret; +} + +static const struct seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int event_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = 
debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + + entry = debugfs_create_file("set_event", 0644, d_tracer, + (void *)&show_set_event_seq_ops, + &ftrace_set_event_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_event' entry\n"); + + return 0; +} +fs_initcall(event_trace_init); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h new file mode 100644 index 000000000000..39342f86db27 --- /dev/null +++ b/kernel/trace/trace_events.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KERNEL_TRACE_EVENTS_H +#define _LINUX_KERNEL_TRACE_EVENTS_H + +#include +#include "trace.h" + +struct ftrace_event_call { + char *name; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); +}; + + +#undef TPFMT +#define TPFMT(fmt, args...) fmt "\n", ##args + +#undef DEFINE_TRACE_FMT +#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ +} + +void event_trace_printk(unsigned long ip, const char *fmt, ...); +extern unsigned long __start_ftrace_events[]; +extern unsigned long __stop_ftrace_events[]; + +#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ -- cgit v1.2.3-59-g8ed1b From f3fe8e4a38fd19dbb3f8ffb1826aa840ae304a65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 10:22:57 -0500 Subject: tracing: add schedule events to event trace This patch changes the trace/sched.h to use the DECLARE_TRACE_FMT such that they are automatically registered with the event tracer. 
And it also adds the tracing sched headers to kernel/trace/events.c Signed-off-by: Steven Rostedt --- include/trace/sched.h | 49 +------------------------- include/trace/sched_event_types.h | 72 +++++++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/events.c | 13 +++++++ 4 files changed, 87 insertions(+), 48 deletions(-) create mode 100644 include/trace/sched_event_types.h create mode 100644 kernel/trace/events.c (limited to 'kernel') diff --git a/include/trace/sched.h b/include/trace/sched.h index 0d81098ee9fc..4e372a1a29bf 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -4,53 +4,6 @@ #include #include -DECLARE_TRACE(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t)); - -DECLARE_TRACE(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret)); - -DECLARE_TRACE(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); - -DECLARE_TRACE(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success)); - -DECLARE_TRACE(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success)); - -DECLARE_TRACE(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - TPARGS(rq, prev, next)); - -DECLARE_TRACE(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu)); - -DECLARE_TRACE(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p)); - -DECLARE_TRACE(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p)); - -DECLARE_TRACE(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid)); - -DECLARE_TRACE(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child)); - -DECLARE_TRACE(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p)); +#include #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h new file mode 100644 index 000000000000..a4f662940f4e --- /dev/null +++ b/include/trace/sched_event_types.h @@ -0,0 +1,72 @@ + +/* use instead */ +#ifndef DEFINE_TRACE_FMT +# error Do not include this file directly. +# error Unless you know what you are doing. 
+#endif + +DEFINE_TRACE_FMT(sched_kthread_stop, + TPPROTO(struct task_struct *t), + TPARGS(t), + TPFMT("task %s:%d", t->comm, t->pid)); + +DEFINE_TRACE_FMT(sched_kthread_stop_ret, + TPPROTO(int ret), + TPARGS(ret), + TPFMT("ret=%d", ret)); + +DEFINE_TRACE_FMT(sched_wait_task, + TPPROTO(struct rq *rq, struct task_struct *p), + TPARGS(rq, p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_wakeup, + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success), + TPFMT("task %s:%d %s", + p->comm, p->pid, success?"succeeded":"failed")); + +DEFINE_TRACE_FMT(sched_wakeup_new, + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success), + TPFMT("task %s:%d", + p->comm, p->pid, success?"succeeded":"failed")); + +DEFINE_TRACE_FMT(sched_switch, + TPPROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + TPARGS(rq, prev, next), + TPFMT("task %s:%d ==> %s:%d", + prev->comm, prev->pid, next->comm, next->pid)); + +DEFINE_TRACE_FMT(sched_migrate_task, + TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TPARGS(p, orig_cpu, dest_cpu), + TPFMT("task %s:%d from: %d to: %d", + p->comm, p->pid, orig_cpu, dest_cpu)); + +DEFINE_TRACE_FMT(sched_process_free, + TPPROTO(struct task_struct *p), + TPARGS(p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_process_exit, + TPPROTO(struct task_struct *p), + TPARGS(p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_process_wait, + TPPROTO(struct pid *pid), + TPARGS(pid), + TPFMT("pid %d", pid)); + +DEFINE_TRACE_FMT(sched_process_fork, + TPPROTO(struct task_struct *parent, struct task_struct *child), + TPARGS(parent, child), + TPFMT("parent %s:%d child %s:%d", + parent->comm, parent->pid, child->comm, child->pid)); + +DEFINE_TRACE_FMT(sched_signal_send, + TPPROTO(int sig, struct task_struct *p), + TPARGS(sig, p), + TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7363568b1cf..664b6c0dc75a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -39,5 +39,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACER) += events.o libftrace-y := ftrace.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c new file mode 100644 index 000000000000..38c89eef99ee --- /dev/null +++ b/kernel/trace/events.c @@ -0,0 +1,13 @@ +/* + * This is the place to register all trace points as events. + * Include the trace/.h at the top. + * Include the trace/_event_types.h at the bottom. + */ + +/* trace/.h here */ +#include + +#include "trace_events.h" + +/* trace/_event_types.h here */ +#include -- cgit v1.2.3-59-g8ed1b From 1473e4417c79f12d91ef91a469699bfa911f510f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 14:15:08 -0500 Subject: tracing: make event directory structure This patch adds the directory /debug/tracing/events/ that will contain all the registered trace points. 
# ls /debug/tracing/events/ sched_kthread_stop sched_process_fork sched_switch sched_kthread_stop_ret sched_process_free sched_wait_task sched_migrate_task sched_process_wait sched_wakeup sched_process_exit sched_signal_send sched_wakeup_new # ls /debug/tracing/events/sched_switch/ enable # cat /debug/tracing/events/sched_switch/enable 1 # cat /debug/tracing/set_event sched_switch Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 137 ++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_events.h | 7 ++- 2 files changed, 137 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05bc80ec8d2c..3bcb9df93342 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,11 @@ #include "trace_events.h" +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + void event_trace_printk(unsigned long ip, const char *fmt, ...) { va_list ap; @@ -39,15 +44,16 @@ static void ftrace_clear_events(void) static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; + struct ftrace_event_call *call = __start_ftrace_events; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + events_for_each(call) { - if (strcmp(buf, call->name) != 0) { - call++; + if (!call->name) + continue; + + if (strcmp(buf, call->name) != 0) continue; - } if (set) { /* Already set? */ @@ -223,6 +229,67 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) return ret; } +static ssize_t +event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char *buf; + + if (call->enabled) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + if (!call->enabled) + break; + + call->enabled = 0; + call->unregfunc(); + break; + case 1: + if (call->enabled) + break; + + call->enabled = 1; + call->regfunc(); + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -252,10 +319,59 @@ static const struct file_operations ftrace_set_event_fops = { .release = seq_release, }; +static const struct file_operations ftrace_enable_fops = { + .open = tracing_open_generic, + .read = event_enable_read, + .write = event_enable_write, +}; + +static struct dentry *event_trace_events_dir(void) +{ + static struct dentry *d_tracer; + static struct dentry *d_events; + + if (d_events) + return d_events; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + d_events = debugfs_create_dir("events", d_tracer); + if (!d_events) + pr_warning("Could not create debugfs " + "'events' directory\n"); + + return d_events; +} + +static int +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +{ + struct dentry *entry; + + call->dir = debugfs_create_dir(call->name, 
d_events); + if (!call->dir) { + pr_warning("Could not create debugfs " + "'%s' directory\n", call->name); + return -1; + } + + entry = debugfs_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/enable' entry\n", call->name); + + return 0; +} + static __init int event_trace_init(void) { + struct ftrace_event_call *call = __start_ftrace_events; struct dentry *d_tracer; struct dentry *entry; + struct dentry *d_events; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -275,6 +391,17 @@ static __init int event_trace_init(void) pr_warning("Could not create debugfs " "'set_event' entry\n"); + d_events = event_trace_events_dir(); + if (!d_events) + return 0; + + events_for_each(call) { + /* The linker may leave blanks */ + if (!call->name) + continue; + event_create_dir(call, d_events); + } + return 0; } fs_initcall(event_trace_init); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index 39342f86db27..cb8455b3ac9a 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -1,11 +1,13 @@ #ifndef _LINUX_KERNEL_TRACE_EVENTS_H #define _LINUX_KERNEL_TRACE_EVENTS_H +#include #include #include "trace.h" struct ftrace_event_call { char *name; + struct dentry *dir; int enabled; int (*regfunc)(void); void (*unregfunc)(void); @@ -39,6 +41,7 @@ static void ftrace_unreg_event_##call(void) \ } \ \ static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .regfunc = ftrace_reg_event_##call, \ @@ -46,7 +49,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ } void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern unsigned long __start_ftrace_events[]; -extern unsigned long __stop_ftrace_events[]; +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; #endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ -- cgit v1.2.3-59-g8ed1b From 2d542cf34264ac92e9e7ac55c0b096b066d569d2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 08:40:09 +0100 Subject: tracing/hw-branch-tracing: convert bts-tracer mutex to a spinlock Impact: fix CPU hotplug lockup bts_hotcpu_handler() is called with irqs disabled, so using mutex_lock() is a no-no. All the BTS codepaths here are atomic (they do not schedule), so using a spinlock is the right solution. Cc: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 57 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 3335e807144b..7bfdf4c2347f 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -3,17 +3,15 @@ * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 - * */ - -#include -#include +#include +#include #include #include -#include -#include +#include #include #include +#include #include @@ -23,16 +21,17 @@ #define SIZEOF_BTS (1 << 13) -/* The tracer mutex protects the below per-cpu tracer array. - It needs to be held to: - - start tracing on all cpus - - stop tracing on all cpus - - start tracing on a single hotplug cpu - - stop tracing on a single hotplug cpu - - read the trace from all cpus - - read the trace from a single cpu -*/ -static DEFINE_MUTEX(bts_tracer_mutex); +/* + * The tracer lock protects the below per-cpu tracer array. 
+ * It needs to be held to: + * - start tracing on all cpus + * - stop tracing on all cpus + * - start tracing on a single hotplug cpu + * - stop tracing on a single hotplug cpu + * - read the trace from all cpus + * - read the trace from a single cpu + */ +static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); @@ -47,7 +46,7 @@ static struct trace_array *hw_branch_trace __read_mostly; * Start tracing on the current cpu. * The argument is ignored. * - * pre: bts_tracer_mutex must be locked. + * pre: bts_tracer_lock must be locked. */ static void bts_trace_start_cpu(void *arg) { @@ -66,19 +65,19 @@ static void bts_trace_start_cpu(void *arg) static void bts_trace_start(struct trace_array *tr) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); on_each_cpu(bts_trace_start_cpu, NULL, 1); trace_hw_branches_enabled = 1; - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } /* * Stop tracing on the current cpu. * The argument is ignored. * - * pre: bts_tracer_mutex must be locked. + * pre: bts_tracer_lock must be locked. */ static void bts_trace_stop_cpu(void *arg) { @@ -90,12 +89,12 @@ static void bts_trace_stop_cpu(void *arg) static void bts_trace_stop(struct trace_array *tr) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); trace_hw_branches_enabled = 0; on_each_cpu(bts_trace_stop_cpu, NULL, 1); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, @@ -103,7 +102,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); if (!trace_hw_branches_enabled) goto out; @@ -119,7 +118,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, } out: - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -225,7 +224,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. * - * pre: bts_tracer_mutex must be locked + * pre: bts_tracer_lock must be locked */ static void trace_bts_cpu(void *arg) { @@ -261,11 +260,11 @@ out: static void trace_bts_prepare(struct trace_iterator *iter) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); on_each_cpu(trace_bts_cpu, iter->tr, 1); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } static void trace_bts_close(struct trace_iterator *iter) @@ -275,11 +274,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); trace_bts_cpu(hw_branch_trace); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } struct tracer bts_tracer __read_mostly = -- cgit v1.2.3-59-g8ed1b From 886b5b73d71e4027d7dc6c14f5f7ab102201ea6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 11:03:44 +0100 Subject: tracing: remove /debug/tracing/latency_trace Impact: remove old debug/tracing API /debug/tracing/latency_trace is an old legacy format we kept from the old latency tracer. Remove the file for now. If there's any useful bit missing then we'll propagate any useful output bits into the /debug/tracing/trace output. 
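For readers less familiar with the debugfs side of these tracing files, here is a minimal, hypothetical sketch of how such an entry gets registered in the first place; the example_* names are invented for illustration, while debugfs_create_file() and tracing_init_dentry() (from kernel/trace/trace.h) are the helpers actually used throughout this series. The hunk below then simply deletes one such registration:

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>

static ssize_t
example_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
	char buf[] = "example\n";

	/* hand a small, fixed answer back to user space */
	return simple_read_from_buffer(ubuf, cnt, ppos, buf, sizeof(buf) - 1);
}

static const struct file_operations example_fops = {
	.read = example_read,
};

static __init int example_init(void)
{
	struct dentry *d_tracer = tracing_init_dentry();
	struct dentry *entry;

	if (!d_tracer)
		return 0;

	/* removing a file such as latency_trace boils down to dropping this call */
	entry = debugfs_create_file("example", 0444, d_tracer, NULL, &example_fops);
	if (!entry)
		pr_warning("Could not create debugfs 'example' entry\n");

	return 0;
}
fs_initcall(example_init);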
Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e1f3b99a2e52..11ba100f9a9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2938,11 +2938,6 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - entry = debugfs_create_file("latency_trace", 0444, d_tracer, - &global_trace, &tracing_lt_fops); - if (!entry) - pr_warning("Could not create debugfs 'latency_trace' entry\n"); - entry = debugfs_create_file("trace", 0444, d_tracer, &global_trace, &tracing_fops); if (!entry) -- cgit v1.2.3-59-g8ed1b From 15d0d3b3371227f846b9f644547fde081c7e1c0c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 25 Feb 2009 06:22:45 +0100 Subject: generic IPI: simplify barriers and locking Simplify the barriers in generic remote function call interrupt code. Firstly, just unconditionally take the lock and check the list in the generic_call_function_single_interrupt IPI handler. As we've just taken an IPI here, the chances are fairly high that there will be work on the list for us, so do the locking unconditionally. This removes the tricky lockless list_empty check and dubious barriers. The change looks bigger than it is because it is just removing an outer loop. Secondly, clarify architecture specific IPI locking rules. Generic code has no tools to impose any sane ordering on IPIs if they go outside normal cache coherency, ergo the arch code must make them appear to obey cache coherency as a "memory operation" to initiate an IPI, and a "memory operation" to receive one. This way at least they can be reasoned about in generic code, and smp_mb used to provide ordering. The combination of these two changes means that explict barriers can be taken out of queue handling for the single case -- shared data is explicitly locked, and ipi ordering must conform to that, so no barriers needed. An extra barrier is needed in the many handler, so as to ensure we load the list element after the IPI is received. Does any architecture actually *need* these barriers? For the initiator I could see it, but for the handler I would be surprised. So the other thing we could do for simplicity is just to require that, rather than just matching with cache coherency, we just require a full barrier before generating an IPI, and after receiving an IPI. In which case, the smp_mb()s can go away. But just for now, we'll be on the safe side and use the barriers (they're in the slow case anyway). Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds Cc: Jens Axboe Cc: Oleg Nesterov Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/smp.c | 83 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index bbedbb7efe32..6ecf4b9895d4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -74,9 +74,16 @@ static void generic_exec_single(int cpu, struct call_single_data *data) spin_unlock_irqrestore(&dst->lock, flags); /* - * Make the list addition visible before sending the ipi. + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. 
+ * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really equipped + * to do the right thing... */ - smp_mb(); if (ipi) arch_send_call_function_single_ipi(cpu); @@ -103,6 +110,14 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Ensure entry is visible on call_function_queue after we have + * entered the IPI. See comment in smp_call_function_many. + * If we don't have this, then we may miss an entry on the list + * and never get another IPI to process it. + */ + smp_mb(); + /* * It's ok to use list_for_each_rcu() here even though we may delete * 'pos', since list_del_rcu() doesn't clear ->next @@ -154,49 +169,37 @@ void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); LIST_HEAD(list); + unsigned int data_flags; - /* - * Need to see other stores to list head for checking whether - * list is empty without holding q->lock - */ - smp_read_barrier_depends(); - while (!list_empty(&q->list)) { - unsigned int data_flags; - - spin_lock(&q->lock); - list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); - while (!list_empty(&list)) { - struct call_single_data *data; + while (!list_empty(&list)) { + struct call_single_data *data; - data = list_entry(list.next, struct call_single_data, - list); - list_del(&data->list); + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); - /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. - */ - data_flags = data->flags; - - data->func(data->info); - - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_LOCK) { - smp_wmb(); - data->flags &= ~CSD_FLAG_LOCK; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); - } /* - * See comment on outer loop + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. */ - smp_read_barrier_depends(); + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_LOCK) { + smp_wmb(); + data->flags &= ~CSD_FLAG_LOCK; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); } } @@ -375,6 +378,8 @@ void smp_call_function_many(const struct cpumask *mask, /* * Make the list addition visible before sending the ipi. + * (IPIs must obey or appear to obey normal Linux cache coherency + * rules -- see comment in generic_exec_single). */ smp_mb(); -- cgit v1.2.3-59-g8ed1b From b04cc6b1f6398b0e0b60d37e27ce51b4899672ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Feb 2009 03:22:28 +0100 Subject: tracing/core: introduce per cpu tracing files Impact: split up tracing output per cpu Currently, on the tracing debugfs directory, three files are available to the user to let him extracting the trace output: - trace is an iterator through the ring-buffer. It's a reader but not a consumer It doesn't block when no more traces are available. 
- latency_trace is pretty similar to the former, except that it adds more information such as preempt count, irq flags, ... - trace_pipe is a reader and a consumer, it will also block waiting for traces if necessary (heh, yes it's a pipe). The traces coming from different cpus are currently mixed up inside these files. Sometimes it messes up the information, sometimes it's useful, depending on what the tracer captures. The tracing_cpumask file is useful to filter the output and select only the traces captured on a custom defined set of cpus. But it is still not powerful enough to extract one trace buffer per cpu at the same time. So this patch creates a new directory: /debug/tracing/per_cpu/. Inside this directory, you will now find one trace_pipe file and one trace file per cpu. Which means if you have two cpus, you will have: trace0 trace1 trace_pipe0 trace_pipe1 And of course, reading these files will have the same effect as with the usual tracing files, except that you will only see the traces from the given cpu. The original all-in-one cpu trace files are still available in their original place. Until now, only one consumer was allowed on trace_pipe to avoid racy consuming on the ring-buffer. Now the approach has changed a bit: you can have only one consumer per cpu. Which means you are allowed to read trace_pipe0 and trace_pipe1 concurrently, but you can't have two readers on trace_pipe0 or on trace_pipe1. Following the same logic, if there is one reader on the common trace_pipe, you can not at the same time have another reader on trace_pipe0 or on trace_pipe1, because the common trace_pipe is in essence already a consumer of all the cpu buffers. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 168 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 3 + 2 files changed, 147 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 11ba100f9a9e..aa58b7bc847c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -98,6 +98,9 @@ static inline void ftrace_enable_cpu(void) static cpumask_var_t __read_mostly tracing_buffer_mask; +/* Define which cpu buffers are currently read in trace_pipe */ +static cpumask_var_t tracing_reader_cpumask; + #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) @@ -1195,10 +1198,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int cpu; + /* + * If we are in a per_cpu trace file, don't bother by iterating over + * all cpu and peek directly. 
+ */ + if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (ring_buffer_empty_cpu(buffer, cpu_file)) + return NULL; + ent = peek_next_entry(iter, cpu_file, ent_ts); + if (ent_cpu) + *ent_cpu = cpu_file; + + return ent; + } + for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) @@ -1279,6 +1297,7 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; @@ -1299,9 +1318,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_disable_cpu(); - for_each_tracing_cpu(cpu) { - ring_buffer_iter_reset(iter->buffer_iter[cpu]); - } + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) + ring_buffer_iter_reset(iter->buffer_iter[cpu]); + } else + ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); + ftrace_enable_cpu(); @@ -1653,6 +1675,7 @@ static struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { + long cpu_file = (long) inode->i_private; struct trace_iterator *iter; struct seq_file *m; int cpu; @@ -1672,9 +1695,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (current_trace && current_trace->print_max) iter->tr = &max_tr; else - iter->tr = inode->i_private; + iter->tr = &global_trace; iter->trace = current_trace; iter->pos = -1; + iter->cpu_file = cpu_file; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) @@ -1684,14 +1708,22 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { - for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + } else { + cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_start(iter->tr->buffer, cpu); if (!iter->buffer_iter[cpu]) - goto fail_buffer; + goto fail; } /* TODO stop tracer */ @@ -1715,6 +1747,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } +fail: mutex_unlock(&trace_types_lock); kfree(iter); @@ -2325,54 +2358,77 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } -static atomic_t tracing_reader; - static int tracing_open_pipe(struct inode *inode, struct file *filp) { + long cpu_file = (long) inode->i_private; struct trace_iterator *iter; + int ret = 0; if (tracing_disabled) return -ENODEV; - /* We only allow for reader of the pipe */ - if (atomic_inc_return(&tracing_reader) != 1) { - atomic_dec(&tracing_reader); - return -EBUSY; + mutex_lock(&trace_types_lock); + + /* We only allow one reader per cpu */ + if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (!cpumask_empty(tracing_reader_cpumask)) { + ret = -EBUSY; + goto out; + } + cpumask_setall(tracing_reader_cpumask); + } else { + if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) + cpumask_set_cpu(cpu_file, tracing_reader_cpumask); + else { + ret = -EBUSY; + goto out; + } } /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; + if (!iter) { + ret = -ENOMEM; + goto out; + } if 
(!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { kfree(iter); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - mutex_lock(&trace_types_lock); - /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + iter->cpu_file = cpu_file; iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; if (iter->trace->pipe_open) iter->trace->pipe_open(iter); - mutex_unlock(&trace_types_lock); - return 0; +out: + mutex_unlock(&trace_types_lock); + return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + mutex_lock(&trace_types_lock); + + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) + cpumask_clear(tracing_reader_cpumask); + else + cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + mutex_unlock(&trace_types_lock); + free_cpumask_var(iter->started); kfree(iter); - atomic_dec(&tracing_reader); return 0; } @@ -2911,6 +2967,59 @@ struct dentry *tracing_init_dentry(void) return d_tracer; } +static struct dentry *d_percpu; + +struct dentry *tracing_dentry_percpu(void) +{ + static int once; + struct dentry *d_tracer; + + if (d_percpu) + return d_percpu; + + d_tracer = tracing_init_dentry(); + + if (!d_tracer) + return NULL; + + d_percpu = debugfs_create_dir("per_cpu", d_tracer); + + if (!d_percpu && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'per_cpu'\n"); + return NULL; + } + + return d_percpu; +} + +static void tracing_init_debugfs_percpu(long cpu) +{ + struct dentry *d_percpu = tracing_dentry_percpu(); + struct dentry *entry; + /* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */ + char filename[17]; + + if (cpu > 999 || cpu < 0) + return; + + /* per cpu trace_pipe */ + sprintf(filename, "trace_pipe%ld", cpu); + + entry = debugfs_create_file(filename, 0444, d_percpu, + (void *) cpu, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs '%s' entry\n", filename); + + /* per cpu trace */ + sprintf(filename, "trace%ld", cpu); + + entry = debugfs_create_file(filename, 0444, d_percpu, + (void *) cpu, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs '%s' entry\n", filename); +} + #ifdef CONFIG_FTRACE_SELFTEST /* Let selftest have access to static functions in this file */ #include "trace_selftest.c" @@ -2920,6 +3029,7 @@ static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; struct dentry *entry; + int cpu; d_tracer = tracing_init_dentry(); @@ -2939,7 +3049,7 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); entry = debugfs_create_file("trace", 0444, d_tracer, - &global_trace, &tracing_fops); + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -2970,8 +3080,8 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'README' entry\n"); - entry = debugfs_create_file("trace_pipe", 0644, d_tracer, - NULL, &tracing_pipe_fops); + entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); if (!entry) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); @@ -2999,6 +3109,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(cpu); + return 0; } @@ -3222,8 +3336,12 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, 
GFP_KERNEL)) goto out_free_buffer_mask; + if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + goto out_free_tracing_cpumask; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, @@ -3272,6 +3390,8 @@ __init static int tracer_alloc_buffers(void) ret = 0; out_free_cpumask: + free_cpumask_var(tracing_reader_cpumask); +out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index eed732c151fc..508235a39da6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,8 @@ struct trace_seq { unsigned int readpos; }; +#define TRACE_PIPE_ALL_CPU -1 + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: @@ -404,6 +406,7 @@ struct trace_iterator { struct tracer *trace; void *private; struct ring_buffer_iter *buffer_iter[NR_CPUS]; + int cpu_file; /* The below is zeroed out in pipe_read */ struct trace_seq seq; -- cgit v1.2.3-59-g8ed1b From d7350c3f45694104e820041969c8185c5f99e57c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Feb 2009 06:13:16 +0100 Subject: tracing/core: make the read callbacks reentrant Now that several per-cpu files can be read or spliced at the same time, we want the read/splice callbacks for tracing files to be reentrant. Until now, a single global mutex (trace_types_lock) serialized access to tracing_read_pipe(), tracing_splice_read_pipe(), and the seq helpers. I.e. it means that if a user tries to read trace_pipe0 and trace_pipe1 at the same time, access to the function tracing_read_pipe() is contended and one reader must wait for the other to finish its read call. The trace_types_lock mutex is mostly here to serialize access to the global current tracer (current_trace), which can be changed concurrently. Although the iter struct keeps a private pointer to this tracer, its callbacks can be changed by another function. The method used here is to no longer keep a private reference to the tracer inside the iterator but to keep a copy of it there instead. Then, on subsequent read calls, it checks whether the tracer has changed. This is not costly because the current tracer is not expected to change often, so we use a branch prediction hint for that. Moreover, we add a private mutex to the iterator (there is one iterator per file descriptor) to serialize accesses in case of multiple consumers per file descriptor (which would be a silly idea from the user). Note that this is not to protect the ring buffer, since the ring buffer already serializes reader accesses. This is to prevent trace weirdness in case of concurrent consumers. But these mutexes can be dropped anyway, that would not result in any crash. Just tell me what you think about it. 
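As a condensed illustration of the scheme just described (not the exact patch code), the read path boils down to the sketch below; example_pipe_read is an invented name, while trace_types_lock, current_trace, iter->trace and iter->mutex are the real objects this patch touches:

static ssize_t
example_pipe_read(struct file *filp, char __user *ubuf, size_t cnt,
		  loff_t *ppos)
{
	struct trace_iterator *iter = filp->private_data;
	static struct tracer *old_tracer;
	ssize_t ret = 0;

	/* refresh the iterator's private tracer copy only when it changed */
	mutex_lock(&trace_types_lock);
	if (unlikely(old_tracer != current_trace && current_trace)) {
		old_tracer = current_trace;
		*iter->trace = *current_trace;
	}
	mutex_unlock(&trace_types_lock);

	/*
	 * Serialize concurrent consumers of this file descriptor only;
	 * readers of other per-cpu files hold their own iter->mutex and
	 * only took trace_types_lock briefly above, so they do not wait
	 * on each other for the whole read.
	 */
	mutex_lock(&iter->mutex);
	/* ... consume from the ring buffer through iter here ... */
	mutex_unlock(&iter->mutex);

	return ret;
}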
Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 3 +- 2 files changed, 85 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aa58b7bc847c..d8d899f3bd6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1294,20 +1294,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return ent; } +/* + * No necessary locking here. The worst thing which can + * happen is loosing events consumed at the same time + * by a trace_pipe reader. + * Other than that, we don't risk to crash the ring buffer + * because it serializes the readers. + * + * The current tracer is copied to avoid a global locking + * all around. + */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - - if (!current_trace || current_trace != iter->trace) { - mutex_unlock(&trace_types_lock); - return NULL; + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; } + mutex_unlock(&trace_types_lock); atomic_inc(&trace_record_cmdline_disabled); @@ -1341,7 +1353,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); - mutex_unlock(&trace_types_lock); } static void print_lat_help_header(struct seq_file *m) @@ -1691,13 +1702,25 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) goto out; } + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. + */ mutex_lock(&trace_types_lock); + iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) { + *ret = -ENOMEM; + goto fail; + } + if (current_trace) + *iter->trace = *current_trace; + if (current_trace && current_trace->print_max) iter->tr = &max_tr; else iter->tr = &global_trace; - iter->trace = current_trace; iter->pos = -1; + mutex_init(&iter->mutex); iter->cpu_file = cpu_file; /* Notify the tracer early; before we stop tracing. */ @@ -1747,8 +1770,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } -fail: + fail: mutex_unlock(&trace_types_lock); + kfree(iter->trace); kfree(iter); return ERR_PTR(-ENOMEM); @@ -1783,6 +1807,8 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); seq_release(inode, file); + mutex_destroy(&iter->mutex); + kfree(iter->trace); kfree(iter); return 0; } @@ -2392,10 +2418,21 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) goto out; } + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. 
+ */ + iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) { + ret = -ENOMEM; + goto fail; + } + if (current_trace) + *iter->trace = *current_trace; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { - kfree(iter); ret = -ENOMEM; - goto out; + goto fail; } /* trace pipe does not show start of buffer */ @@ -2403,7 +2440,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->cpu_file = cpu_file; iter->tr = &global_trace; - iter->trace = current_trace; + mutex_init(&iter->mutex); filp->private_data = iter; if (iter->trace->pipe_open) @@ -2412,6 +2449,12 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) out: mutex_unlock(&trace_types_lock); return ret; + +fail: + kfree(iter->trace); + kfree(iter); + mutex_unlock(&trace_types_lock); + return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) @@ -2428,6 +2471,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + mutex_destroy(&iter->mutex); + kfree(iter->trace); kfree(iter); return 0; @@ -2497,18 +2542,15 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); iter->trace->wait_pipe(iter); - mutex_lock(&trace_types_lock); + mutex_lock(&iter->mutex); if (signal_pending(current)) return -EINTR; - if (iter->trace != current_trace) - return 0; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -2533,6 +2575,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; + static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -2542,7 +2585,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_reset(&iter->seq); + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. 
+ */ + mutex_lock(&iter->mutex); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -2599,7 +2655,7 @@ waitagain: goto waitagain; out: - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); return sret; } @@ -2676,11 +2732,20 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; + static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + mutex_lock(&iter->mutex); if (iter->trace->splice_read) { ret = iter->trace->splice_read(iter, filp, @@ -2720,14 +2785,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_reset(&iter->seq); } - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); spd.nr_pages = i; return splice_to_pipe(pipe, &spd); out_err: - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 508235a39da6..632191770aac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -405,8 +405,9 @@ struct trace_iterator { struct trace_array *tr; struct tracer *trace; void *private; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; /* The below is zeroed out in pipe_read */ struct trace_seq seq; -- cgit v1.2.3-59-g8ed1b From 8969a5ede0f9e17da4b943712429aef2c9bcd82b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:47 +0100 Subject: generic-ipi: remove kmalloc() Remove the use of kmalloc() from the smp_call_function_*() calls. Steven's generic-ipi patch (d7240b98: generic-ipi: use per cpu data for single cpu ipi calls) started the discussion on the use of kmalloc() in this code and fixed the smp_call_function_single(.wait=0) fallback case. In this patch we complete this by also providing means for the _many() call, which fully removes the need for kmalloc() in this code. The problem with the _many() call is that other cpus might still be observing our entry when we're done with it. It solved this by dynamically allocating data elements and RCU-freeing it. We solve it by using a single per-cpu entry which provides static storage and solves one half of the problem (avoiding referencing freed data). The other half, ensuring the queue iteration it still possible, is done by placing re-used entries at the head of the list. This means that if someone was still iterating that entry when it got moved, he will now re-visit the entries on the list he had already seen, but avoids skipping over entries like would have happened had we placed the new entry at the end. Furthermore, visiting entries twice is not a problem, since we remove our cpu from the entry's cpumask once its called. Many thanks to Oleg for his suggestions and him poking holes in my earlier attempts. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. 
McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/smp.c | 264 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 166 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 6ecf4b9895d4..7a0ce25829dc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -10,23 +10,28 @@ #include #include #include +#include static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +static struct { + struct list_head queue; + spinlock_t lock; +} call_function __cacheline_aligned_in_smp = { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), +}; enum { CSD_FLAG_WAIT = 0x01, - CSD_FLAG_ALLOC = 0x02, - CSD_FLAG_LOCK = 0x04, + CSD_FLAG_LOCK = 0x02, }; struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - struct rcu_head rcu_head; - unsigned long cpumask_bits[]; + cpumask_var_t cpumask; }; struct call_single_queue { @@ -34,8 +39,45 @@ struct call_single_queue { spinlock_t lock; }; +static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { + .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), +}; + +static int +hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return NOTIFY_BAD; + break; + +#ifdef CONFIG_CPU_HOTPLUG + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_cpumask_var(cfd->cpumask); + break; +#endif + }; + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { + .notifier_call = hotplug_cfd, +}; + static int __cpuinit init_call_single_data(void) { + void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) { @@ -44,18 +86,69 @@ static int __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + + hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); + register_cpu_notifier(&hotplug_cfd_notifier); + return 0; } early_initcall(init_call_single_data); -static void csd_flag_wait(struct call_single_data *data) +/* + * csd_wait/csd_complete are used for synchronous ipi calls + */ +static void csd_wait_prepare(struct call_single_data *data) { - /* Wait for response */ - do { - if (!(data->flags & CSD_FLAG_WAIT)) - break; + data->flags |= CSD_FLAG_WAIT; +} + +static void csd_complete(struct call_single_data *data) +{ + if (data->flags & CSD_FLAG_WAIT) { + /* + * ensure we're all done before saying we are + */ + smp_mb(); + data->flags &= ~CSD_FLAG_WAIT; + } +} + +static void csd_wait(struct call_single_data *data) +{ + while (data->flags & CSD_FLAG_WAIT) cpu_relax(); - } while (1); +} + +/* + * csd_lock/csd_unlock used to serialize access to per-cpu csd resources + * + * For non-synchronous ipi calls the csd can still be in use by the previous + * function call. For multi-cpu calls its even more interesting as we'll have + * to ensure no other cpu is observing our csd. 
+ */ +static void csd_lock(struct call_single_data *data) +{ + while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); + data->flags = CSD_FLAG_LOCK; + + /* + * prevent CPU from reordering the above assignment to ->flags + * with any subsequent assignments to other fields of the + * specified call_single_data structure. + */ + + smp_mb(); +} + +static void csd_unlock(struct call_single_data *data) +{ + WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + /* + * ensure we're all done before releasing data + */ + smp_mb(); + data->flags &= ~CSD_FLAG_LOCK; } /* @@ -89,16 +182,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ - struct call_function_data *data; - - data = container_of(head, struct call_function_data, rcu_head); - - kfree(data); + csd_wait(data); } /* @@ -122,41 +206,35 @@ void generic_smp_call_function_interrupt(void) * It's ok to use list_for_each_rcu() here even though we may delete * 'pos', since list_del_rcu() doesn't clear ->next */ - rcu_read_lock(); - list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) + spin_lock(&data->lock); + if (!cpumask_test_cpu(cpu, data->cpumask)) { + spin_unlock(&data->lock); continue; + } + cpumask_clear_cpu(cpu, data->cpumask); + spin_unlock(&data->lock); data->csd.func(data->csd.info); spin_lock(&data->lock); - cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); - data->refs--; - refs = data->refs; + refs = --data->refs; + if (!refs) { + spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function.lock); + } spin_unlock(&data->lock); if (refs) continue; - spin_lock(&call_function_lock); - list_del_rcu(&data->csd.list); - spin_unlock(&call_function_lock); - - if (data->csd.flags & CSD_FLAG_WAIT) { - /* - * serialize stores to data with the flag clear - * and wakeup - */ - smp_wmb(); - data->csd.flags &= ~CSD_FLAG_WAIT; - } - if (data->csd.flags & CSD_FLAG_ALLOC) - call_rcu(&data->rcu_head, rcu_free_call_data); + csd_complete(&data->csd); + csd_unlock(&data->csd); } - rcu_read_unlock(); put_cpu(); } @@ -192,14 +270,14 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_LOCK) { - smp_wmb(); - data->flags &= ~CSD_FLAG_LOCK; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); + if (data_flags & CSD_FLAG_WAIT) + csd_complete(data); + + /* + * Unlocked CSDs are valid through generic_exec_single() + */ + if (data_flags & CSD_FLAG_LOCK) + csd_unlock(data); } } @@ -218,7 +296,9 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { - struct call_single_data d; + struct call_single_data d = { + .flags = 0, + }; unsigned long flags; /* prevent preemption and reschedule on another processor, as well as CPU removal */ @@ -239,13 +319,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, /* * We are calling a function on a single CPU * and we are not going to wait for it to finish. - * We first try to allocate the data, but if we - * fail, we fall back to use a per cpu data to pass - * the information to that CPU. 
Since all callers - * of this code will use the same data, we must - * synchronize the callers to prevent a new caller - * from corrupting the data before the callee - * can access it. + * We use a per cpu data to pass the information to + * that CPU. Since all callers of this code will + * use the same data, we must synchronize the + * callers to prevent a new caller from corrupting + * the data before the callee can access it. * * The CSD_FLAG_LOCK is used to let us know when * the IPI handler is done with the data. @@ -255,18 +333,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, * will make sure the callee is done with the * data before a new caller will use it. */ - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->flags = CSD_FLAG_ALLOC; - else { - data = &per_cpu(csd_data, me); - while (data->flags & CSD_FLAG_LOCK) - cpu_relax(); - data->flags = CSD_FLAG_LOCK; - } + data = &__get_cpu_var(csd_data); + csd_lock(data); } else { data = &d; - data->flags = CSD_FLAG_WAIT; + csd_wait_prepare(data); } data->func = func; @@ -326,14 +397,14 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu; + int cpu, next_cpu, me = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == smp_processor_id()) + if (cpu == me) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); /* No online cpus? We're done. */ if (cpu >= nr_cpu_ids) @@ -341,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* Do we have another CPU which isn't us? */ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == smp_processor_id()) + if (next_cpu == me) next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); /* Fastpath: do that cpu by itself. */ @@ -350,31 +421,28 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); - if (unlikely(!data)) { - /* Slow path. */ - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - if (cpumask_test_cpu(cpu, mask)) - smp_call_function_single(cpu, func, info, wait); - } - return; - } + data = &__get_cpu_var(cfd_data); + csd_lock(&data->csd); - spin_lock_init(&data->lock); - data->csd.flags = CSD_FLAG_ALLOC; + spin_lock_irqsave(&data->lock, flags); if (wait) - data->csd.flags |= CSD_FLAG_WAIT; + csd_wait_prepare(&data->csd); + data->csd.func = func; data->csd.info = info; - cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); - data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(me, data->cpumask); + data->refs = cpumask_weight(data->cpumask); - spin_lock_irqsave(&call_function_lock, flags); - list_add_tail_rcu(&data->csd.list, &call_function_queue); - spin_unlock_irqrestore(&call_function_lock, flags); + spin_lock(&call_function.lock); + /* + * Place entry at the _HEAD_ of the list, so that any cpu still + * observing the entry in generic_smp_call_function_interrupt() will + * not miss any other list entries. + */ + list_add_rcu(&data->csd.list, &call_function.queue); + spin_unlock(&call_function.lock); + spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. 
@@ -384,11 +452,11 @@ void smp_call_function_many(const struct cpumask *mask, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(data->cpumask); /* optionally wait for the CPUs to complete */ if (wait) - csd_flag_wait(&data->csd); + csd_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); @@ -418,20 +486,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function_lock); + spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function_lock); + spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function_lock); + spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function_lock); + spin_unlock_irq(&call_function.lock); } -- cgit v1.2.3-59-g8ed1b From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:48 +0100 Subject: generic-ipi: remove CSD_FLAG_WAIT Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework the code so that we can use CSD_FLAG_LOCK for both purposes. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- block/blk-softirq.c | 2 +- include/linux/smp.h | 3 +- kernel/sched.c | 2 +- kernel/smp.c | 90 ++++++++++++++--------------------------------------- kernel/softirq.c | 2 +- 5 files changed, 28 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce0efc6b26dc..ee9c21602228 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + __smp_call_function_single(cpu, data, 0); return 0; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 715196b09d67..00866d7fdf34 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -82,7 +82,8 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -void __smp_call_function_single(int cpuid, struct call_single_data *data); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); /* * Generic and arch helpers diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..d4c2749a2998 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index 7a0ce25829dc..f5308258891a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,8 +23,7 @@ static struct { }; enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_LOCK = 0x02, + CSD_FLAG_LOCK = 0x01, }; struct call_function_data { @@ -94,31 +93,6 @@ static int __cpuinit init_call_single_data(void) } early_initcall(init_call_single_data); -/* - * csd_wait/csd_complete are used for synchronous ipi calls - */ -static void csd_wait_prepare(struct call_single_data *data) -{ - data->flags |= CSD_FLAG_WAIT; -} - -static void csd_complete(struct call_single_data *data) -{ - if (data->flags & CSD_FLAG_WAIT) { - /* - * ensure we're all done before saying we are - */ 
- smp_mb(); - data->flags &= ~CSD_FLAG_WAIT; - } -} - -static void csd_wait(struct call_single_data *data) -{ - while (data->flags & CSD_FLAG_WAIT) - cpu_relax(); -} - /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -126,10 +100,15 @@ static void csd_wait(struct call_single_data *data) * function call. For multi-cpu calls its even more interesting as we'll have * to ensure no other cpu is observing our csd. */ -static void csd_lock(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *data) { while (data->flags & CSD_FLAG_LOCK) cpu_relax(); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); data->flags = CSD_FLAG_LOCK; /* @@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data) * Insert a previously allocated call_single_data element for execution * on the given CPU. data must already have ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; unsigned long flags; + int ipi; spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); @@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_wait(data); + csd_lock_wait(data); } /* @@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void) if (refs) continue; - csd_complete(&data->csd); csd_unlock(&data->csd); } @@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) - csd_complete(data); - /* * Unlocked CSDs are valid through generic_exec_single() */ @@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data; + struct call_single_data *data = &d; - if (!wait) { - /* - * We are calling a function on a single CPU - * and we are not going to wait for it to finish. - * We use a per cpu data to pass the information to - * that CPU. Since all callers of this code will - * use the same data, we must synchronize the - * callers to prevent a new caller from corrupting - * the data before the callee can access it. - * - * The CSD_FLAG_LOCK is used to let us know when - * the IPI handler is done with the data. - * The first caller will set it, and the callee - * will clear it. The next caller must wait for - * it to clear before we set it again. This - * will make sure the callee is done with the - * data before a new caller will use it. - */ + if (!wait) data = &__get_cpu_var(csd_data); - csd_lock(data); - } else { - data = &d; - csd_wait_prepare(data); - } + + csd_lock(data); data->func = func; data->info = info; - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single); * instance. 
* */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) { + csd_lock(data); + /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + WARN_ON(wait && irqs_disabled()); - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } /* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ @@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask, csd_lock(&data->csd); spin_lock_irqsave(&data->lock, flags); - if (wait) - csd_wait_prepare(&data->csd); - data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); @@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* optionally wait for the CPUs to complete */ if (wait) - csd_wait(&data->csd); + csd_lock_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..48c3d5d627a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir cp->flags = 0; cp->priv = softirq; - __smp_call_function_single(cpu, cp); + __smp_call_function_single(cpu, cp, 0); return 0; } return 1; -- cgit v1.2.3-59-g8ed1b From 0b13fda1e0936b3d64c4c407f183d33fa6bd2ad4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 16:52:11 +0100 Subject: generic-ipi: cleanups Andrew pointed out that there's some small amount of style rot in kernel/smp.c. Clean it up. Reported-by: Andrew Morton Cc: Nick Piggin Cc: Jens Axboe Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/smp.c | 162 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 86 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index f5308258891a..7ad2262d2eca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -2,13 +2,12 @@ * Generic helpers for smp ipi calls * * (C) Jens Axboe 2008 - * */ -#include -#include -#include #include #include +#include +#include +#include #include #include @@ -17,29 +16,30 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; spinlock_t lock; -} call_function __cacheline_aligned_in_smp = { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), -}; +} call_function __cacheline_aligned_in_smp = + { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + }; enum { CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - spinlock_t lock; - unsigned int refs; - cpumask_var_t cpumask; + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_var_t cpumask; }; struct call_single_queue { - struct list_head list; - spinlock_t lock; + struct list_head list; + spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), + .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), }; static int @@ -71,7 +71,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) } static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, + .notifier_call = hotplug_cfd, }; static int __cpuinit init_call_single_data(void) @@ -96,9 +96,9 @@ early_initcall(init_call_single_data); /* * 
csd_lock/csd_unlock used to serialize access to per-cpu csd resources * - * For non-synchronous ipi calls the csd can still be in use by the previous - * function call. For multi-cpu calls its even more interesting as we'll have - * to ensure no other cpu is observing our csd. + * For non-synchronous ipi calls the csd can still be in use by the + * previous function call. For multi-cpu calls its even more interesting + * as we'll have to ensure no other cpu is observing our csd. */ static void csd_lock_wait(struct call_single_data *data) { @@ -112,27 +112,29 @@ static void csd_lock(struct call_single_data *data) data->flags = CSD_FLAG_LOCK; /* - * prevent CPU from reordering the above assignment to ->flags - * with any subsequent assignments to other fields of the - * specified call_single_data structure. + * prevent CPU from reordering the above assignment + * to ->flags with any subsequent assignments to other + * fields of the specified call_single_data structure: */ - smp_mb(); } static void csd_unlock(struct call_single_data *data) { WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + /* - * ensure we're all done before releasing data + * ensure we're all done before releasing data: */ smp_mb(); + data->flags &= ~CSD_FLAG_LOCK; } /* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. data must already have ->func, ->info, and ->flags set. + * Insert a previously allocated call_single_data element + * for execution on the given CPU. data must already have + * ->func, ->info, and ->flags set. */ static void generic_exec_single(int cpu, struct call_single_data *data, int wait) @@ -154,10 +156,9 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really equipped - * to do the right thing... + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... */ - if (ipi) arch_send_call_function_single_ipi(cpu); @@ -183,8 +184,8 @@ void generic_smp_call_function_interrupt(void) smp_mb(); /* - * It's ok to use list_for_each_rcu() here even though we may delete - * 'pos', since list_del_rcu() doesn't clear ->next + * It's ok to use list_for_each_rcu() here even though we may + * delete 'pos', since list_del_rcu() doesn't clear ->next */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; @@ -219,14 +220,14 @@ void generic_smp_call_function_interrupt(void) } /* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. + * Invoked by arch to handle an IPI for call function single. Must be + * called from the arch with interrupts disabled. 
*/ void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); unsigned int data_flags; + LIST_HEAD(list); spin_lock(&q->lock); list_replace_init(&q->list, &list); @@ -235,22 +236,20 @@ void generic_smp_call_function_single_interrupt(void) while (!list_empty(&list)) { struct call_single_data *data; - data = list_entry(list.next, struct call_single_data, - list); + data = list_entry(list.next, struct call_single_data, list); list_del(&data->list); /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. + * 'data' can be invalid after this call if flags == 0 + * (when called through generic_exec_single()), + * so save them away before making the call: */ data_flags = data->flags; data->func(data->info); /* - * Unlocked CSDs are valid through generic_exec_single() + * Unlocked CSDs are valid through generic_exec_single(): */ if (data_flags & CSD_FLAG_LOCK) csd_unlock(data); @@ -276,34 +275,41 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, .flags = 0, }; unsigned long flags; - /* prevent preemption and reschedule on another processor, - as well as CPU removal */ - int me = get_cpu(); + int this_cpu; int err = 0; + /* + * prevent preemption and reschedule on another processor, + * as well as CPU removal + */ + this_cpu = get_cpu(); + /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - if (cpu == me) { + if (cpu == this_cpu) { local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; + } else { + if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { + struct call_single_data *data = &d; - if (!wait) - data = &__get_cpu_var(csd_data); + if (!wait) + data = &__get_cpu_var(csd_data); - csd_lock(data); + csd_lock(data); - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); - } else { - err = -ENXIO; /* CPU not online */ + data->func = func; + data->info = info; + generic_exec_single(cpu, data, wait); + } else { + err = -ENXIO; /* CPU not online */ + } } put_cpu(); + return err; } EXPORT_SYMBOL(smp_call_function_single); @@ -313,10 +319,9 @@ EXPORT_SYMBOL(smp_call_function_single); * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. Useful for embedding @data inside other structures, for - * instance. - * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. */ void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) @@ -329,10 +334,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +/* Deprecated: shim for archs using old arch_send_call_function_ipi API. 
*/ + #ifndef arch_send_call_function_ipi_mask -#define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) +# define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) #endif /** @@ -340,7 +346,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since @@ -351,27 +358,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, - bool wait) + void (*func)(void *), void *info, bool wait) { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, me = smp_processor_id(); + int cpu, next_cpu, this_cpu = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - /* So, what's a CPU they want? Ignoring this one. */ + /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == me) + if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ if (cpu >= nr_cpu_ids) return; /* Do we have another CPU which isn't us? */ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == me) + if (next_cpu == this_cpu) next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); /* Fastpath: do that cpu by itself. */ @@ -387,30 +394,31 @@ void smp_call_function_many(const struct cpumask *mask, data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(me, data->cpumask); + cpumask_clear_cpu(this_cpu, data->cpumask); data->refs = cpumask_weight(data->cpumask); spin_lock(&call_function.lock); /* * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() will - * not miss any other list entries. + * observing the entry in generic_smp_call_function_interrupt() + * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); spin_unlock(&call_function.lock); + spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache coherency - * rules -- see comment in generic_exec_single). + * (IPIs must obey or appear to obey normal Linux cache + * coherency rules -- see comment in generic_exec_single). */ smp_mb(); /* Send a message to all CPUs in the map */ arch_send_call_function_ipi_mask(data->cpumask); - /* optionally wait for the CPUs to complete */ + /* Optionally wait for the CPUs to complete */ if (wait) csd_lock_wait(&data->csd); } @@ -420,7 +428,8 @@ EXPORT_SYMBOL(smp_call_function_many); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. 
+ * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * Returns 0. * @@ -436,6 +445,7 @@ int smp_call_function(void (*func)(void *), void *info, int wait) preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); + return 0; } EXPORT_SYMBOL(smp_call_function); -- cgit v1.2.3-59-g8ed1b From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:49:52 -0500 Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT There's been a bit confusion to whether DEFINE/DECLARE_TRACE_FMT should be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it TRACE_FORMAT. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/sched_event_types.h | 26 +++++++++++++------------- kernel/trace/trace_events.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 34ae464effff..3de09fa8e01d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } -#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ +#define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a4f662940f4e..a3d3d66a51c8 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,72 +1,72 @@ /* use instead */ -#ifndef DEFINE_TRACE_FMT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. 
#endif -DEFINE_TRACE_FMT(sched_kthread_stop, +TRACE_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), TPFMT("task %s:%d", t->comm, t->pid)); -DEFINE_TRACE_FMT(sched_kthread_stop_ret, +TRACE_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), TPFMT("ret=%d", ret)); -DEFINE_TRACE_FMT(sched_wait_task, +TRACE_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_wakeup, +TRACE_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_wakeup_new, +TRACE_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_switch, +TRACE_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid)); -DEFINE_TRACE_FMT(sched_migrate_task, +TRACE_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu)); -DEFINE_TRACE_FMT(sched_process_free, +TRACE_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_exit, +TRACE_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_wait, +TRACE_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), TPFMT("pid %d", pid)); -DEFINE_TRACE_FMT(sched_process_fork, +TRACE_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid)); -DEFINE_TRACE_FMT(sched_signal_send, +TRACE_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index cb8455b3ac9a..deb95e5006c8 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -17,8 +17,8 @@ struct ftrace_event_call { #undef TPFMT #define TPFMT(fmt, args...) fmt "\n", ##args -#undef DEFINE_TRACE_FMT -#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -- cgit v1.2.3-59-g8ed1b From 8656e7a2fa6afcd8682990f804a2a9674568738f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Feb 2009 00:41:38 +0100 Subject: tracing/core: make the per cpu trace files in per cpu directories Impact: restructure the VFS layout of per CPU trace buffers The per cpu trace files are all in a single directory: /debug/tracing/per_cpu. In case of a large number of cpu, the content of this directory becomes messy so we create now one directory per cpu inside /debug/tracing/per_cpu which contain each their own trace_pipe and trace files. 
Ie: /debug/tracing$ ls -R per_cpu per_cpu: cpu0 cpu1 per_cpu/cpu0: trace trace_pipe per_cpu/cpu1: trace trace_pipe Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d8d899f3bd6f..bdaf60d3d337 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3061,28 +3061,31 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry; - /* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */ - char filename[17]; + struct dentry *entry, *d_cpu; + /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ + char cpu_dir[7]; if (cpu > 999 || cpu < 0) return; - /* per cpu trace_pipe */ - sprintf(filename, "trace_pipe%ld", cpu); + sprintf(cpu_dir, "cpu%ld", cpu); + d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + if (!d_cpu) { + pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + return; + } - entry = debugfs_create_file(filename, 0444, d_percpu, + /* per cpu trace_pipe */ + entry = debugfs_create_file("trace_pipe", 0444, d_cpu, (void *) cpu, &tracing_pipe_fops); if (!entry) - pr_warning("Could not create debugfs '%s' entry\n", filename); + pr_warning("Could not create debugfs 'trace_pipe' entry\n"); /* per cpu trace */ - sprintf(filename, "trace%ld", cpu); - - entry = debugfs_create_file(filename, 0444, d_percpu, + entry = debugfs_create_file("trace", 0444, d_cpu, (void *) cpu, &tracing_fops); if (!entry) - pr_warning("Could not create debugfs '%s' entry\n", filename); + pr_warning("Could not create debugfs 'trace' entry\n"); } #ifdef CONFIG_FTRACE_SELFTEST -- cgit v1.2.3-59-g8ed1b From af39241b90a345556b8884adff87096afe71b050 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 26 Feb 2009 10:11:05 -0500 Subject: tracing, genirq: add irq enter and exit trace events Impact: add new tracepoints Add them to the generic IRQ code, that way every architecture gets these new tracepoints, not just x86. Using Steve's new 'TRACE_FORMAT', I can get function graph trace as follows using the original two IRQ tracepoints: 3) | handle_IRQ_event() { 3) | /* (irq_handler_entry) irq=28 handler=eth0 */ 3) | e1000_intr_msi() { 3) 2.460 us | __napi_schedule(); 3) 9.416 us | } 3) | /* (irq_handler_exit) irq=28 handler=eth0 return=handled */ 3) + 22.935 us | } Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Acked-by: Masami Hiramatsu Cc: KOSAKI Motohiro Cc: Mathieu Desnoyers Cc: "Frank Ch. 
Eigler" Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/trace/irq.h | 9 +++++++++ include/trace/irq_event_types.h | 17 +++++++++++++++++ kernel/irq/handle.c | 6 ++++++ kernel/trace/events.c | 2 ++ 4 files changed, 34 insertions(+) create mode 100644 include/trace/irq.h create mode 100644 include/trace/irq_event_types.h (limited to 'kernel') diff --git a/include/trace/irq.h b/include/trace/irq.h new file mode 100644 index 000000000000..ff5d4495dc37 --- /dev/null +++ b/include/trace/irq.h @@ -0,0 +1,9 @@ +#ifndef _TRACE_IRQ_H +#define _TRACE_IRQ_H + +#include +#include + +#include + +#endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h new file mode 100644 index 000000000000..5d0919fdd2d4 --- /dev/null +++ b/include/trace/irq_event_types.h @@ -0,0 +1,17 @@ + +/* use instead */ +#ifndef TRACE_FORMAT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +TRACE_FORMAT(irq_handler_entry, + TPPROTO(int irq, struct irqaction *action), + TPARGS(irq, action), + TPFMT("irq=%d handler=%s", irq, action->name)); + +TRACE_FORMAT(irq_handler_exit, + TPPROTO(int irq, struct irqaction *action, int ret), + TPARGS(irq, action, ret), + TPFMT("irq=%d handler=%s return=%s", + irq, action->name, ret ? "handled" : "unhandled")); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3aba8d12f328..4709a7c870d7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internals.h" @@ -316,6 +317,9 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +DEFINE_TRACE(irq_handler_entry); +DEFINE_TRACE(irq_handler_exit); + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -332,7 +336,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) local_irq_enable_in_hardirq(); do { + trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, ret); if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 38c89eef99ee..3c75623893cc 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -6,8 +6,10 @@ /* trace/.h here */ #include +#include #include "trace_events.h" /* trace/_event_types.h here */ #include +#include -- cgit v1.2.3-59-g8ed1b From 6409c4da289d6905f7ae2bd0630438368439bda2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: sched_clock() improvement: use in_nmi() make sure we dont execute more complex sched_clock() code in NMI context. Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852414cc..db69174b1178 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -151,6 +152,13 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + /* + * Normally this is not called in NMI context - but if it is, + * trying to do any locking here is totally lethal. 
+ */ + if (unlikely(in_nmi())) + return scd->clock; + if (unlikely(!sched_clock_running)) return 0ull; -- cgit v1.2.3-59-g8ed1b From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 18:47:11 +0100 Subject: tracing: implement trace_clock_*() APIs Impact: implement new tracing timestamp APIs Add three trace clock variants, with differing scalability/precision tradeoffs: - local: CPU-local trace clock - medium: scalable global clock with some jitter - global: globally monotonic, serialized clock Make the ring-buffer use the local trace clock internally. Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/trace_clock.h | 19 +++++++++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer.c | 5 +-- kernel/trace/trace_clock.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 include/linux/trace_clock.h create mode 100644 kernel/trace/trace_clock.c (limited to 'kernel') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h new file mode 100644 index 000000000000..7a8130384087 --- /dev/null +++ b/include/linux/trace_clock.h @@ -0,0 +1,19 @@ +#ifndef _LINUX_TRACE_CLOCK_H +#define _LINUX_TRACE_CLOCK_H + +/* + * 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + */ +#include +#include + +extern u64 notrace trace_clock_local(void); +extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_global(void); + +#endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 664b6c0dc75a..c931fe0560cb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8f19f1aa42b0..a8c275c01e83 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -12,7 +13,6 @@ #include #include #include -#include /* used for sched_clock() (for now) */ #include #include #include @@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on); /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -/* FIXME!!! 
*/ u64 ring_buffer_time_stamp(int cpu) { u64 time; preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - time = sched_clock() << DEBUG_SHIFT; + time = trace_clock_local() << DEBUG_SHIFT; preempt_enable_no_resched_notrace(); return time; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 000000000000..2d4953f93560 --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,101 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include +#include +#include +#include +#include +#include + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + return sched_clock(); +} + +/* + * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return cpu_clock(raw_smp_processor_id()); +} + + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +static u64 prev_trace_clock_time; + +static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + raw_local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = cpu_clock(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + __raw_spin_lock(&trace_clock_lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_trace_clock_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - prev_trace_clock_time) < 0) + now = prev_trace_clock_time + 1; + + prev_trace_clock_time = now; + + __raw_spin_unlock(&trace_clock_lock); + + out: + raw_local_irq_restore(flags); + + return now; +} -- cgit v1.2.3-59-g8ed1b From a8259075074fb09c230b4cd2c8d3ee3c49d6ecd1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 22:19:12 -0500 Subject: tracing: add options directory and core option files This patch creates an options directory in the debugfs, that contains the available tracing options. These files contain 1 or 0, where 1 is the option is enabled and 0 it is disabled. Simply echoing in 1 will enable the option and 0 will disable it. This patch only contains the core options, not the tracer options. 
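As a rough userspace illustration of the interface described above (a sketch, not part of the patch; it assumes debugfs is mounted at /debug as in the other examples in this series, and the core option name "print-parent" is used purely as an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Write "1" or "0" into /debug/tracing/options/<name> to flip one option. */
static int set_trace_option(const char *name, int enable)
{
	char path[128];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/debug/tracing/options/%s", name);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	n = write(fd, enable ? "1" : "0", 1);
	close(fd);

	return n == 1 ? 0 : -1;
}

int main(void)
{
	/* "print-parent" is one of the core options; any listed option works. */
	return set_trace_option("print-parent", 1) ? 1 : 0;
}
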
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdaf60d3d337..40e983ed994f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3093,6 +3093,121 @@ static void tracing_init_debugfs_percpu(long cpu) #include "trace_selftest.c" #endif +static ssize_t +trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char *buf; + + if (trace_flags & (1 << index)) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + trace_flags &= ~(1 << index); + break; + case 1: + trace_flags |= 1 << index; + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + + +static const struct file_operations trace_options_core_fops = { + .open = tracing_open_generic, + .read = trace_options_core_read, + .write = trace_options_core_write, +}; + +static struct dentry *trace_options_init_dentry(void) +{ + struct dentry *d_tracer; + static struct dentry *t_options; + + if (t_options) + return t_options; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + t_options = debugfs_create_dir("options", d_tracer); + if (!t_options) { + pr_warning("Could not create debugfs directory 'options'\n"); + return NULL; + } + + return t_options; +} + +static struct dentry * +create_trace_option_core_file(const char *option, long index) +{ + struct dentry *t_options; + struct dentry *entry; + + t_options = trace_options_init_dentry(); + if (!t_options) + return NULL; + + entry = debugfs_create_file(option, 0644, t_options, (void *)index, + &trace_options_core_fops); + + return entry; +} + +static __init void create_trace_options_dir(void) +{ + struct dentry *t_options; + struct dentry *entry; + int i; + + t_options = trace_options_init_dentry(); + if (!t_options) + return; + + for (i = 0; trace_options[i]; i++) { + entry = create_trace_option_core_file(trace_options[i], i); + if (!entry) + pr_warning("Could not create debugfs %s entry\n", + trace_options[i]); + } +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -3111,6 +3226,8 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'trace_options' entry\n"); + create_trace_options_dir(); + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); if (!entry) -- cgit v1.2.3-59-g8ed1b From 577b785f55168d5acb3d123ba41bfe8d7981e044 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 23:43:05 -0500 Subject: tracing: add tracer dependent options to options directory This patch adds the tracer dependent options dynamically to the options directory when the tracer is activated. These options are removed when the tracer is deactivated. 
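For context, a minimal sketch of what a tracer has to provide for its options to appear in this directory; this is illustrative rather than taken from the patch, the tracer and option names are made up, and it assumes the tracer_opt/tracer_flags/TRACER_OPT definitions from kernel/trace/trace.h:

/* Illustrative only: a hypothetical tracer exposing a single option. */
#define MY_TRACER_OPT_VERBOSE	0x1

static struct tracer_opt my_tracer_opts[] = {
	{ TRACER_OPT(my-verbose, MY_TRACER_OPT_VERBOSE) },
	{ }	/* empty entry terminates the list */
};

static struct tracer_flags my_tracer_flags = {
	.val	= 0,
	.opts	= my_tracer_opts,
};

static int my_tracer_set_flag(u32 old_flags, u32 bit, int set)
{
	/* react to the change here if needed; returning 0 accepts the new value */
	return 0;
}

static struct tracer my_tracer __read_mostly = {
	.name		= "my_tracer",
	.flags		= &my_tracer_flags,
	.set_flag	= my_tracer_set_flag,
	/* .init, .reset, etc. omitted for brevity */
};

With this in place, the code added below creates and removes options/my-verbose automatically when the tracer is selected and deselected.
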
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 40e983ed994f..485c6e7f4465 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2275,8 +2275,17 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } +struct trace_option_dentry; + +static struct trace_option_dentry * +create_trace_option_files(struct tracer *tracer); + +static void +destroy_trace_option_files(struct trace_option_dentry *topts); + static int tracing_set_tracer(const char *buf) { + static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; int ret = 0; @@ -2297,7 +2306,12 @@ static int tracing_set_tracer(const char *buf) if (current_trace && current_trace->reset) current_trace->reset(tr); + destroy_trace_option_files(topts); + current_trace = t; + + topts = create_trace_option_files(current_trace); + if (t->init) { ret = tracer_init(t, tr); if (ret) @@ -3093,6 +3107,95 @@ static void tracing_init_debugfs_percpu(long cpu) #include "trace_selftest.c" #endif +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct dentry *entry; +}; + +static ssize_t +trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + char *buf; + + if (topt->flags->val & topt->opt->bit) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + unsigned long val; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = 0; + switch (val) { + case 0: + /* do nothing if already cleared */ + if (!(topt->flags->val & topt->opt->bit)) + break; + + mutex_lock(&trace_types_lock); + if (current_trace->set_flag) + ret = current_trace->set_flag(topt->flags->val, + topt->opt->bit, 0); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + topt->flags->val &= ~topt->opt->bit; + break; + case 1: + /* do nothing if already set */ + if (topt->flags->val & topt->opt->bit) + break; + + mutex_lock(&trace_types_lock); + if (current_trace->set_flag) + ret = current_trace->set_flag(topt->flags->val, + topt->opt->bit, 1); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + topt->flags->val |= topt->opt->bit; + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + + +static const struct file_operations trace_options_fops = { + .open = tracing_open_generic, + .read = trace_options_read, + .write = trace_options_write, +}; + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3146,7 +3249,6 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, @@ -3174,6 +3276,76 @@ static struct dentry *trace_options_init_dentry(void) return t_options; } +static void +create_trace_option_file(struct trace_option_dentry *topt, + struct tracer_flags 
*flags, + struct tracer_opt *opt) +{ + struct dentry *t_options; + struct dentry *entry; + + t_options = trace_options_init_dentry(); + if (!t_options) + return; + + topt->flags = flags; + topt->opt = opt; + + entry = debugfs_create_file(opt->name, 0644, t_options, topt, + &trace_options_fops); + + topt->entry = entry; + +} + +static struct trace_option_dentry * +create_trace_option_files(struct tracer *tracer) +{ + struct trace_option_dentry *topts; + struct tracer_flags *flags; + struct tracer_opt *opts; + int cnt; + + if (!tracer) + return NULL; + + flags = tracer->flags; + + if (!flags || !flags->opts) + return NULL; + + opts = flags->opts; + + for (cnt = 0; opts[cnt].name; cnt++) + ; + + topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL); + if (!topts) + return NULL; + + for (cnt = 0; opts[cnt].name; cnt++) + create_trace_option_file(&topts[cnt], flags, + &opts[cnt]); + + return topts; +} + +static void +destroy_trace_option_files(struct trace_option_dentry *topts) +{ + int cnt; + + if (!topts) + return; + + for (cnt = 0; topts[cnt].opt; cnt++) { + if (topts[cnt].entry) + debugfs_remove(topts[cnt].entry); + } + + kfree(topts); +} + static struct dentry * create_trace_option_core_file(const char *option, long index) { -- cgit v1.2.3-59-g8ed1b From d8e83d26b5ab3b31ee0ff6d093a2627707a1e221 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 23:55:58 -0500 Subject: tracing: add protection around open use of current_tracer Impact: fix to possible race conditions There's some uses of current_tracer that is not protected by the trace_types_lock. There is a small chance that a sysadmin changes the tracer while the current_tracer is being referenced. If the race is hit, it is unlikely to cause any harm since the tracers are constant and are not freed. But some strang side effects may occur. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 485c6e7f4465..6c89ec6cbd48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2024,12 +2024,12 @@ static ssize_t tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int i; + struct tracer_opt *trace_opts; + u32 tracer_flags; + int len = 0; char *buf; int r = 0; - int len = 0; - u32 tracer_flags = current_trace->flags->val; - struct tracer_opt *trace_opts = current_trace->flags->opts; + int i; /* calculate max size */ @@ -2038,6 +2038,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + mutex_lock(&trace_types_lock); + tracer_flags = current_trace->flags->val; + trace_opts = current_trace->flags->opts; + /* * Increase the size with names of options specific * of the current tracer. 
@@ -2049,8 +2053,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) + if (!buf) { + mutex_unlock(&trace_types_lock); return -ENOMEM; + } for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2067,6 +2073,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_opts[i].name); } + mutex_unlock(&trace_types_lock); r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2074,7 +2081,6 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); - return r; } @@ -2149,7 +2155,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) { + mutex_lock(&trace_types_lock); ret = set_tracer_option(current_trace, cmp, neg); + mutex_unlock(&trace_types_lock); if (ret) return ret; } -- cgit v1.2.3-59-g8ed1b From 85a2f9b46f8cd8aaa11c64c715e1ea3ec27ec486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 00:12:38 -0500 Subject: tracing: use pointer error returns for __tracing_open Impact: fix compile warning and clean up When I first wrote __tracing_open, instead of passing the error code via the ERR_PTR macros, I lazily used a separate parameter to hold the return for errors. When Frederic Weisbecker updated that function, he used the Linux kernel ERR_PTR for the returns. This caused the parameter return to possibly not be initialized on error. gcc correctly pointed this out with a warning. This patch converts the entire function to use the Linux kernel ERR_PTR macro methods. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c89ec6cbd48..304e02ce39c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1684,23 +1684,20 @@ static struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file, int *ret) +__tracing_open(struct inode *inode, struct file *file) { long cpu_file = (long) inode->i_private; + void *fail_ret = ERR_PTR(-ENOMEM); struct trace_iterator *iter; struct seq_file *m; - int cpu; + int cpu, ret; - if (tracing_disabled) { - *ret = -ENODEV; - return NULL; - } + if (tracing_disabled) + return ERR_PTR(-ENODEV); iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - *ret = -ENOMEM; - goto out; - } + if (!iter) + return ERR_PTR(-ENOMEM); /* * We make a copy of the current tracer to avoid concurrent @@ -1708,10 +1705,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) */ mutex_lock(&trace_types_lock); iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - *ret = -ENOMEM; + if (!iter->trace) goto fail; - } + if (current_trace) *iter->trace = *current_trace; @@ -1750,9 +1746,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) } /* TODO stop tracer */ - *ret = seq_open(file, &tracer_seq_ops); - if (*ret) + ret = seq_open(file, &tracer_seq_ops); + if (ret < 0) { + fail_ret = ERR_PTR(ret); goto fail_buffer; + } m = file->private_data; m->private = iter; @@ -1762,7 +1760,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) mutex_unlock(&trace_types_lock); - out: return iter; fail_buffer: @@ -1775,7 +1772,7 
@@ __tracing_open(struct inode *inode, struct file *file, int *ret) kfree(iter->trace); kfree(iter); - return ERR_PTR(-ENOMEM); + return fail_ret; } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -1815,9 +1812,12 @@ static int tracing_release(struct inode *inode, struct file *file) static int tracing_open(struct inode *inode, struct file *file) { - int ret; + struct trace_iterator *iter; + int ret = 0; - __tracing_open(inode, file, &ret); + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); return ret; } @@ -1825,11 +1825,13 @@ static int tracing_open(struct inode *inode, struct file *file) static int tracing_lt_open(struct inode *inode, struct file *file) { struct trace_iterator *iter; - int ret; + int ret = 0; - iter = __tracing_open(inode, file, &ret); + iter = __tracing_open(inode, file); - if (!ret) + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else iter->iter_flags |= TRACE_FILE_LAT_FMT; return ret; -- cgit v1.2.3-59-g8ed1b From 5c6a3ae1b4beebb56e2916b84f1208d96a9e32ff Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 00:22:21 -0500 Subject: tracing: use newline separator for trace options list Impact: clean up Instead of listing the trace options like: # cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj We now list them like: # cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 304e02ce39c4..5db7485158df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,7 +2037,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); - len += 3; /* "no" and space */ + len += 3; /* "no" and newline */ } mutex_lock(&trace_types_lock); @@ -2050,7 +2050,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, */ for (i = 0; trace_opts[i].name; i++) { len += strlen(trace_opts[i].name); - len += 3; /* "no" and space */ + len += 3; /* "no" and newline */ } /* +2 for \n and \0 */ @@ -2062,22 +2062,21 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s ", trace_options[i]); + r += sprintf(buf + r, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s ", trace_options[i]); + r += sprintf(buf + r, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s ", + r += sprintf(buf + r, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s ", + r += sprintf(buf + r, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3-59-g8ed1b From 0cfe82451dfa3ebf4e69158f2eb450f2fbb6b715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 10:51:10 -0500 Subject: tracing: replace kzalloc with kcalloc Impact: clean up kcalloc is a better approach to allocate 
a NULL array. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5db7485158df..9c5987aca74b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3328,7 +3328,7 @@ create_trace_option_files(struct tracer *tracer) for (cnt = 0; opts[cnt].name; cnt++) ; - topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL); + topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) return NULL; -- cgit v1.2.3-59-g8ed1b From eb594e45f6979cd10b18d87f7b3f02119e00a108 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 17:36:06 -0500 Subject: tracing: move trace point formats to files in include/trace directory Impact: clean up To further facilitate the ease of adding trace points for developers, this patch creates include/trace/trace_events.h and include/trace/trace_event_types.h. The former file will hold the trace/.h files and the latter will hold the trace/_event_types.h files. To create new tracepoints and to have them automatically appear in the event tracer, a developer makes the trace/.h file which includes and the trace/_event_types.h file. The trace/_event_types.h file will hold the TRACE_FORMAT macros. Then add the trace/.h file to trace/trace_events.h, and add the trace/_event_types.h to the trace_event_types.h file. No need to modify files elsewhere. Signed-off-by: Steven Rostedt --- include/trace/trace_event_types.h | 4 ++++ include/trace/trace_events.h | 4 ++++ kernel/trace/events.c | 10 ++-------- 3 files changed, 10 insertions(+), 8 deletions(-) create mode 100644 include/trace/trace_event_types.h create mode 100644 include/trace/trace_events.h (limited to 'kernel') diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h new file mode 100644 index 000000000000..33c8ed5ccb6c --- /dev/null +++ b/include/trace/trace_event_types.h @@ -0,0 +1,4 @@ +/* trace/_event_types.h here */ + +#include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h new file mode 100644 index 000000000000..ea2ef2051762 --- /dev/null +++ b/include/trace/trace_events.h @@ -0,0 +1,4 @@ +/* trace/.h here */ + +#include +#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 3c75623893cc..46e27ad2487e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -1,15 +1,9 @@ /* * This is the place to register all trace points as events. - * Include the trace/.h at the top. - * Include the trace/_event_types.h at the bottom. */ -/* trace/.h here */ -#include -#include +#include #include "trace_events.h" -/* trace/_event_types.h here */ -#include -#include +#include -- cgit v1.2.3-59-g8ed1b From 6ecc2d1ca39177edb6fbdb7412948b0e9f409d02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 21:33:02 -0500 Subject: tracing: add subsystem level to trace events If a trace point header defines TRACE_SYSTEM, then it will add the following trace points into that event system. If include/trace/irq_event_types.h has: #define TRACE_SYSTEM irq at the top and #undef TRACE_SYSTEM at the bottom, then a directory "irq" will be created in the /debug/tracing/events directory. Inside that directory will contain the two trace points that are defined in include/trace/irq_event_types.h. 
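To make that concrete, a sketch (not a hunk from this patch) of how include/trace/irq_event_types.h would look with the subsystem markers added; only the first tracepoint is repeated here:

#define TRACE_SYSTEM irq

TRACE_FORMAT(irq_handler_entry,
	TPPROTO(int irq, struct irqaction *action),
	TPARGS(irq, action),
	TPFMT("irq=%d handler=%s", irq, action->name));

/* ... irq_handler_exit as introduced earlier in this series ... */

#undef TRACE_SYSTEM
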
Only adding the above to irq and not to sched, we get: # ls /debug/tracing/events/ irq sched_process_exit sched_signal_send sched_wakeup_new sched_kthread_stop sched_process_fork sched_switch sched_kthread_stop_ret sched_process_free sched_wait_task sched_migrate_task sched_process_wait sched_wakeup # ls /debug/tracing/events/irq irq_handler_entry irq_handler_exit If we add #define TRACE_SYSTEM sched to the trace/sched_event_types.h then the rest of the trace events will be put in a sched directory within the events directory. I've been playing with this idea of the subsystem for a while, but recently Tom Zanussi posted some patches to lkml that included this method. Tom's approach was clean and got me to finally put some effort to clean up the event trace points. Thanks to Tom Zanussi for demonstrating how nice the subsystem method is. Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 4 ++++ kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.h | 2 ++ 3 files changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 46e27ad2487e..4e4e45860c58 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -2,6 +2,10 @@ * This is the place to register all trace points as events. */ +/* someday this needs to go in a generic header */ +#define __STR(x) #x +#define STR(x) __STR(x) + #include #include "trace_events.h" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3bcb9df93342..19332200c457 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -345,11 +345,59 @@ static struct dentry *event_trace_events_dir(void) return d_events; } +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; +}; + +static LIST_HEAD(event_subsystems); + +static struct dentry * +event_subsystem_dir(const char *name, struct dentry *d_events) +{ + struct event_subsystem *system; + + /* First see if we did not already create this dir */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + return system->entry; + } + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) { + pr_warning("No memory to create event subsystem %s\n", + name); + return d_events; + } + + system->entry = debugfs_create_dir(name, d_events); + if (!system->entry) { + pr_warning("Could not create event subsystem %s\n", + name); + kfree(system); + return d_events; + } + + system->name = name; + list_add(&system->list, &event_subsystems); + + return system->entry; +} + static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) { struct dentry *entry; + /* + * If the trace point header did not define TRACE_SYSTEM + * then the system would be called "TRACE_SYSTEM". 
+ */ + if (strcmp(call->system, "TRACE_SYSTEM") != 0) + d_events = event_subsystem_dir(call->system, d_events); + call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index deb95e5006c8..b015d7b19878 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -7,6 +7,7 @@ struct ftrace_event_call { char *name; + char *system; struct dentry *dir; int enabled; int (*regfunc)(void); @@ -44,6 +45,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ + .system = STR(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } -- cgit v1.2.3-59-g8ed1b From b628b3e629b1436710e59a21cc020fbb04a52ce1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 23:32:58 -0500 Subject: tracing: make the set_event and available_events subsystem aware This patch makes the event files, set_event and available_events aware of the subsystem. Now you can enable an entire subsystem with: echo 'irq:*' > set_event Note: the '*' is not needed. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 19332200c457..b811eb343522 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,8 @@ #include "trace_events.h" +#define TRACE_SYSTEM "TRACE_SYSTEM" + #define events_for_each(event) \ for (event = __start_ftrace_events; \ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ @@ -45,14 +47,47 @@ static void ftrace_clear_events(void) static int ftrace_set_clr_event(char *buf, int set) { struct ftrace_event_call *call = __start_ftrace_events; + char *event = NULL, *sub = NULL, *match; + int ret = -EINVAL; + + /* + * The buf format can be : + * *: means any event by that name. + * : is the same. + * + * :* means all events in that subsystem + * : means the same. 
+ * + * (no ':') means all events in a subsystem with + * the name or any event that matches + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } events_for_each(call) { if (!call->name) continue; - if (strcmp(buf, call->name) != 0) + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->system) != 0) + continue; + + if (sub && strcmp(sub, call->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) continue; if (set) { @@ -68,9 +103,9 @@ static int ftrace_set_clr_event(char *buf, int set) call->enabled = 0; call->unregfunc(); } - return 0; + ret = 0; } - return -EINVAL; + return ret; } /* 128 should be much more than enough */ @@ -200,6 +235,8 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; + if (strcmp(call->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->system); seq_printf(m, "%s\n", call->name); return 0; -- cgit v1.2.3-59-g8ed1b From ef5580d0fffce6e0a01043bac0625128b5d409a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 19:38:04 -0500 Subject: tracing: add interface to write into current tracer buffer Right now all tracers must manage their own trace buffers. This was to enforce tracers to be independent in case we finally decide to allow each tracer to have their own trace buffer. But now we are adding event tracing that writes to the current tracer's buffer. This adds an interface to allow events to write to the current tracer buffer without having to manage its own. Since event tracing has no "tracer", and is just a way to hook into any other tracer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++++++++++ kernel/trace/trace.h | 6 ++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c5987aca74b..c5e39cd7310d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -846,6 +846,20 @@ void trace_buffer_unlock_commit(struct trace_array *tr, trace_wake_up(); } +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc) +{ + return trace_buffer_lock_reserve(&global_trace, + type, len, flags, pc); +} + +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + return trace_buffer_unlock_commit(&global_trace, event, flags, pc); +} + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 632191770aac..adf161f6dd11 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -442,6 +442,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); -- cgit v1.2.3-59-g8ed1b From c32e827b25054cb17b79cf97fb5e63ae4ce2223c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 19:12:30 -0500 Subject: tracing: add raw trace point recording infrastructure Impact: lower overhead 
tracing The current event tracer can automatically pick up trace points that are registered with the TRACE_FORMAT macro. But it required a printf format string and parsing. Although, this adds the ability to get guaranteed information like task names and such, it took a hit in overhead processing. This processing can add about 500-1000 nanoseconds overhead, but in some cases that too is considered too much and we want to shave off as much from this overhead as possible. Tom Zanussi recently posted tracing patches to lkml that are based on a nice idea about capturing the data via C structs using STRUCT_ENTER, STRUCT_EXIT type of macros. I liked that method very much, but did not like the implementation that required a developer to add data/code in several disjoint locations. This patch extends the event_tracer macros to do a similar "raw C" approach that Tom Zanussi did. But instead of having the developers needing to tweak a bunch of code all over the place, they can do it all in one macro - preferably placed near the code that it is tracing. That makes it much more likely that tracepoints will be maintained on an ongoing basis by the code they modify. The new macro TRACE_EVENT_FORMAT is created for this approach. (Note, a developer may still utilize the more low level DECLARE_TRACE macros if they don't care about getting their traces automatically in the event tracer.) They can also use the existing TRACE_FORMAT if they don't need to code the tracepoint in C, but just want to use the convenience of printf. So if the developer wants to "hardwire" a tracepoint in the fastest possible way, and wants to acquire their data via a user space utility in a raw binary format, or wants to see it in the trace output but not sacrifice any performance, then they can implement the faster but more complex TRACE_EVENT_FORMAT macro. Here's what usage looks like: TRACE_EVENT_FORMAT(name, TPPROTO(proto), TPARGS(args), TPFMT(fmt, fmt_args), TRACE_STUCT( TRACE_FIELD(type1, item1, assign1) TRACE_FIELD(type2, item2, assign2) [...] ), TPRAWFMT(raw_fmt) ); Note name, proto, args, and fmt, are all identical to what TRACE_FORMAT uses. name: is the unique identifier of the trace point proto: The proto type that the trace point uses args: the args in the proto type fmt: printf format to use with the event printf tracer fmt_args: the printf argments to match fmt TRACE_STRUCT starts the ability to create a structure. Each item in the structure is defined with a TRACE_FIELD TRACE_FIELD(type, item, assign) type: the C type of item. item: the name of the item in the stucture assign: what to assign the item in the trace point callback raw_fmt is a way to pretty print the struct. It must match the order of the items are added in TRACE_STUCT An example of this would be: TRACE_EVENT_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), TPRAWFMT("task %d success=%d") ); This creates us a unique struct of: struct { pid_t pid; int success; }; And the way the call back would assign these values would be: entry->pid = p->pid; entry->success = success; The nice part about this is that the creation of the assignent is done via macro magic in the event tracer. Once the TRACE_EVENT_FORMAT is created, the developer will then have a faster method to record into the ring buffer. They do not need to worry about the tracer itself. 
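As a second, smaller sketch (not part of this patch), the irq_handler_exit tracepoint introduced earlier in this series could be switched over to the raw form roughly like this; the choice of raw fields and the TPRAWFMT string are illustrative:

TRACE_EVENT_FORMAT(irq_handler_exit,
	TPPROTO(int irq, struct irqaction *action, int ret),
	TPARGS(irq, action, ret),
	TPFMT("irq=%d handler=%s return=%s",
	      irq, action->name, ret ? "handled" : "unhandled"),
	TRACE_STRUCT(
		TRACE_FIELD(int, irq, irq)
		TRACE_FIELD(int, ret, ret)
	),
	TPRAWFMT("irq %d ret %d")
	);
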
The developer would only need to touch the files in include/trace/*.h Again, I would like to give special thanks to Tom Zanussi for this nice idea. Idea-from: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 6 +- kernel/trace/trace.h | 19 ++++ kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events.h | 57 ---------- kernel/trace/trace_events_stage_1.h | 34 ++++++ kernel/trace/trace_events_stage_2.h | 72 ++++++++++++ kernel/trace/trace_events_stage_3.h | 219 ++++++++++++++++++++++++++++++++++++ 7 files changed, 350 insertions(+), 59 deletions(-) delete mode 100644 kernel/trace/trace_events.h create mode 100644 kernel/trace/trace_events_stage_1.h create mode 100644 kernel/trace/trace_events_stage_2.h create mode 100644 kernel/trace/trace_events_stage_3.h (limited to 'kernel') diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 4e4e45860c58..f2509cbaacea 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,10 @@ #include -#include "trace_events.h" +#include "trace_output.h" + +#include "trace_events_stage_1.h" +#include "trace_events_stage_2.h" +#include "trace_events_stage_3.h" #include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index adf161f6dd11..aa1ab0cb80ab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -726,4 +726,23 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + struct dentry *raw_dir; + int raw_enabled; + int (*raw_init)(void); + int (*raw_reg)(void); + void (*raw_unreg)(void); +}; + +void event_trace_printk(unsigned long ip, const char *fmt, ...); +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b811eb343522..77a5c02bd634 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -10,7 +10,7 @@ #include #include -#include "trace_events.h" +#include "trace.h" #define TRACE_SYSTEM "TRACE_SYSTEM" diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h deleted file mode 100644 index b015d7b19878..000000000000 --- a/kernel/trace/trace_events.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef _LINUX_KERNEL_TRACE_EVENTS_H -#define _LINUX_KERNEL_TRACE_EVENTS_H - -#include -#include -#include "trace.h" - -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); -}; - - -#undef TPFMT -#define TPFMT(fmt, args...) 
fmt "\n", ##args - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (!ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = STR(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ -} - -void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h new file mode 100644 index 000000000000..fd3bf9382d37 --- /dev/null +++ b/kernel/trace/trace_events_stage_1.h @@ -0,0 +1,34 @@ +/* + * Stage 1 of the trace events. + * + * Override the macros in to include the following: + * + * struct ftrace_raw_ { + * struct trace_entry ent; + * ; + * [...] + * }; + * + * The is created by the TRACE_FIELD(type, item, assign) + * macro. We simply do "type item;", and that will create the fields + * in the structure. + */ + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#define TRACE_FIELD(type, item, assign) \ + type item; + +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h new file mode 100644 index 000000000000..3eaaef5f19e1 --- /dev/null +++ b/kernel/trace/trace_events_stage_2.h @@ -0,0 +1,72 @@ +/* + * Stage 2 of the trace events. + * + * Override the macros in to include the following: + * + * enum print_line_t + * ftrace_raw_output_(struct trace_iterator *iter, int flags) + * { + * struct trace_seq *s = &iter->seq; + * struct ftrace_raw_ *field; <-- defined in stage 1 + * struct trace_entry *entry; + * int ret; + * + * entry = iter->ent; + * + * if (entry->type != event_.id) { + * WARN_ON_ONCE(1); + * return TRACE_TYPE_UNHANDLED; + * } + * + * field = (typeof(field))entry; + * + * ret = trace_seq_printf(s, "%s", "\n"); + * if (!ret) + * return TRACE_TYPE_PARTIAL_LINE; + * + * return TRACE_TYPE_HANDLED; + * } + * + * This is the method used to print the raw event to the trace + * output format. Note, this is not needed if the data is read + * in binary. + */ + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + field->item, + + +#undef TPRAWFMT +#define TPRAWFMT(args...) 
args + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +enum print_line_t \ +ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ +{ \ + struct trace_seq *s = &iter->seq; \ + struct ftrace_raw_##call *field; \ + struct trace_entry *entry; \ + int ret; \ + \ + entry = iter->ent; \ + \ + if (entry->type != event_##call.id) { \ + WARN_ON_ONCE(1); \ + return TRACE_TYPE_UNHANDLED; \ + } \ + \ + field = (typeof(field))entry; \ + \ + ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + if (!ret) \ + return TRACE_TYPE_PARTIAL_LINE; \ + \ + return TRACE_TYPE_HANDLED; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h new file mode 100644 index 000000000000..7a161c49deb4 --- /dev/null +++ b/kernel/trace/trace_events_stage_3.h @@ -0,0 +1,219 @@ +/* + * Stage 3 of the trace events. + * + * Override the macros in to include the following: + * + * static void ftrace_event_(proto) + * { + * event_trace_printk(_RET_IP_, "() " ); + * } + * + * static int ftrace_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_event_); + * } + * + * For those macros defined with TRACE_FORMAT: + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * } + * + * + * For those macros defined with TRACE_EVENT_FORMAT: + * + * static struct ftrace_event_call event_; + * + * static void ftrace_raw_event_(proto) + * { + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the TRACE_FIELD. + * + * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * } + * + * static int ftrace_raw_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_raw_event_); + * } + * + * static struct trace_event ftrace_event_type_ = { + * .trace = ftrace_raw_output_, <-- stage 2 + * }; + * + * static int ftrace_raw_init_event_(void) + * { + * int id; + * + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; + * } + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .raw_reg = ftrace_raw_reg_event_, + * .raw_unreg = ftrace_raw_unreg_event_, + * } + * + */ + +#undef TPFMT +#define TPFMT(fmt, args...) 
fmt "\n", ##args + +#define _TRACE_FORMAT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ +_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = STR(TRACE_SYSTEM), \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ +} + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ + \ +static struct ftrace_event_call event_##call; \ + \ +static void ftrace_raw_event_##call(proto) \ +{ \ + struct ring_buffer_event *event; \ + struct ftrace_raw_##call *entry; \ + unsigned long irq_flags; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + event = trace_current_buffer_lock_reserve(event_##call.id, \ + sizeof(struct ftrace_raw_##call), \ + irq_flags, pc); \ + if (!event) \ + return; \ + entry = ring_buffer_event_data(event); \ + \ + tstruct; \ + \ + trace_current_buffer_unlock_commit(event, irq_flags, pc); \ +} \ + \ +static int ftrace_raw_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_raw_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_raw_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_raw_event_##call); \ +} \ + \ +static struct trace_event ftrace_event_type_##call = { \ + .trace = ftrace_raw_output_##call, \ +}; \ + \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(&ftrace_event_type_##call); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = STR(TRACE_SYSTEM), \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ + .raw_init = ftrace_raw_init_event_##call, \ + .raw_reg = ftrace_raw_reg_event_##call, \ + .raw_unreg = ftrace_raw_unreg_event_##call, \ +} -- cgit v1.2.3-59-g8ed1b From fd99498989f3b3feeab89dcadf537138ba136d24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:41:25 -0500 Subject: tracing: add raw fast tracing interface for trace events This patch adds the interface to enable the C style trace points. In the directory /debugfs/tracing/events/subsystem/event We now have three files: enable : values 0 or 1 to enable or disable the trace event. available_types: values 'raw' and 'printf' which indicate the tracing types available for the trace point. If a developer does not use the TRACE_EVENT_FORMAT macro and just uses the TRACE_FORMAT macro, then only 'printf' will be available. 
This file is read only. type: values 'raw' or 'printf'. This indicates which type of tracing is active for that trace point. 'printf' is the default and if 'raw' is not available, this file is read only. # echo raw > /debug/tracing/events/sched/sched_wakeup/type # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable Will enable the C style tracing for the sched_wakeup trace point. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 7 ++ kernel/trace/trace_events.c | 199 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 181 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index aa1ab0cb80ab..f6fa0b9f83a8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -726,6 +726,12 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +/* trace event type bit fields, not numeric */ +enum { + TRACE_EVENT_TYPE_PRINTF = 1, + TRACE_EVENT_TYPE_RAW = 2, +}; + struct ftrace_event_call { char *name; char *system; @@ -736,6 +742,7 @@ struct ftrace_event_call { int id; struct dentry *raw_dir; int raw_enabled; + int type; int (*raw_init)(void); int (*raw_reg)(void); void (*raw_unreg)(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77a5c02bd634..1d07f800a9ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -44,6 +44,36 @@ static void ftrace_clear_events(void) } } +static void ftrace_event_enable_disable(struct ftrace_event_call *call, + int enable) +{ + + switch (enable) { + case 0: + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->raw_enabled) { + call->raw_enabled = 0; + call->raw_unreg(); + } + break; + case 1: + if (!call->enabled && + (call->type & TRACE_EVENT_TYPE_PRINTF)) { + call->enabled = 1; + call->regfunc(); + } + if (!call->raw_enabled && + (call->type & TRACE_EVENT_TYPE_RAW)) { + call->raw_enabled = 1; + call->raw_reg(); + } + break; + } +} + static int ftrace_set_clr_event(char *buf, int set) { struct ftrace_event_call *call = __start_ftrace_events; @@ -90,19 +120,8 @@ static int ftrace_set_clr_event(char *buf, int set) if (event && strcmp(event, call->name) != 0) continue; - if (set) { - /* Already set? */ - if (call->enabled) - return 0; - call->enabled = 1; - call->regfunc(); - } else { - /* Already cleared? 
*/ - if (!call->enabled) - return 0; - call->enabled = 0; - call->unregfunc(); - } + ftrace_event_enable_disable(call, set); + ret = 0; } return ret; @@ -273,7 +292,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->enabled || call->raw_enabled) buf = "1\n"; else buf = "0\n"; @@ -304,18 +323,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: - if (!call->enabled) - break; - - call->enabled = 0; - call->unregfunc(); - break; case 1: - if (call->enabled) - break; - - call->enabled = 1; - call->regfunc(); + ftrace_event_enable_disable(call, val); break; default: @@ -327,6 +336,107 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +event_type_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[16]; + int r = 0; + + if (call->type & TRACE_EVENT_TYPE_PRINTF) + r += sprintf(buf, "printf\n"); + + if (call->type & TRACE_EVENT_TYPE_RAW) + r += sprintf(buf+r, "raw\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64]; + + /* + * If there's only one type, we can't change it. + * And currently we always have printf type, and we + * may or may not have raw type. + * + * This is a redundant check, the file should be read + * only if this is the case anyway. + */ + + if (!call->raw_init) + return -EPERM; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (!strncmp(buf, "printf", 6) && + (!buf[6] || isspace(buf[6]))) { + + call->type = TRACE_EVENT_TYPE_PRINTF; + + /* + * If raw enabled, the disable it and enable + * printf type. + */ + if (call->raw_enabled) { + call->raw_enabled = 0; + call->raw_unreg(); + + call->enabled = 1; + call->regfunc(); + } + + } else if (!strncmp(buf, "raw", 3) && + (!buf[3] || isspace(buf[3]))) { + + call->type = TRACE_EVENT_TYPE_RAW; + + /* + * If printf enabled, the disable it and enable + * raw type. 
+ */ + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + + call->raw_enabled = 1; + call->raw_reg(); + } + } else + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[16]; + int r = 0; + + r += sprintf(buf, "printf\n"); + + if (call->raw_init) + r += sprintf(buf+r, "raw\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -362,6 +472,17 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; +static const struct file_operations ftrace_type_fops = { + .open = tracing_open_generic, + .read = event_type_read, + .write = event_type_write, +}; + +static const struct file_operations ftrace_available_types_fops = { + .open = tracing_open_generic, + .read = event_available_types_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -427,6 +548,7 @@ static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) { struct dentry *entry; + int ret; /* * If the trace point header did not define TRACE_SYSTEM @@ -435,6 +557,18 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (strcmp(call->system, "TRACE_SYSTEM") != 0) d_events = event_subsystem_dir(call->system, d_events); + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + + /* default the output to printf */ + call->type = TRACE_EVENT_TYPE_PRINTF; + call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -448,6 +582,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) pr_warning("Could not create debugfs " "'%s/enable' entry\n", call->name); + /* Only let type be writable, if we can change it */ + entry = debugfs_create_file("type", + call->raw_init ? 0644 : 0444, + call->dir, call, + &ftrace_type_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/type' entry\n", call->name); + + entry = debugfs_create_file("available_types", 0444, call->dir, call, + &ftrace_available_types_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/type' available_types\n", call->name); + return 0; } -- cgit v1.2.3-59-g8ed1b From d20e3b03842bfeb9d21817ff19054c277cc3eac0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 10:53:15 -0500 Subject: tracing: add TRACE_FIELD_SPECIAL to record complex entries Tom Zanussi pointed out that the simple TRACE_FIELD was not enough to record trace data that required memcpy. This patch addresses this issue by adding a TRACE_FIELD_SPECIAL. 
The format is similar to TRACE_FIELD but looks like so: TRACE_FIELD_SPECIAL(type_item, item, cmd) What TRACE_FIELD gave was: TRACE_FIELD(type, item, assign) The TRACE_FIELD would be used in declaring a structure: struct { type item; }; And later assign it via: entry->item = assign; What TRACE_FIELD_SPECIAL gives us is: In the declaration of the structure: struct { type_item; }; And the assignment: cmd; This change log will explain the one example used in the patch: TRACE_EVENT_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, TPCMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), TPRAWFMT("prev %d:%d ==> next %s:%d:%d") ); The struct will be create as: struct { pid_t prev_pid; int prev_prio; char next_comm[TASK_COMM_LEN]; pid_t next_pid; int next_prio; }; Note the TRACE_ENTRY in the cmd part of TRACE_SPECIAL. TRACE_ENTRY will be set by the tracer to point to the structure inside the trace buffer. entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; memcpy(entry->next_comm, next->comm, TASK_COMM_LEN); entry->next_pid = next->pid; entry->next_prio = next->prio Reported-by: Tom Zanussi Signed-off-by: Steven Rostedt --- include/trace/sched_event_types.h | 7 ++++++- kernel/trace/trace_events_stage_1.h | 2 ++ kernel/trace/trace_events_stage_2.h | 4 ++++ kernel/trace/trace_events_stage_3.h | 14 ++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index ba059c10b58a..a6de5c1601a0 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -71,10 +71,15 @@ TRACE_EVENT_FORMAT(sched_switch, TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) + TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], + next_comm, + TPCMD(memcpy(TRACE_ENTRY->next_comm, + next->comm, + TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %d:%d") + TPRAWFMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index fd3bf9382d37..3830a731424c 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -30,5 +30,7 @@ #define TRACE_FIELD(type, item, assign) \ type item; +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + type_item; #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 3eaaef5f19e1..dc79fe3a2ecb 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,6 +39,10 @@ #define TRACE_FIELD(type, item, assign) \ field->item, +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + field->item, + #undef TPRAWFMT #define TPRAWFMT(args...) 
args diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 7a161c49deb4..2ab65e958223 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -147,6 +147,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TPCMD +#define TPCMD(cmd...) cmd + +#undef TRACE_ENTRY +#define TRACE_ENTRY entry + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + cmd; + #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -- cgit v1.2.3-59-g8ed1b From 11a241a3302277db05561e01477528629d806c4e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 11:49:04 -0500 Subject: tracing: add protection around modify trace event fields The trace event objects are currently not proctected against reentrancy. This patch adds a mutex around the modifications of the trace event fields. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d07f800a9ce..26069fa6b3b0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -14,6 +14,8 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" +static DEFINE_MUTEX(event_mutex); + #define events_for_each(event) \ for (event = __start_ftrace_events; \ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ @@ -104,6 +106,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } + mutex_lock(&event_mutex); events_for_each(call) { if (!call->name) @@ -124,6 +127,8 @@ static int ftrace_set_clr_event(char *buf, int set) ret = 0; } + mutex_unlock(&event_mutex); + return ret; } @@ -324,7 +329,9 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: + mutex_lock(&event_mutex); ftrace_event_enable_disable(call, val); + mutex_unlock(&event_mutex); break; default: -- cgit v1.2.3-59-g8ed1b From f9520750c4c9924c14325cd951efae5fae58104c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 14:04:40 -0500 Subject: tracing: make trace_seq_reset global and rename to trace_seq_init Impact: clean up The trace_seq functions may be used separately outside of the ftrace iterator. The trace_seq_reset is needed for these operations. This patch also renames trace_seq_reset to the more appropriate trace_seq_init. 
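A minimal sketch of the standalone use this enables (not taken from the patch; the helper name is made up, and it mirrors the kmalloc pattern used by the event "format" file, assuming a sleepable kernel context with <linux/slab.h> available):

	static int example_trace_seq_user(void)
	{
		struct trace_seq *s;

		s = kmalloc(sizeof(*s), GFP_KERNEL);
		if (!s)
			return -ENOMEM;

		trace_seq_init(s);
		trace_seq_printf(s, "standalone trace_seq user: %d\n", 42);
		/* hand s->buffer / s->len to whatever consumes the text */

		trace_seq_init(s);	/* reset len and readpos before reuse */
		kfree(s);
		return 0;
	}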
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 +++++------------ kernel/trace/trace.h | 8 ++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c5e39cd7310d..ea055aa21cd9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -342,13 +342,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(tsk); } -static void -trace_seq_reset(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -395,7 +388,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) s->buffer[len] = 0; seq_puts(m, s->buffer); - trace_seq_reset(s); + trace_seq_init(s); } /** @@ -2620,7 +2613,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (sret != -EBUSY) return sret; - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); @@ -2682,7 +2675,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); /* * If there was nothing to send to user, inspite of consuming trace @@ -2819,7 +2812,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, partial[i].offset = 0; partial[i].len = iter->seq.len; - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); } mutex_unlock(&iter->mutex); @@ -3631,7 +3624,7 @@ trace_printk_seq(struct trace_seq *s) printk(KERN_TRACE "%s", s->buffer); - trace_seq_reset(s); + trace_seq_init(s); } void ftrace_dump(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f6fa0b9f83a8..cf6ba4181b14 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,14 @@ struct trace_seq { unsigned int readpos; }; +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + + #define TRACE_PIPE_ALL_CPU -1 /* -- cgit v1.2.3-59-g8ed1b From 981d081ec8b958b7d962ee40d433581a55d40fc5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 13:53:59 -0500 Subject: tracing: add format file to describe event struct fields This patch adds the "format" file to the trace point event directory. This is based off of work by Tom Zanussi, in which a file is exported to be tread from user land such that a user space app may read the binary record stored in the ring buffer. 
# cat /debug/tracing/events/sched/sched_switch/format field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; Idea-from: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 56 ++++++++++++++++++++++++++++++++++++- kernel/trace/trace_events_stage_2.h | 52 ++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 2 ++ 4 files changed, 110 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cf6ba4181b14..e606633fb498 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -754,6 +754,7 @@ struct ftrace_event_call { int (*raw_init)(void); int (*raw_reg)(void); void (*raw_unreg)(void); + int (*show_format)(struct trace_seq *s); }; void event_trace_printk(unsigned long ip, const char *fmt, ...); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 26069fa6b3b0..d57a772981c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt * + * - Added format output of fields of the trace point. + * This was based off of work by Tom Zanussi . + * */ #include @@ -444,6 +447,42 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static ssize_t +event_format_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + char *buf; + int r; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + if (*ppos) + return 0; + + r = call->show_format(s); + if (!r) { + /* + * ug! The format output is bigger than a PAGE!! + */ + buf = "FORMAT TOO BIG\n"; + r = simple_read_from_buffer(ubuf, cnt, ppos, + buf, strlen(buf)); + goto out; + } + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + out: + kfree(s); + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -490,6 +529,11 @@ static const struct file_operations ftrace_available_types_fops = { .read = event_available_types_read, }; +static const struct file_operations ftrace_event_format_fops = { + .open = tracing_open_generic, + .read = event_format_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -602,7 +646,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) &ftrace_available_types_fops); if (!entry) pr_warning("Could not create debugfs " - "'%s/type' available_types\n", call->name); + "'%s/available_types' entry\n", call->name); + + /* A trace may not want to export its format */ + if (!call->show_format) + return 0; + + entry = debugfs_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/format' entry\n", call->name); return 0; } diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index dc79fe3a2ecb..3a80ea4e92cb 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -74,3 +74,55 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ } #include + +/* + * Setup the showing format of trace point. 
+ * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + return ret; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2ab65e958223..c62a4d2a5283 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -101,6 +101,7 @@ * .raw_init = ftrace_raw_init_event_, * .raw_reg = ftrace_raw_reg_event_, * .raw_unreg = ftrace_raw_unreg_event_, + * .show_format = ftrace_format_, * } * */ @@ -230,4 +231,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = ftrace_raw_init_event_##call, \ .raw_reg = ftrace_raw_reg_event_##call, \ .raw_unreg = ftrace_raw_unreg_event_##call, \ + .show_format = ftrace_format_##call, \ } -- cgit v1.2.3-59-g8ed1b From 91729ef96661bfa7dc53923746cd90b62d5495cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:03:01 -0500 Subject: tracing: add ftrace headers to event format files This patch includes the ftrace header to the event formats files: # cat /debug/tracing/events/sched/sched_switch/format field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; A blank line is used as a deliminator between the ftrace header and the trace point fields. 
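The offsets make it straightforward for a user space reader of the binary record to mirror the layout. A hypothetical mirror struct for the example above (a sketch only; the field names, offsets and sizes come from the output shown, and TASK_COMM_LEN is assumed to be 16):

	struct sched_switch_record {
		unsigned char	type;		/* offset:0;  size:1  */
		unsigned char	flags;		/* offset:1;  size:1  */
		unsigned char	preempt_count;	/* offset:2;  size:1  */
		int		pid;		/* offset:4;  size:4  */
		int		tgid;		/* offset:8;  size:4  */
		int		prev_pid;	/* offset:12; size:4  (pid_t) */
		int		prev_prio;	/* offset:16; size:4  */
		char		next_comm[16];	/* offset:20; size:16 */
		int		next_pid;	/* offset:36; size:4  (pid_t) */
		int		next_prio;	/* offset:40; size:4  */
	};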
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d57a772981c1..cdcc3aed76fd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -13,7 +13,7 @@ #include #include -#include "trace.h" +#include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" @@ -447,6 +447,28 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +#undef FIELD +#define FIELD(type, name) \ + #type, #name, offsetof(typeof(field), name), sizeof(field.name) + +static int trace_write_header(struct trace_seq *s) +{ + struct trace_entry field; + + /* struct trace_entry */ + return trace_seq_printf(s, + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\n", + FIELD(unsigned char, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, tgid)); +} static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -465,6 +487,9 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; + /* If this fails, so will the show_format. */ + trace_write_header(s); + r = call->show_format(s); if (!r) { /* -- cgit v1.2.3-59-g8ed1b From c5e4e19271edfdf1abd4184933d40d646da6a091 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:10:02 -0500 Subject: tracing: add trace name and id to event formats To be able to identify the trace in the binary format output, the id of the trace event (which is dynamically assigned) must also be listed. This patch adds the name of the trace point as well as the id assigned. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cdcc3aed76fd..210e71ff82db 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -487,7 +487,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; - /* If this fails, so will the show_format. */ + /* If any of the first writes fail, so will the show_format. */ + + trace_seq_printf(s, "name: %s\n", call->name); + trace_seq_printf(s, "ID: %d\n", call->id); + trace_seq_printf(s, "format:\n"); trace_write_header(s); r = call->show_format(s); -- cgit v1.2.3-59-g8ed1b From 96ccd21cd13140221bda74a4fc4e53ffeba7c7d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:22:21 -0500 Subject: tracing: add print format to event trace format files This patch adds the internal print format used to print the raw events to the event trace point format file. 
# cat /debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 29 format: field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; print fmt: "prev %d:%d ==> next %s:%d:%d" Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 3a80ea4e92cb..b1cebba1d9b4 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -122,6 +122,8 @@ ftrace_format_##call(struct trace_seq *s) \ \ tstruct; \ \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ return ret; \ } -- cgit v1.2.3-59-g8ed1b From 633ddaa7f471e9db181f993c1458d6f4bae321ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 09:43:50 -0500 Subject: tracing: fix return value to registering events The registering of events had the return value check backwards. A zero returned is success, the check had it as a failure. This patch also fixes a missing "\n" in the warning that the check failed. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_3.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index c62a4d2a5283..041789ffbac1 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -120,9 +120,9 @@ static int ftrace_reg_event_##call(void) \ int ret; \ \ ret = register_trace_##call(ftrace_event_##call); \ - if (!ret) \ + if (ret) \ pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ + "probe to " #call "\n"); \ return ret; \ } \ \ @@ -195,9 +195,9 @@ static int ftrace_raw_reg_event_##call(void) \ int ret; \ \ ret = register_trace_##call(ftrace_raw_event_##call); \ - if (!ret) \ + if (ret) \ pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ + "probe to " #call "\n"); \ return ret; \ } \ \ -- cgit v1.2.3-59-g8ed1b From 41be4da4e85e58520b934040966a6ae919c66c2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 20:56:48 -0500 Subject: ring-buffer: reset write field for ring_buffer_read_page Impact: fix ring_buffer_read_page After a page is swapped into the ring buffer, the write field must also be reset. 
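For context, a minimal consumer sketch of the path that performs the swap (following the existing ring_buffer_read_page kerneldoc at this point in the series; "buffer" and "cpu" are assumed to be in scope and process_page() is a hypothetical callback):

	void *page;
	int ret;

	page = ring_buffer_alloc_read_page(buffer);
	if (!page)
		return;

	/*
	 * May swap 'page' with the reader page; the page handed back to
	 * the ring buffer is the one that needs its write field reset.
	 */
	ret = ring_buffer_read_page(buffer, &page, cpu, 0);
	if (ret >= 0)
		process_page(page);

	ring_buffer_free_read_page(buffer, page);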
Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a8c275c01e83..9baad7ee4b36 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2492,6 +2492,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rb_init_page(bpage); bpage = cpu_buffer->reader_page->page; cpu_buffer->reader_page->page = *data_page; + local_set(&cpu_buffer->reader_page->write, 0); cpu_buffer->reader_page->read = 0; *data_page = bpage; } -- cgit v1.2.3-59-g8ed1b From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 00:27:49 -0500 Subject: ring-buffer: fix ring_buffer_read_page The ring_buffer_read_page was broken if it were to only copy part of the page. This patch fixes that up as well as adds a parameter to allow a length field, in order to only copy part of the buffer page. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++- kernel/trace/ring_buffer.c | 92 +++++++++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f5e793d69bd3..79fcbc4b09d6 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +size_t ring_buffer_page_len(void *page); + + /* * The below functions are fine to use outside the tracing facility. */ @@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; } void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full); +int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, + size_t len, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9baad7ee4b36..2ad6bae95a3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } @@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. 
* @@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * @@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * <0 if no data has been transferred. */ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; + unsigned int commit; unsigned int read; int ret = -1; if (!data_page) - return 0; + return -1; bpage = *data_page; if (!bpage) - return 0; + return -1; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) goto out; - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - read = cpu_buffer->reader_page->read; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If len > what's left on the page, and the writer is also off of + * the read page, then simply switch the read page with the given + * page. Otherwise we need to copy the data from the reader to the + * writer. 
*/ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + if ((len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int pos = read; + unsigned int size; if (full) goto out; - /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data + read, rpage->data + read, commit - read); - /* consume what was read */ - cpu_buffer->reader_page->read = commit; + if (len > (commit - read)) + len = (commit - read); + + size = rb_event_length(event); + + if (len < size) + goto out; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + pos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + pos = reader->read; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); /* update bpage */ - local_set(&bpage->commit, commit); - if (!read) - bpage->time_stamp = rpage->time_stamp; + local_set(&bpage->commit, pos); + bpage->time_stamp = rpage->time_stamp; + } else { /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - local_set(&cpu_buffer->reader_page->write, 0); - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + reader->read = 0; *data_page = bpage; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage, read); } ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-59-g8ed1b From e3d6bf0a0781a269f34250fd41e0d3dbfe540cf1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 13:53:07 -0500 Subject: ring-buffer: replace sizeof of event header with offsetof Impact: fix to possible alignment problems on some archs. Some arch compilers include an NULL char array in the sizeof field. Since the ring_buffer_event type includes one of these, it is better to use the "offsetof" instead, to avoid strange bugs on these archs. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2ad6bae95a3d..27cf834d8b4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -132,7 +132,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); -#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 -- cgit v1.2.3-59-g8ed1b From 474d32b68d6d842f3e710e9ae9fe2568c53339f8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 19:51:40 -0500 Subject: ring-buffer: make ring_buffer_read_page read from start on partial page Impact: dont leave holes in read buffer page The ring_buffer_read_page swaps a given page with the reader page of the ring buffer, if certain conditions are set: 1) requested length is big enough to hold entire page data 2) a writer is not currently on the page 3) the page is not partially consumed. Instead of swapping with the supplied page. It copies the data to the supplied page instead. But currently the data is copied in the same offset as the source page. 
This causes a hole at the start of the reader page. This complicates the use of this function. Instead, it should copy the data at the beginning of the function and update the index fields accordingly. Other small clean ups are also done in this patch. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 27cf834d8b4e..f2a163db52f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -61,6 +61,8 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + /** * tracing_on - enable all tracing buffers * @@ -234,9 +236,16 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit); + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; } /* @@ -259,7 +268,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) /* * head_page == tail_page && head == tail then buffer is empty. @@ -2454,6 +2463,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int read; int ret = -1; + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + return -1; + + len -= BUF_PAGE_HDR_SIZE; + if (!data_page) return -1; @@ -2473,15 +2491,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, commit = rb_page_commit(reader); /* - * If len > what's left on the page, and the writer is also off of - * the read page, then simply switch the read page with the given - * page. Otherwise we need to copy the data from the reader to the - * writer. + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. 
*/ - if ((len < (commit - read)) || + if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; - unsigned int pos = read; + unsigned int rpos = read; + unsigned int pos = 0; unsigned int size; if (full) @@ -2497,12 +2517,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { - memcpy(bpage->data + pos, rpage->data + pos, size); + memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); - pos = reader->read; + rpos = reader->read; + pos += size; event = rb_reader_event(cpu_buffer); size = rb_event_length(event); @@ -2512,6 +2533,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&bpage->commit, pos); bpage->time_stamp = rpage->time_stamp; + /* we copied everything to the beginning */ + read = 0; } else { /* swap the pages */ rb_init_page(bpage); -- cgit v1.2.3-59-g8ed1b From 2cadf9135eb3b6d84b6427314be827ddd443c308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Dec 2008 22:20:19 -0500 Subject: tracing: add binary buffer files for use with splice Impact: new feature This patch creates a directory of files that correspond to the per CPU ring buffers. These are binary files and are made to be used with splice. This is the fastest way to extract data from the ftrace ring buffers. Thanks to Jiaying Zhang for pushing me to get this code fixed, and to Eduard - Gabriel Munteanu for his splice code that helped me debug my code. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 1 + 2 files changed, 268 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea055aa21cd9..12539f72f4a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -11,31 +11,30 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include -#include -#include -#include - -#include -#include -#include #include "trace.h" #include "trace_output.h" @@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +struct ftrace_buffer_info { + struct trace_array *tr; + void *spare; + int cpu; + unsigned int read; +}; + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + int cpu = (int)(long)inode->i_private; + struct ftrace_buffer_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = cpu; + info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + if (!info->spare) + goto out; + + filp->private_data = info; + + return 0; + + out: + kfree(info); + return -ENOMEM; +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + unsigned int pos; + ssize_t ret; + size_t size; + + /* Do we have previous read data to read? 
*/ + if (info->read < PAGE_SIZE) + goto read; + + info->read = 0; + + ret = ring_buffer_read_page(info->tr->buffer, + &info->spare, + count, + info->cpu, 0); + if (ret < 0) + return 0; + + pos = ring_buffer_page_len(info->spare); + + if (pos < PAGE_SIZE) + memset(info->spare + pos, 0, PAGE_SIZE - pos); + +read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret) + return -EFAULT; + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + + ring_buffer_free_read_page(info->tr->buffer, info->spare); + kfree(info); + + return 0; +} + +struct buffer_ref { + struct ring_buffer *buffer; + void *page; + int ref; +}; + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + buf->private = 0; +} + +static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + ref->ref++; +} + +/* Pipe buffer operations for a buffer. */ +static struct pipe_buf_operations buffer_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = buffer_pipe_buf_release, + .steal = buffer_pipe_buf_steal, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int size, i; + size_t ret; + + /* + * We can't seek on a buffer input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + + for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + break; + + ref->buffer = info->tr->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer); + if (!ref->page) { + kfree(ref); + break; + } + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, info->cpu, 0); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, + ref->page); + kfree(ref); + break; + } + + /* + * zero out any left over data, this is going to + * user land. 
+ */ + size = ring_buffer_page_len(ref->page); + if (size < PAGE_SIZE) + memset(ref->page + size, 0, PAGE_SIZE - size); + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + } + + spd.nr_pages = i; + + /* did we read anything? */ + if (!spd.nr_pages) { + if (flags & SPLICE_F_NONBLOCK) + ret = -EAGAIN; + else + ret = 0; + /* TODO: block */ + return ret; + } + + ret = splice_to_pipe(pipe, &spd); + + return ret; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void) static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; + struct dentry *buffers; struct dentry *entry; int cpu; @@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_marker' entry\n"); + buffers = debugfs_create_dir("binary_buffers", d_tracer); + + if (!buffers) + pr_warning("Could not create buffers directory\n"); + else { + int cpu; + char buf[64]; + + for_each_tracing_cpu(cpu) { + sprintf(buf, "%d", cpu); + + entry = debugfs_create_file(buf, 0444, buffers, + (void *)(long)cpu, + &tracing_buffers_fops); + if (!entry) + pr_warning("Could not create debugfs buffers " + "'%s' entry\n", buf); + } + } + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e606633fb498..561bb5c5d988 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -217,6 +217,7 @@ enum trace_flag_type { */ struct trace_array_cpu { atomic_t disabled; + void *buffer_page; /* ring buffer spare */ /* these fields get copied into max-trace: */ unsigned long trace_idx; -- cgit v1.2.3-59-g8ed1b From 1c21f14ec48a2256fb03682b24dddd23eacdc96f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 13:51:13 +0100 Subject: lockdep: fix incorrect state name In the recent mark_lock_irq() rework a bug snuck in that would report the state of write locks causing irq inversion under a read lock as a read lock. Fix this by masking the read bit of the state when validating write dependencies. Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra LKML-Reference: <1236172646.5330.7450.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 022d2ed7fd8b..ef6584fd9fe5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2015,7 +2015,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, enum lock_usage_bit bit, const char *name); static int -mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) +mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); int read = new_bit & 1; @@ -2043,7 +2044,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) * states. 
*/ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit))) + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) return 0; /* -- cgit v1.2.3-59-g8ed1b From 26575e28df5eb2050c02369843faba38cecb4d8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 14:53:24 +0100 Subject: lockdep: remove extra "irq" string Impact: clarify lockdep printk text print_irq_inversion_bug() gets handed state strings of the form "HARDIRQ", "SOFTIRQ", "RECLAIM_FS" and appends "-irq-{un,}safe" to them, which is either redudant for *IRQ or confusing in the RECLAIM_FS case. Signed-off-by: Peter Zijlstra LKML-Reference: <1236175192.5330.7585.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ef6584fd9fe5..02014f7ccc86 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1900,9 +1900,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); -- cgit v1.2.3-59-g8ed1b From efed792d6738964f399a508ef9e831cd60fa4657 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 12:32:55 +0100 Subject: tracing: add lockdep tracepoints for lock acquire/release Augment the traces with lock names when lockdep is available: 1) | down_read_trylock() { 1) | _spin_lock_irqsave() { 1) | /* lock_acquire: &sem->wait_lock */ 1) 4.201 us | } 1) | _spin_unlock_irqrestore() { 1) | /* lock_release: &sem->wait_lock */ 1) 3.523 us | } 1) | /* lock_acquire: try read &mm->mmap_sem */ 1) + 13.386 us | } 1) 1.635 us | find_vma(); 1) | handle_mm_fault() { 1) | __do_fault() { 1) | filemap_fault() { 1) | find_lock_page() { 1) | find_get_page() { 1) | /* lock_acquire: read rcu_read_lock */ 1) | /* lock_release: rcu_read_lock */ 1) 5.697 us | } 1) 8.158 us | } 1) + 11.079 us | } 1) | _spin_lock() { 1) | /* lock_acquire: __pte_lockptr(page) */ 1) 3.949 us | } 1) 1.460 us | page_add_file_rmap(); 1) | _spin_unlock() { 1) | /* lock_release: __pte_lockptr(page) */ 1) 3.115 us | } 1) | unlock_page() { 1) 1.421 us | page_waitqueue(); 1) 1.220 us | __wake_up_bit(); 1) 6.519 us | } 1) + 34.328 us | } 1) + 37.452 us | } 1) | up_read() { 1) | /* lock_release: &mm->mmap_sem */ 1) | _spin_lock_irqsave() { 1) | /* lock_acquire: &sem->wait_lock */ 1) 3.865 us | } 1) | _spin_unlock_irqrestore() { 1) | /* lock_release: &sem->wait_lock */ 1) 8.562 us | } 1) + 17.370 us | } Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?T=F6r=F6k?= Edwin Cc: Jason Baron Cc: Frederic Weisbecker LKML-Reference: <1236166375.5330.7209.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/lockdep.h | 9 ++++++++ include/trace/lockdep_event_types.h | 44 +++++++++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + kernel/lockdep.c | 17 ++++++++++++++ kernel/trace/trace.c | 14 +++++++----- kernel/trace/trace_events_stage_3.h | 4 ++-- 7 files changed, 82 insertions(+), 8 
deletions(-) create mode 100644 include/trace/lockdep.h create mode 100644 include/trace/lockdep_event_types.h (limited to 'kernel') diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h new file mode 100644 index 000000000000..5ca67df87f2a --- /dev/null +++ b/include/trace/lockdep.h @@ -0,0 +1,9 @@ +#ifndef _TRACE_LOCKDEP_H +#define _TRACE_LOCKDEP_H + +#include +#include + +#include + +#endif diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h new file mode 100644 index 000000000000..f713d74a82b4 --- /dev/null +++ b/include/trace/lockdep_event_types.h @@ -0,0 +1,44 @@ + +#ifndef TRACE_EVENT_FORMAT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TPPROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TPARGS(lock, subclass, trylock, read, check, next_lock, ip), + TPFMT("%s%s%s", trylock ? "try " : "", + read ? "read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TPARGS(lock, nested, ip), + TPFMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +TRACE_FORMAT(lock_acquired, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +#endif +#endif + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33c8ed5ccb6c..df56f5694be6 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index ea2ef2051762..fd13750ca4ba 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 02014f7ccc86..cb70c1db85d0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -2913,6 +2914,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +DEFINE_TRACE(lock_acquire); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2923,6 +2926,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -2937,11 +2942,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); +DEFINE_TRACE(lock_release); + void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + trace_lock_release(lock, nested, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -3090,10 +3099,14 @@ found_it: lock->ip = ip; } +DEFINE_TRACE(lock_contended); + void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_contended(lock, ip); + if (unlikely(!lock_stat)) return; @@ -3109,10 +3122,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); +DEFINE_TRACE(lock_acquired); + 
void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_acquired(lock, ip); + if (unlikely(!lock_stat)) return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 12539f72f4a5..c8abbb0c8397 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -623,7 +623,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static DEFINE_SPINLOCK(trace_cmdline_lock); +static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -735,7 +735,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!spin_trylock(&trace_cmdline_lock)) + if (!__raw_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -753,7 +753,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - spin_unlock(&trace_cmdline_lock); + __raw_spin_unlock(&trace_cmdline_lock); } char *trace_find_cmdline(int pid) @@ -3751,7 +3751,7 @@ static __init int tracer_init_debugfs(void) int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3773,7 +3773,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out; pause_graph_tracing(); - spin_lock_irqsave(&trace_buf_lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3792,7 +3793,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: preempt_enable_notrace(); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 041789ffbac1..2c8d76c7dbed 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,7 +5,7 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, "() " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) @@ -112,7 +112,7 @@ #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ - event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ + event_trace_printk(_RET_IP_, #call ": " fmt); \ } \ \ static int ftrace_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From e543ad76914abec1acf6631604a4154cd7a2ca6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 18:20:36 -0500 Subject: tracing: add cpu_file intialization for ftrace_dump Impact: fix to ftrace_dump output corruption The commit: b04cc6b1f6398b0e0b60d37e27ce51b4899672ec tracing/core: introduce per cpu tracing files added a new field to the iterator called cpu_file. This was a handle to differentiate between the per cpu trace output files and the all cpu "trace" file. The all cpu "trace" file required setting this to TRACE_PIPE_ALL_CPU. 
The problem is that the ftrace_dump sets up its own iterator but was not updated to handle this change. The result was only CPU 0 printing out on crash and a lot of "<0>"'s also being printed. Reported-by: Thomas Gleixner Tested-by: Darren Hart Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c8abbb0c8397..ab5cbcae43a1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3918,8 +3918,10 @@ void ftrace_dump(void) printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; + iter.cpu_file = TRACE_PIPE_ALL_CPU; /* * We need to stop all tracing on all CPUS to read the -- cgit v1.2.3-59-g8ed1b From 4f3640f8a358f2183a8c966f299eeb55ca523e06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 23:52:42 -0500 Subject: ring-buffer: fix timestamp in partial ring_buffer_page_read If a partial ring_buffer_page_read happens, then some of the incremental timestamps may be lost. This patch writes the recent timestamp into the page that is passed back to the caller. A partial ring_buffer_page_read is where the full page would not be written back to the user, and instead, just part of the page is copied to the user. A full page would be a page swap with the ring buffer and the timestamps would be correct. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f2a163db52f9..f7473645b9c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2461,6 +2461,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned long flags; unsigned int commit; unsigned int read; + u64 save_timestamp; int ret = -1; /* @@ -2515,6 +2516,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len < size) goto out; + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + /* Need to copy one event at a time */ do { memcpy(bpage->data + pos, rpage->data + rpos, size); @@ -2531,7 +2535,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* update bpage */ local_set(&bpage->commit, pos); - bpage->time_stamp = rpage->time_stamp; + bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; -- cgit v1.2.3-59-g8ed1b From 2dc5d12b1f43134e9bc5037f69f4739cfdfab93e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 19:10:05 -0500 Subject: tracing: do not return EFAULT if read copied anything Impact: fix trace read to conform to standards Andrew Morton, Theodore Tso and H. Peter Anvin brought to my attention that a userspace read should not return -EFAULT if it succeeded in copying anything. It should only return -EFAULT if it failed to copy at all. This patch modifies the check of copy_from_user and updates the return code appropriately. I also used H. Peter Anvin's short cut rule to just test ret == count. 
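The rule, sketched in isolation with a hypothetical helper (not code from the patch, and assuming <linux/uaccess.h>): copy_to_user() returns the number of bytes it could NOT copy, so only a completely failed copy should become -EFAULT, while a partial copy should report how much was actually transferred.

    static ssize_t example_read_chunk(char __user *ubuf, const char *kbuf, size_t cnt)
    {
            unsigned long ret;

            if (!cnt)
                    return 0;

            /* copy_to_user() returns the number of bytes that were NOT copied */
            ret = copy_to_user(ubuf, kbuf, cnt);
            if (ret == cnt)
                    return -EFAULT;         /* nothing was copied at all */

            return cnt - ret;               /* partial success: report bytes copied */
    }
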
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab5cbcae43a1..57155dc53530 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -346,6 +346,9 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) int len; int ret; + if (!cnt) + return 0; + if (s->len <= s->readpos) return -EBUSY; @@ -353,9 +356,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) if (cnt > len) cnt = len; ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret) + if (ret == cnt) return -EFAULT; + cnt -= ret; + s->readpos += len; return cnt; } @@ -3049,6 +3054,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, ssize_t ret; size_t size; + if (!count) + return 0; + /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; @@ -3073,8 +3081,10 @@ read: size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret) + if (ret == size) return -EFAULT; + size -= ret; + *ppos += size; info->read += size; -- cgit v1.2.3-59-g8ed1b From e74da5235cec6cb71eb338c987f876ecc793138b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 20:31:11 -0500 Subject: tracing: fix seq read from trace files The buffer used by trace_seq was updated incorrectly. Instead of consuming what was actually read, it consumed the rest of the buffer on reads. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57155dc53530..2e53e6f09440 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -361,7 +361,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) cnt -= ret; - s->readpos += len; + s->readpos += cnt; return cnt; } @@ -380,7 +380,7 @@ ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) if (!ret) return -EFAULT; - s->readpos += len; + s->readpos += cnt; return cnt; } -- cgit v1.2.3-59-g8ed1b From c032ef64d680717e4e8ce3da65da6419a35f8a2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 20:34:24 -0500 Subject: tracing: add latency output format option With the removal of the latency_trace file, we lost the ability to see some of the finer details in a trace. Like the state of interrupts enabled, the preempt count, need resched, and if we are in an interrupt handler, softirq handler or not. This patch simply creates an option to bring back the old format. This also removes the warning about an unused variable that held the latency_trace file operations. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++---------------------- kernel/trace/trace.h | 3 ++- 2 files changed, 4 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e53e6f09440..55fcbb567950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -299,6 +299,7 @@ static const char *trace_options[] = { "sym-userobj", "printk-msg-only", "context-info", + "latency-format", NULL }; @@ -1829,26 +1830,12 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file); if (IS_ERR(iter)) ret = PTR_ERR(iter); - - return ret; -} - -static int tracing_lt_open(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter; - int ret = 0; - - iter = __tracing_open(inode, file); - - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else + else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; return ret; } - static void * t_next(struct seq_file *m, void *v, loff_t *pos) { @@ -1927,13 +1914,6 @@ static struct file_operations tracing_fops = { .release = tracing_release, }; -static struct file_operations tracing_lt_fops = { - .open = tracing_lt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, -}; - static struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 561bb5c5d988..12cd119cca32 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -651,7 +651,8 @@ enum trace_iterator_flags { TRACE_ITER_USERSTACKTRACE = 0x4000, TRACE_ITER_SYM_USEROBJ = 0x8000, TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000 /* Print pid/cpu/time */ + TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x40000, }; /* -- cgit v1.2.3-59-g8ed1b From 5fd73f862468280d4cbb5ba4321502f911f9f89a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 21:42:04 -0500 Subject: tracing: remove extra latency_trace method from trace structure Impact: clean up The trace and latency_trace function pointers are identical for every tracer but the function tracer. The differences in the function tracer are trivial (latency output puts paranthesis around parent). This patch removes the latency_trace pointer and all prints will now just use the trace output function pointer. 
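After this change a tracer plugin only needs to supply the single text callback; register_ftrace_event() fills in nop defaults for any callback left NULL. A minimal registration might look like the sketch below (TRACE_EXAMPLE and example_print are hypothetical names, not from this patch):

    static struct trace_event example_trace_event = {
            .type   = TRACE_EXAMPLE,        /* hypothetical event type id */
            .trace  = example_print,        /* one callback now handles both output styles */
            /* .raw, .hex, .binary left NULL: defaults are filled in at registration */
    };

    static int __init example_event_init(void)
    {
            /* register_ftrace_event() returns the type id, or 0 on failure */
            return register_ftrace_event(&example_trace_event) ? 0 : -ENODEV;
    }
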
Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 1 - kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 1 - kernel/trace/trace_output.c | 32 -------------------------------- kernel/trace/trace_output.h | 1 - 5 files changed, 1 insertion(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e82cb9e930cc..e39679a72a3b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1231,7 +1231,6 @@ static struct tracer blk_tracer __read_mostly = { static struct trace_event trace_blk_event = { .type = TRACE_BLK, .trace = blk_trace_event_print, - .latency_trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55fcbb567950..21b89ecb8480 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1485,7 +1485,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) } if (event) - return event->latency_trace(iter, sym_flags); + return event->trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c2e68d440c4d..aaa0755268b9 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -159,7 +159,6 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, - .latency_trace = trace_branch_print, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9fc815031b09..306fef84c503 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -437,8 +437,6 @@ int register_ftrace_event(struct trace_event *event) if (event->trace == NULL) event->trace = trace_nop_print; - if (event->latency_trace == NULL) - event->latency_trace = trace_nop_print; if (event->raw == NULL) event->raw = trace_nop_print; if (event->hex == NULL) @@ -480,29 +478,6 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) } /* TRACE_FN */ -static enum print_line_t trace_fn_latency(struct trace_iterator *iter, - int flags) -{ - struct ftrace_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - if (!trace_seq_puts(s, " (")) - goto partial; - if (!seq_print_ip_sym(s, field->parent_ip, flags)) - goto partial; - if (!trace_seq_puts(s, ")\n")) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; @@ -573,7 +548,6 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) static struct trace_event trace_fn_event = { .type = TRACE_FN, .trace = trace_fn_trace, - .latency_trace = trace_fn_latency, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, @@ -705,7 +679,6 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, static struct trace_event trace_ctx_event = { .type = TRACE_CTX, .trace = trace_ctx_print, - .latency_trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, @@ -714,7 +687,6 @@ static struct trace_event trace_ctx_event = { static struct trace_event trace_wake_event = { .type = TRACE_WAKE, .trace = trace_wake_print, - .latency_trace = 
trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, @@ -770,7 +742,6 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, static struct trace_event trace_special_event = { .type = TRACE_SPECIAL, .trace = trace_special_print, - .latency_trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -808,7 +779,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event trace_stack_event = { .type = TRACE_STACK, .trace = trace_stack_print, - .latency_trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -838,7 +808,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event trace_user_stack_event = { .type = TRACE_USER_STACK, .trace = trace_user_stack_print, - .latency_trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -883,7 +852,6 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { .type = TRACE_PRINT, .trace = trace_print_print, - .latency_trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 551a25a72217..8a34d688ed63 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -10,7 +10,6 @@ struct trace_event { struct hlist_node node; int type; trace_print_func trace; - trace_print_func latency_trace; trace_print_func raw; trace_print_func hex; trace_print_func binary; -- cgit v1.2.3-59-g8ed1b From 27d48be84477d2f0a2e2ac3738a3971dece631d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 21:57:29 -0500 Subject: tracing: consolidate print_lat_fmt and print_trace_fmt Impact: clean up Both print_lat_fmt and print_trace_fmt do pretty much the same thing except for one different function call. This patch consolidates the two functions and adds an if statement to perform the difference. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 39 +++++++-------------------------------- 1 file changed, 7 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21b89ecb8480..d1ef43999d9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1468,33 +1468,6 @@ static void test_cpu_buff_start(struct trace_iterator *iter) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } -static enum print_line_t print_lat_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_event *event; - struct trace_entry *entry = iter->ent; - - test_cpu_buff_start(iter); - - event = ftrace_find_event(entry->type); - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_print_lat_context(iter)) - goto partial; - } - - if (event) - return event->trace(iter, sym_flags); - - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1509,8 +1482,13 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_print_context(iter)) - goto partial; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (!trace_print_lat_context(iter)) + goto partial; + } else { + if (!trace_print_context(iter)) + goto partial; + } } if (event) @@ -1652,9 +1630,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter); - return print_trace_fmt(iter); } -- cgit v1.2.3-59-g8ed1b From e9d25fe6eaa2c720bb3ea661b660e58d54fa38bf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 22:15:30 -0500 Subject: tracing: have latency tracers set the latency format The latency tracers (irqsoff, preemptoff, preemptirqsoff, and wakeup) are pretty useless with the default output format. This patch makes them automatically enable the latency format when they are selected. They also record the state of the latency option, and if it was not enabled when selected, they disable it on reset. 
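The save/restore idiom both tracers use, shown once in isolation (hypothetical function names): remember whether the user already had the flag set, force it on while the tracer is active, and clear it on reset only if the tracer was the one that set it.

    static int save_lat_flag;

    static void example_tracer_start_fmt(void)
    {
            /* remember whether the user already had latency-format enabled */
            save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
            trace_flags |= TRACE_ITER_LATENCY_FMT;
    }

    static void example_tracer_reset_fmt(void)
    {
            /* only clear the flag if we were the ones who turned it on */
            if (!save_lat_flag)
                    trace_flags &= ~TRACE_ITER_LATENCY_FMT;
    }
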
Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 8 ++++++++ kernel/trace/trace_sched_wakeup.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9e5ebd844158..b923d13e2fad 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,6 +32,8 @@ enum { static int trace_type __read_mostly; +static int save_lat_flag; + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -370,6 +372,9 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; + trace_flags |= TRACE_ITER_LATENCY_FMT; + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ @@ -380,6 +385,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { stop_irqsoff_tracer(tr); + + if (!save_lat_flag) + trace_flags &= ~TRACE_ITER_LATENCY_FMT; } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index db55f7aaa640..3c5ad6b2ec84 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -32,6 +32,8 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); +static int save_lat_flag; + #ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: @@ -324,6 +326,9 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { + save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; + trace_flags |= TRACE_ITER_LATENCY_FMT; + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); @@ -347,6 +352,9 @@ static void wakeup_tracer_reset(struct trace_array *tr) stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); + + if (!save_lat_flag) + trace_flags &= ~TRACE_ITER_LATENCY_FMT; } static void wakeup_tracer_start(struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:24:48 +0100 Subject: tracing: rename ftrace_printk() => trace_printk() Impact: cleanup Use a more generic name - this also allows the prototype to move to kernel.h and be generally available to kernel developers who want to do some quick tracing. Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 6 +++--- include/linux/ftrace.h | 18 +++++++++--------- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 2041ee951c1a..22614bef6359 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1466,11 +1466,11 @@ want, depending on your needs. 
You can put some comments on specific functions by using -ftrace_printk() For example, if you want to put a comment inside +trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include - and call ftrace_printk() inside __might_sleep() + and call trace_printk() inside __might_sleep() -ftrace_printk("I'm a comment!\n") +trace_printk("I'm a comment!\n") will produce: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..fbb9c364e166 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -329,11 +329,11 @@ extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); /** - * ftrace_printk - printf formatting in the ftrace buffer + * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * where problems are occurring. * * This is intended as a debugging tool for the developer only. - * Please refrain from leaving ftrace_printks scattered around in + * Please refrain from leaving trace_printks scattered around in * your code. */ -# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) extern int -__ftrace_printk(unsigned long ip, const char *fmt, ...) +__trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); @@ -356,13 +356,13 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } static inline int -ftrace_printk(const char *fmt, ...) +trace_printk(const char *fmt, ...) { return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1ef43999d9e..c0e9c1263393 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -48,7 +48,7 @@ unsigned long __read_mostly tracing_thresh; * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as ftrace_printk could occurred + * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. 
*/ static bool __read_mostly tracing_selftest_running; @@ -291,7 +291,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", - "ftrace_printk", + "trace_printk", "ftrace_preempt", "branch", "annotate", @@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int __trace_printk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__ftrace_printk); +EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 12cd119cca32..8beff03fda68 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -115,7 +115,7 @@ struct userstack_entry { }; /* - * ftrace_printk entry: + * trace_printk entry: */ struct print_entry { struct trace_entry ent; -- cgit v1.2.3-59-g8ed1b From 03d78913f01e8f6599823f00357ed17b32747d3d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Mar 2009 02:29:05 -0800 Subject: lockdep: remove duplicate CONFIG_DEBUG_LOCKDEP definitions Impact: cleanup The atomic debug modifiers are already defined in kernel/lockdep_internals.h. Signed-off-by: David Rientjes Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 02014f7ccc86..9a1e2bcc4b8d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks; atomic_t nr_find_usage_forwards_recursions; atomic_t nr_find_usage_backwards_checks; atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) -#else -# define debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_dec(ptr) do { } while (0) -# define debug_atomic_read(ptr) 0 #endif /* -- cgit v1.2.3-59-g8ed1b From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:49:22 +0100 Subject: tracing/function-graph-tracer: use the more lightweight local clock Impact: decrease hangs risks with the graph tracer on slow systems Since the function graph tracer can spend too much time on timer interrupts, it's better now to use the more lightweight local clock. Anyway, the function graph traces are more reliable on a per cpu trace. 
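A rough sketch of why the local clock is sufficient here (illustrative code, not part of the patch; example_measure is a hypothetical helper): both timestamps for one function are taken on the same CPU, so only their difference matters and cross-CPU monotonicity is not required.

    static u64 example_measure(void (*fn)(void))
    {
            u64 calltime, rettime;

            calltime = trace_clock_local();  /* entry timestamp, taken on this CPU */
            fn();                            /* the traced function runs */
            rettime = trace_clock_local();   /* return timestamp, same CPU */

            return rettime - calltime;       /* duration is only compared per cpu */
    }
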
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 13 +++++++------ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3925ec0184b1..a85da1764b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = trace_clock_local(); if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..6ea62acbe4b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,15 +1,16 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#include -#include -#include -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include +#include #include diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c009553a8e81..e527f2f66c73 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void) unsigned long ret; ftrace_pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); if (unlikely(!ret)) { -- cgit v1.2.3-59-g8ed1b From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 21:19:55 +0100 Subject: tracing: clean up menu Clean up menu structure, introduce TRACING_SUPPORT switch that signals whether an architecture supports various instrumentation mechanisms. 
Signed-off-by: Ingo Molnar --- arch/Kconfig | 1 + kernel/trace/Kconfig | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 550dab22daa1..a092dc77c24d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE + depends on TRACING_SUPPORT select TRACING select RING_BUFFER help diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 999c6a2485df..5d733da5345a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,12 +53,22 @@ config TRACING select TRACEPOINTS select NOP_TRACER +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + depends on TRACE_IRQFLAGS_SUPPORT + depends on STACKTRACE_SUPPORT + +if TRACING_SUPPORT + menu "Tracers" config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FRAME_POINTER select KALLSYMS select TRACING @@ -91,7 +101,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -114,7 +123,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -142,7 +150,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -152,7 +159,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help @@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER config EVENT_TRACER bool "Trace various events in the kernel" - depends on DEBUG_KERNEL select TRACING help This tracer hooks to various trace points in the kernel @@ -170,7 +175,6 @@ config EVENT_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER help @@ -188,7 +192,6 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" - depends on DEBUG_KERNEL select TRACING help This tracer profiles all the the likely and unlikely macros @@ -241,7 +244,6 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" - depends on DEBUG_KERNEL depends on X86 select TRACING help @@ -253,7 +255,6 @@ config POWER_TRACER config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FUNCTION_TRACER select STACKTRACE select KALLSYMS @@ -343,7 +344,6 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE - depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -369,7 +369,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL + depends on TRACING select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. 
On bootup @@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" - depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + depends on HAVE_MMIOTRACE_SUPPORT && PCI select TRACING help Mmiotrace traces Memory Mapped I/O access and is meant for @@ -401,3 +401,6 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. endmenu + +endif # TRACING_SUPPORT + -- cgit v1.2.3-59-g8ed1b From 5e2336a0d47c9661a40cc5ef85135ce1406af6e8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 21:44:55 -0500 Subject: tracing: make all file_operations const Impact: cleanup All file_operations structures should be constant. No one is going to change them. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 24 ++++++++++++------------ kernel/trace/trace_sysprof.c | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5a3a06b21eee..d7a06a0d9447 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1869,21 +1869,21 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } -static struct file_operations ftrace_avail_fops = { +static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, .release = ftrace_avail_release, }; -static struct file_operations ftrace_failures_fops = { +static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, .release = ftrace_avail_release, }; -static struct file_operations ftrace_filter_fops = { +static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = ftrace_regex_read, .write = ftrace_filter_write, @@ -1891,7 +1891,7 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; -static struct file_operations ftrace_notrace_fops = { +static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = ftrace_regex_read, .write = ftrace_notrace_write, @@ -2423,7 +2423,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations ftrace_pid_fops = { +static const struct file_operations ftrace_pid_fops = { .read = ftrace_pid_read, .write = ftrace_pid_write, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f7473645b9c6..178858492a89 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2606,7 +2606,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations rb_simple_fops = { +static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0e9c1263393..e6144acf2b75 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1882,14 +1882,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations tracing_fops = { +static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_release, }; -static struct file_operations show_traces_fops = { +static const struct file_operations show_traces_fops = { .open = 
show_traces_open, .read = seq_read, .release = seq_release, @@ -1982,7 +1982,7 @@ err_unlock: return err; } -static struct file_operations tracing_cpumask_fops = { +static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, @@ -2134,7 +2134,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations tracing_iter_fops = { +static const struct file_operations tracing_iter_fops = { .open = tracing_open_generic, .read = tracing_trace_options_read, .write = tracing_trace_options_write, @@ -2167,7 +2167,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, readme_msg, strlen(readme_msg)); } -static struct file_operations tracing_readme_fops = { +static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, }; @@ -2927,25 +2927,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations tracing_max_lat_fops = { +static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, }; -static struct file_operations tracing_ctrl_fops = { +static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, }; -static struct file_operations set_tracer_fops = { +static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, }; -static struct file_operations tracing_pipe_fops = { +static const struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, @@ -2953,13 +2953,13 @@ static struct file_operations tracing_pipe_fops = { .release = tracing_release_pipe, }; -static struct file_operations tracing_entries_fops = { +static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, }; -static struct file_operations tracing_mark_fops = { +static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, }; @@ -3240,7 +3240,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, return r; } -static struct file_operations tracing_dyn_info_fops = { +static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, }; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index c771af4e8f1a..91fd19c2149f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -314,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations sysprof_sample_fops = { +static const struct file_operations sysprof_sample_fops = { .read = sysprof_sample_read, .write = sysprof_sample_write, }; -- cgit v1.2.3-59-g8ed1b From 33b0c229e3abeae00493ed1d6f0b07191977a0a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 11:45:43 -0500 Subject: tracing: move print of event format to separate file Impact: clean up Move the macro that creates the event format file to a separate header. This will allow the default ftrace events to use this same macro to create the formats to read those events. 
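For reference, a sketch of roughly what one TRACE_FIELD() line expands to; the generated ftrace_format_<call>() function is just a sequence of such trace_seq_printf() calls. The struct and format string below are hypothetical, chosen only to illustrate the offsetof()/sizeof() bookkeeping:

    static int example_format(struct trace_seq *s)
    {
            struct ftrace_raw_example {              /* hypothetical event struct */
                    int     pid;
            } field;
            int ret;

            /* roughly what one TRACE_FIELD(int, pid, pid) line expands to */
            ret = trace_seq_printf(s, "\tfield:int pid;\toffset:%lu;\tsize:%lu;\n",
                                   offsetof(typeof(field), pid),
                                   sizeof(field.pid));
            if (!ret)
                    return 0;

            return trace_seq_printf(s, "\nprint fmt: \"%s\"\n", "%d");
    }
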
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 53 +---------------------------------- kernel/trace/trace_format.h | 55 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 52 deletions(-) create mode 100644 kernel/trace/trace_format.h (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index b1cebba1d9b4..d24a97e74aea 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -75,56 +75,5 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - +#include "trace_format.h" #include diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h new file mode 100644 index 000000000000..53a6b1357116 --- /dev/null +++ b/kernel/trace/trace_format.h @@ -0,0 +1,55 @@ +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) 
args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + -- cgit v1.2.3-59-g8ed1b From 770cb24345c0f6e0d47bd2b94aa6d67bea6f8b54 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 21:35:29 -0500 Subject: tracing: add format files for ftrace default entries Impact: allow user apps to read binary format of basic ftrace entries Currently, only defined raw events export their formats so a binary reader can parse them. There's no reason that the default ftrace entries can't export their formats. This patch adds a subsystem called "ftrace" in the events directory that includes the ftrace entries for basic ftrace recorded items. These only have three files in the events directory: type : printf available_types : printf format : format for the event entry For example: # cat /debug/tracing/events/ftrace/wakeup/format name: wakeup ID: 3 format: field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:unsigned int prev_pid; offset:12; size:4; field:unsigned char prev_prio; offset:16; size:1; field:unsigned char prev_state; offset:17; size:1; field:unsigned int next_pid; offset:20; size:4; field:unsigned char next_prio; offset:24; size:1; field:unsigned char next_state; offset:25; size:1; field:unsigned int next_cpu; offset:28; size:4; print fmt: "%u:%u:%u ==+ %u:%u:%u [%03u]" Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 1 + kernel/trace/trace_event_types.h | 165 +++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 12 +-- kernel/trace/trace_export.c | 81 +++++++++++++++++++ kernel/trace/trace_format.h | 2 +- 5 files changed, 255 insertions(+), 6 deletions(-) create mode 100644 kernel/trace/trace_event_types.h create mode 100644 kernel/trace/trace_export.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c931fe0560cb..f44736c7574a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o +obj-$(CONFIG_EVENT_TRACER) += trace_export.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h new file mode 100644 index 000000000000..fb4eba166433 --- /dev/null +++ b/kernel/trace/trace_event_types.h @@ -0,0 +1,165 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace + +/* + * We cheat and use the proto type field as the ID + * and args as the entry type (minus 'struct') + */ +TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, + TRACE_STRUCT( + 
TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned long, parent_ip, parent_ip) + ), + TPRAWFMT(" %lx <-- %lx") +); + +TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, + ftrace_graph_ent_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, graph_ent.func, func) + TRACE_FIELD(int, graph_ent.depth, depth) + ), + TPRAWFMT("--> %lx (%d)") +); + +TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, + ftrace_graph_ret_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(int, ret.depth, depth) + ), + TPRAWFMT("<-- %lx (%d)") +); + +TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, prev_pid, prev_pid) + TRACE_FIELD(unsigned char, prev_prio, prev_prio) + TRACE_FIELD(unsigned char, prev_state, prev_state) + TRACE_FIELD(unsigned int, next_pid, next_pid) + TRACE_FIELD(unsigned char, next_prio, next_prio) + TRACE_FIELD(unsigned char, next_state, next_state) + TRACE_FIELD(unsigned int, next_cpu, next_cpu) + ), + TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") +); + +TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, prev_pid, prev_pid) + TRACE_FIELD(unsigned char, prev_prio, prev_prio) + TRACE_FIELD(unsigned char, prev_state, prev_state) + TRACE_FIELD(unsigned int, next_pid, next_pid) + TRACE_FIELD(unsigned char, next_prio, next_prio) + TRACE_FIELD(unsigned char, next_state, next_state) + TRACE_FIELD(unsigned int, next_cpu, next_cpu) + ), + TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") +); + +TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, arg1, arg1) + TRACE_FIELD(unsigned long, arg2, arg2) + TRACE_FIELD(unsigned long, arg3, arg3) + ), + TPRAWFMT("(%08lx) (%08lx) (%08lx)") +); + +/* + * Stack-trace entry: + */ + +/* #define FTRACE_STACK_ENTRIES 8 */ + +TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, caller[0], stack0) + TRACE_FIELD(unsigned long, caller[1], stack1) + TRACE_FIELD(unsigned long, caller[2], stack2) + TRACE_FIELD(unsigned long, caller[3], stack3) + TRACE_FIELD(unsigned long, caller[4], stack4) + TRACE_FIELD(unsigned long, caller[5], stack5) + TRACE_FIELD(unsigned long, caller[6], stack6) + TRACE_FIELD(unsigned long, caller[7], stack7) + ), + TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") +); + +TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, caller[0], stack0) + TRACE_FIELD(unsigned long, caller[1], stack1) + TRACE_FIELD(unsigned long, caller[2], stack2) + TRACE_FIELD(unsigned long, caller[3], stack3) + TRACE_FIELD(unsigned long, caller[4], stack4) + TRACE_FIELD(unsigned long, caller[5], stack5) + TRACE_FIELD(unsigned long, caller[6], stack6) + TRACE_FIELD(unsigned long, caller[7], stack7) + ), + TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") +); + +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TPRAWFMT("%08lx (%d) %s") +); + +TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, line, line) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) + 
TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD(char, correct, correct) + ), + TPRAWFMT("%u:%s:%s (%u)") +); + +TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(u64, from, from) + TRACE_FIELD(u64, to, to) + ), + TPRAWFMT("from: %llx to: %llx") +); + +TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, + TRACE_STRUCT( + TRACE_FIELD(ktime_t, state_data.stamp, stamp) + TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD(int, state_data.type, type) + TRACE_FIELD(int, state_data.state, state) + ), + TPRAWFMT("%llx->%llx type:%u state:%u") +); + +TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) + TRACE_FIELD(unsigned long, call_site, call_site) + TRACE_FIELD(const void *, ptr, ptr) + TRACE_FIELD(size_t, bytes_req, bytes_req) + TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) + TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) + TRACE_FIELD(int, node, node) + ), + TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + " flags:%x node:%d") +); + +TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) + TRACE_FIELD(unsigned long, call_site, call_site) + TRACE_FIELD(const void *, ptr, ptr) + ), + TPRAWFMT("type:%u call_site:%lx ptr:%p") +); + +#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 210e71ff82db..4488d90e75ef 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -656,11 +656,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); + if (call->regfunc) { + entry = debugfs_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/enable' entry\n", call->name); + } /* Only let type be writable, if we can change it */ entry = debugfs_create_file("type", diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c new file mode 100644 index 000000000000..0fb7be73e31c --- /dev/null +++ b/kernel/trace/trace_export.c @@ -0,0 +1,81 @@ +/* + * trace_export.c - export basic ftrace utilities to user space + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace_output.h" + +#include "trace_format.h" + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) \ + ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + "offset:%lu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ + if (!ret) \ + return 0; + + +#undef TPRAWFMT +#define TPRAWFMT(args...) args + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + +#include "trace_event_types.h" + +#undef TRACE_ZERO_CHAR +#define TRACE_ZERO_CHAR(arg) + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TPCMD +#define TPCMD(cmd...) 
cmd + +#undef TRACE_ENTRY +#define TRACE_ENTRY entry + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + cmd; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +} +#include "trace_event_types.h" diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h index 53a6b1357116..03f9a4c165ca 100644 --- a/kernel/trace/trace_format.h +++ b/kernel/trace/trace_format.h @@ -40,7 +40,7 @@ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ +static int \ ftrace_format_##call(struct trace_seq *s) \ { \ struct ftrace_raw_##call field; \ -- cgit v1.2.3-59-g8ed1b From 422d3c7a577b15e1384c9d4e72a9540896b685fa Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 6 Mar 2009 10:40:53 +0900 Subject: tracing: current tip/master can't enable ftrace After commit 40ada30f9621fbd831ac2437b9a2a399aad34b00, "make menuconfig" doesn't display "Tracer" item. Following modification restores it. --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d733da5345a..058d949a3214 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -61,6 +61,7 @@ config TRACING_SUPPORT bool depends on TRACE_IRQFLAGS_SUPPORT depends on STACKTRACE_SUPPORT + default y if TRACING_SUPPORT -- cgit v1.2.3-59-g8ed1b From 10dd3ebe213c31bff14b4dae3c5d32a76b1fad7c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 6 Mar 2009 15:29:04 +0900 Subject: tracing: fix deadlock when setting set_ftrace_pid Impact: fix deadlock while using set_ftrace_pid Reproducer: # cd /sys/kernel/debug/tracing # echo $$ > set_ftrace_pid then, console becomes hung. Details: when writing set_ftracepid, kernel callstack is following ftrace_pid_write() mutex_lock(&ftrace_lock); ftrace_update_pid_func() mutex_lock(&ftrace_lock); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_lock); then, system always deadlocks when ftrace_pid_write() is called. In past days, ftrace_pid_write() used ftrace_start_lock, but commit e6ea44e9b4c12325337cd1c06103cd515a1c02b2 consolidated ftrace_start_lock to ftrace_lock. 
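[ Editorial note: the hang described above is the ordinary self-deadlock of a
  non-recursive mutex: ftrace_pid_write() already holds ftrace_lock when
  ftrace_update_pid_func() tries to take it again. The userspace sketch below
  is not kernel code; it reproduces the same pattern with an error-checking
  pthread mutex (build with -pthread), so it reports EDEADLK instead of
  hanging, and the function names are only stand-ins for the call chain
  shown above. ]

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t lock;

    /* stands in for ftrace_update_pid_func() */
    static void update_pid_func(void)
    {
            /* second acquisition by the same task */
            int err = pthread_mutex_lock(&lock);

            if (err)
                    printf("inner lock: %s\n", strerror(err));
            else
                    pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            pthread_mutexattr_t attr;

            pthread_mutexattr_init(&attr);
            pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
            pthread_mutex_init(&lock, &attr);

            pthread_mutex_lock(&lock);      /* outer lock, as in ftrace_pid_write() */
            update_pid_func();
            pthread_mutex_unlock(&lock);
            return 0;
    }

  Run as-is it prints "inner lock: Resource deadlock avoided"; with the
  kernel's plain struct mutex the equivalent path simply blocks forever,
  which is why the fix drops the inner lock/unlock pair.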
Signed-off-by: KOSAKI Motohiro Reviewed-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <20090306151155.0778.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d7a06a0d9447..d33d306bdcf4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -218,10 +218,8 @@ static void ftrace_update_pid_func(void) { ftrace_func_t func; - mutex_lock(&ftrace_lock); - if (ftrace_trace_function == ftrace_stub) - goto out; + return; func = ftrace_trace_function; @@ -238,9 +236,6 @@ static void ftrace_update_pid_func(void) #else __ftrace_trace_function = func; #endif - - out: - mutex_unlock(&ftrace_lock); } /* set when tracing only a pid */ -- cgit v1.2.3-59-g8ed1b From 4460fdad85becd569f11501ad5b91814814335ff Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 6 Mar 2009 10:36:38 -0500 Subject: tracing, Text Edit Lock - kprobes architecture independent support Use the mutual exclusion provided by the text edit lock in the kprobes code. It allows coherent manipulation of the kernel code by other subsystems. Changelog: Move the kernel_text_lock/unlock out of the for loops. Use text_mutex directly instead of a function. Remove whitespace modifications. (note : kprobes_mutex is always taken outside of text_mutex) Signed-off-by: Mathieu Desnoyers Acked-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu LKML-Reference: <49B14306.2080202@redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7ba8cd9845cb..479d4d5672f9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } + mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out; + goto out_unlock_text; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, @@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p) if (kprobe_enabled) arch_arm_kprobe(p); +out_unlock_text: + mutex_unlock(&text_mutex); out: mutex_unlock(&kprobe_mutex); @@ -746,8 +750,11 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. 
*/ - if (kprobe_enabled && !kprobe_gone(old_p)) + if (kprobe_enabled && !kprobe_gone(old_p)) { + mutex_lock(&text_mutex); arch_disarm_kprobe(p); + mutex_unlock(&text_mutex); + } hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -1280,12 +1287,14 @@ static void __kprobes enable_all_kprobes(void) if (kprobe_enabled) goto already_enabled; + mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_gone(p)) arch_arm_kprobe(p); } + mutex_unlock(&text_mutex); kprobe_enabled = true; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -1310,6 +1319,7 @@ static void __kprobes disable_all_kprobes(void) kprobe_enabled = false; printk(KERN_INFO "Kprobes globally disabled\n"); + mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { @@ -1318,6 +1328,7 @@ static void __kprobes disable_all_kprobes(void) } } + mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); -- cgit v1.2.3-59-g8ed1b From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:47 +0100 Subject: tracing: infrastructure for supporting binary record Impact: save on memory for tracing Current tracers are typically using a struct(like struct ftrace_entry, struct ctx_switch_entry, struct special_entr etc...)to record a binary event. These structs can only record a their own kind of events. A new kind of tracer need a new struct and a lot of code too handle it. So we need a generic binary record for events. This infrastructure is for this purpose. [fweisbec@gmail.com: rebase against latest -tip, make it safe while sched tracing as reported by Steven Rostedt] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 6 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++ kernel/trace/trace.h | 12 ++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 75 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+) create mode 100644 kernel/trace/trace_bprintk.c (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 498769425eb2..1c9cdca02580 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +#endif /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 058d949a3214..ad8d3617d0a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. 
+config TRACE_BPRINTK + bool "Binary printk for tracing" + default y + depends on TRACING + select BINARY_PRINTF + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f44736c7574a..46557ef4c379 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6144acf2b75..ff53509e19f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +/** + * trace_vbprintk - write binary msg to tracing buffer + * + * Caller must insure @fmt are valid when msg is in tracing buffer. + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprintk_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (tracing_disabled || !trace_bprintk_enable) + return 0; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8beff03fda68..0f5077f8f957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -124,6 +125,16 @@ struct print_entry { char buf[]; }; +struct bprintk_entry { + struct trace_entry ent; + unsigned long ip; + const char *fmt; + u32 buf[]; +}; +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_bprintk_enable; +#endif + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c 
b/kernel/trace/trace_bprintk.c new file mode 100644 index 000000000000..1f8e532c3fb9 --- /dev/null +++ b/kernel/trace/trace_bprintk.c @@ -0,0 +1,87 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* binary printk basic */ +static DEFINE_MUTEX(btrace_mutex); +static int btrace_metadata_count; + +static inline void lock_btrace(void) +{ + mutex_lock(&btrace_mutex); +} + +static inline void unlock_btrace(void) +{ + mutex_unlock(&btrace_mutex); +} + +static void get_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count++; + unlock_btrace(); +} + +static void put_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count--; + unlock_btrace(); +} + +/* events tracer */ +int trace_bprintk_enable; + +static void start_bprintk_trace(struct trace_array *tr) +{ + get_btrace_metadata(); + tracing_reset_online_cpus(tr); + trace_bprintk_enable = 1; +} + +static void stop_bprintk_trace(struct trace_array *tr) +{ + trace_bprintk_enable = 0; + tracing_reset_online_cpus(tr); + put_btrace_metadata(); +} + +static int init_bprintk_trace(struct trace_array *tr) +{ + start_bprintk_trace(tr); + return 0; +} + +static struct tracer bprintk_trace __read_mostly = +{ + .name = "events", + .init = init_bprintk_trace, + .reset = stop_bprintk_trace, + .start = start_bprintk_trace, + .stop = stop_bprintk_trace, +}; + +static __init int init_bprintk(void) +{ + return register_tracer(&bprintk_trace); +} + +device_initcall(init_bprintk); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 306fef84c503..4ab71201862e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
return len; } +static int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + /** * trace_seq_puts - trace sequence printing of simple string * @s: trace sequence descriptor @@ -855,6 +875,60 @@ static struct trace_event trace_print_event = { .raw = trace_print_raw, }; +/* TRACE_BPRINTK */ +static enum print_line_t +trace_bprintk_print(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t +trace_bprintk_raw(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_bprintk_event = { + .type = TRACE_BPRINTK, + .trace = trace_bprintk_print, + .raw = trace_bprintk_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, + &trace_bprintk_event, NULL }; -- cgit v1.2.3-59-g8ed1b From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:48 +0100 Subject: tracing: add trace_bprintk() Impact: add a generic printk() for tracing, like trace_printk() trace_bprintk() uses the infrastructure to record events on ring_buffer. 
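[ Editorial note: a rough kernel-context sketch (names invented, error
  handling trimmed, assumes BINARY_PRINTF is available) of the
  vbin_printf()/bstr_printf() pair this infrastructure is built on: only
  the raw argument values are stored at trace time, and the same format
  string is re-applied when the buffer is read back: ]

    #include <linux/kernel.h>
    #include <linux/string.h>
    #include <linux/types.h>

    static void demo_encode_decode(const char *fmt, ...)
    {
            u32 bin[32];
            char text[128];
            va_list args;
            int len;

            va_start(args, fmt);
            /* encode: returns the number of u32 words used, or < 0 on error */
            len = vbin_printf(bin, ARRAY_SIZE(bin), fmt, args);
            va_end(args);

            if (len < 0 || len > ARRAY_SIZE(bin))
                    return;

            /* decode: expand the saved arguments against the same format */
            bstr_printf(text, sizeof(text), fmt, bin);
            pr_info("%s", text);
    }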
[ fweisbec@gmail.com: ported to latest -tip, made it work if !CONFIG_MODULES, never free the format strings from modules because we can't keep track of them and conditionnaly create the ftrace format strings section (reported by Steven Rostedt) ] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/ftrace.h | 21 ++++++++++ include/linux/module.h | 5 +++ kernel/module.c | 6 +++ kernel/trace/trace.c | 15 +++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++----- 6 files changed, 133 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0add6b28c366..48ade3168b13 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -69,6 +69,14 @@ #define FTRACE_EVENTS() #endif +#ifdef CONFIG_TRACING +#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ + *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#else +#define TRACE_PRINTKS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -100,6 +108,7 @@ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + TRACE_PRINTKS() \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1c9cdca02580..1cc8ca453a9b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -225,6 +225,27 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_TRACE_BPRINTK extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +static inline void ____trace_bprintk_check_format(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} +#define __trace_bprintk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_bprintk_check_format(fmt, ##args); \ +} while (0) + +#define trace_bprintk(fmt, args...) \ +do { \ + static char *__attribute__((section("__trace_bprintk_fmt"))) \ + trace_bprintk_fmt = fmt; \ + __trace_bprintk_check_format(fmt, ##args); \ + __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ +} while (0) +#else +#define trace_bprintk trace_printk #endif /* May be defined in arch */ diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..8cbec972d8e7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,6 +329,11 @@ struct module unsigned int num_tracepoints; #endif +#ifdef CONFIG_TRACE_BPRINTK + const char **trace_bprintk_fmt_start; + unsigned int num_trace_bprintk_fmt; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? 
*/ struct list_head modules_which_use_me; diff --git a/kernel/module.c b/kernel/module.c index 22d7379709da..2dece104f9a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif +#ifdef CONFIG_TRACE_BPRINTK + mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, + "__trace_bprintk_fmt", sizeof(char *), + &mod->num_trace_bprintk_fmt); +#endif + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff53509e19f8..46b3cd7a5752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3848,6 +3848,21 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); +int __trace_bprintk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!fmt) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c index 1f8e532c3fb9..f4c245a5cd33 100644 --- a/kernel/trace/trace_bprintk.c +++ b/kernel/trace/trace_bprintk.c @@ -19,9 +19,21 @@ #include "trace.h" +#ifdef CONFIG_MODULES + /* binary printk basic */ static DEFINE_MUTEX(btrace_mutex); -static int btrace_metadata_count; +/* + * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + static inline void lock_btrace(void) { @@ -33,26 +45,75 @@ static inline void unlock_btrace(void) mutex_unlock(&btrace_mutex); } -static void get_btrace_metadata(void) + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { - lock_btrace(); - btrace_metadata_count++; - unlock_btrace(); + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; } -static void put_btrace_metadata(void) +static +void hold_module_trace_bprintk_format(const char **start, const char **end) { + const char **iter; lock_btrace(); - btrace_metadata_count--; + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } unlock_btrace(); } +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + /* 
events tracer */ int trace_bprintk_enable; static void start_bprintk_trace(struct trace_array *tr) { - get_btrace_metadata(); tracing_reset_online_cpus(tr); trace_bprintk_enable = 1; } @@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr) { trace_bprintk_enable = 0; tracing_reset_online_cpus(tr); - put_btrace_metadata(); } static int init_bprintk_trace(struct trace_array *tr) @@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly = static __init int init_bprintk(void) { - return register_tracer(&bprintk_trace); + int ret = register_module_notifier(&module_trace_bprintk_format_nb); + if (ret) + return ret; + + ret = register_tracer(&bprintk_trace); + if (ret) + unregister_module_notifier(&module_trace_bprintk_format_nb); + return ret; } device_initcall(init_bprintk); -- cgit v1.2.3-59-g8ed1b From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:49 +0100 Subject: tracing/core: drop the old trace_printk() implementation in favour of trace_bprintk() Impact: faster and lighter tracing Now that we have trace_bprintk() which is faster and consume lesser memory than trace_printk() and has the same purpose, we can now drop the old implementation in favour of the binary one from trace_bprintk(), which means we move all the implementation of trace_bprintk() to trace_printk(), so the Api doesn't change except that we must now use trace_seq_bprintk() to print the TRACE_PRINT entries. Some changes result of this: - Previously, trace_bprintk depended of a single tracer and couldn't work without. This tracer has been dropped and the whole implementation of trace_printk() (like the module formats management) is now integrated in the tracing core (comes with CONFIG_TRACING), though we keep the file trace_printk (previously trace_bprintk.c) where we can find the module management. Thus we don't overflow trace.c - changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries. - change a bit trace_printk/trace_vprintk macros to support non-builtin formats constants, and fix 'const' qualifiers warnings. But this is all transparent for developers. - etc... 
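[ Editorial note: a minimal caller-side sketch, assuming a module context;
  the call site is what stays unchanged by this series, only the storage
  format behind it changes: ]

    #include <linux/jiffies.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static int __init tp_demo_init(void)
    {
            /* the format string lands in the __trace_printk_fmt section;
             * only the binary arguments are copied into the ring buffer */
            trace_printk("demo: jiffies=%lu\n", jiffies);
            return 0;
    }

    static void __exit tp_demo_exit(void)
    {
    }

    module_init(tp_demo_init);
    module_exit(tp_demo_exit);
    MODULE_LICENSE("GPL");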
V2: - Rebase against last changes - Fix mispell on the changelog V3: - Rebase against last changes (moving trace_printk() to kernel.h) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 25 ----- include/linux/kernel.h | 34 +++++- include/linux/module.h | 2 +- kernel/trace/Kconfig | 7 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 212 ++++++++++------------------------- kernel/trace/trace.h | 14 +-- kernel/trace/trace_bprintk.c | 154 ------------------------- kernel/trace/trace_functions_graph.c | 6 +- kernel/trace/trace_mmiotrace.c | 9 +- kernel/trace/trace_output.c | 70 ++---------- kernel/trace/trace_output.h | 2 + kernel/trace/trace_printk.c | 138 +++++++++++++++++++++++ 13 files changed, 262 insertions(+), 413 deletions(-) delete mode 100644 kernel/trace/trace_bprintk.c create mode 100644 kernel/trace/trace_printk.c (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1cc8ca453a9b..e1583f2639b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -static inline void ____trace_bprintk_check_format(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} -#define __trace_bprintk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_bprintk_check_format(fmt, ##args); \ -} while (0) - -#define trace_bprintk(fmt, args...) \ -do { \ - static char *__attribute__((section("__trace_bprintk_fmt"))) \ - trace_bprintk_fmt = fmt; \ - __trace_bprintk_check_format(fmt, ##args); \ - __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ -} while (0) -#else -#define trace_bprintk trace_printk -#endif - /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aef15c4645e..4e726b9a71ec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +static inline void __attribute__ ((format (printf, 1, 2))) +____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + /** * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing @@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * Please refrain from leaving trace_printks scattered around in * your code. */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) + +#define trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __trace_printk_check_format(fmt, ##args); \ + __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ +} while (0) + extern int __trace_printk(unsigned long ip, const char *fmt, ...) 
__attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) + +#define ftrace_vprintk(fmt, vargs) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ +} while (0) + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + extern void ftrace_dump(void); #else static inline void @@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap) return 0; } static inline void ftrace_dump(void) { } -#endif +#endif /* CONFIG_TRACING */ /* * Display an IP address in readable format. diff --git a/include/linux/module.h b/include/linux/module.h index 8cbec972d8e7..22d9878e868c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,7 +329,7 @@ struct module unsigned int num_tracepoints; #endif -#ifdef CONFIG_TRACE_BPRINTK +#ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ad8d3617d0a6..8e4a2a61cd75 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config TRACING select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER + select BINARY_PRINTF # # Minimum requirements an architecture has to meet for us to @@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. -config TRACE_BPRINTK - bool "Binary printk for tracing" - default y - depends on TRACING - select BINARY_PRINTF - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46557ef4c379..c7a2943796eb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o -obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o +obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46b3cd7a5752..cc94f8642485 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/** + * trace_vprintk - write binary msg to tracing buffer + * + */ +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct print_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = 
sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vprintk); + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_bprintf(s, field->fmt, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) -{ - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->depth = depth; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); - - out_unlock: - __raw_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - preempt_enable_notrace(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - * Caller must insure @fmt are valid when msg is in tracing buffer. 
- */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static DEFINE_SPINLOCK(trace_buf_lock); - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprintk_entry *entry; - unsigned long flags; - int resched; - int cpu, len = 0, size, pc; - - if (tracing_disabled || !trace_bprintk_enable) - return 0; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - spin_lock_irqsave(&trace_buf_lock, flags); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); - -out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - -out: - ftrace_preempt_enable(resched); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!fmt) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f5077f8f957..6140922392c8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,7 +20,6 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, - TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -120,16 +119,10 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; - char buf[]; -}; - -struct bprintk_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; + const char *fmt; + u32 buf[]; }; #ifdef CONFIG_TRACE_BPRINTK extern int trace_bprintk_enable; @@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ - IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c deleted file mode 100644 index f4c245a5cd33..000000000000 --- a/kernel/trace/trace_bprintk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* binary printk basic */ -static DEFINE_MUTEX(btrace_mutex); -/* - * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. 
- */ -static LIST_HEAD(trace_bprintk_fmt_list); - -struct trace_bprintk_fmt { - struct list_head list; - char fmt[0]; -}; - - -static inline void lock_btrace(void) -{ - mutex_lock(&btrace_mutex); -} - -static inline void unlock_btrace(void) -{ - mutex_unlock(&btrace_mutex); -} - - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - lock_btrace(); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); - *iter = tb_fmt->fmt; - } else - *iter = NULL; - } - unlock_btrace(); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -/* events tracer */ -int trace_bprintk_enable; - -static void start_bprintk_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - trace_bprintk_enable = 1; -} - -static void stop_bprintk_trace(struct trace_array *tr) -{ - trace_bprintk_enable = 0; - tracing_reset_online_cpus(tr); -} - -static int init_bprintk_trace(struct trace_array *tr) -{ - start_bprintk_trace(tr); - return 0; -} - -static struct tracer bprintk_trace __read_mostly = -{ - .name = "events", - .init = init_bprintk_trace, - .reset = stop_bprintk_trace, - .start = start_bprintk_trace, - .stop = stop_bprintk_trace, -}; - -static __init int init_bprintk(void) -{ - int ret = register_module_notifier(&module_trace_bprintk_format_nb); - if (ret) - return ret; - - ret = register_tracer(&bprintk_trace); - if (ret) - unregister_module_notifier(&module_trace_bprintk_format_nb); - return ret; -} - -device_initcall(init_bprintk); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e527f2f66c73..453ebd3b636e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* The comment */ - ret = trace_seq_printf(s, "/* %s", trace->buf); + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, trace->fmt, trace->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c401b908e805..23e346a734ca 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct 
trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, print->fmt, print->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4ab71201862e..ef8fd661b217 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; int ret; @@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -/* TRACE_BPRINTK */ static enum print_line_t -trace_bprintk_print(struct trace_iterator *iter, int flags) +trace_print_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; + struct print_entry *field; trace_assign_type(field, entry); @@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t -trace_bprintk_raw(struct trace_iterator *iter, int flags) + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { - struct trace_entry *entry = iter->ent; + struct print_entry *field; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!trace_seq_printf(s, ": %lx : ", field->ip)) goto partial; @@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_bprintk_event = { - .type = TRACE_BPRINTK, - .trace = trace_bprintk_print, - .raw = trace_bprintk_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .raw = trace_print_raw, }; static struct trace_event *events[] __initdata = { @@ -937,7 +892,6 
@@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, - &trace_bprintk_event, NULL }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8a34d688ed63..3b90e6ade1aa 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -18,6 +18,8 @@ struct trace_event { extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 000000000000..a50aea22e929 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,138 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_printk(unsigned long ip, const char *fmt, ...) 
+ { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); -- cgit v1.2.3-59-g8ed1b From 9de36825b321fe9fe9cf73260554251af579f4ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Mar 2009 17:52:03 +0100 Subject: tracing: trace_bprintk() cleanups Impact: cleanup Remove a few leftovers and clean up the code a bit. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/module.c | 6 ------ kernel/trace/trace.h | 19 ++++++++----------- 2 files changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 2dece104f9a1..22d7379709da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,12 +2158,6 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif -#ifdef CONFIG_TRACE_BPRINTK - mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, - "__trace_bprintk_fmt", sizeof(char *), - &mod->num_trace_bprintk_fmt); -#endif - #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6140922392c8..2bfb7d11fc17 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -119,14 +119,11 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; const char *fmt; - u32 buf[]; + u32 buf[]; }; -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_bprintk_enable; -#endif #define TRACE_OLD_SIZE 88 @@ -199,7 +196,7 @@ struct kmemtrace_free_entry { * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled - * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags * NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler @@ -302,7 +299,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -325,8 +322,8 @@ enum print_line_t { * flags value in struct tracer_flags. 
*/ struct tracer_opt { - const char *name; /* Will appear on the trace_options file */ - u32 bit; /* Mask assigned in val field in tracer_flags */ + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ }; /* @@ -335,7 +332,7 @@ struct tracer_opt { */ struct tracer_flags { u32 val; - struct tracer_opt *opts; + struct tracer_opt *opts; }; /* Makes more easy to define a tracer opt */ @@ -390,7 +387,7 @@ struct tracer { int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; - struct tracer_flags *flags; + struct tracer_flags *flags; struct tracer_stat *stats; }; -- cgit v1.2.3-59-g8ed1b From 888b55dc314d26239d84c3b187dae555a81c1605 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 8 Mar 2009 13:12:43 +0900 Subject: ftrace: tracing header should put '#' at the beginning of a line In a recent discussion, Andrew Morton pointed out that tracing header should put '#' at the beginning of a line. Then, we can easily filtered the header by following grep usage: cat trace | grep -v '^#' Wakeup trace also has the same header problem. Comparison of headers displayed: before this patch: # tracer: wakeup # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip -------------------------------------------------------------------- latency: 19059 us, #21277/21277, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: kondemand/1-1644 (uid:0 nice:-5 policy:0 rt_prio:0) ----------------- # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / # ||||| delay # cmd pid ||||| time | caller # \ / ||||| \ | / irqbalan-1887 1d.s. 0us : 1887:120:R + [001] 1644:115:S kondemand/1 irqbalan-1887 1d.s. 1us : default_wake_function <-autoremove_wake_function irqbalan-1887 1d.s. 2us : check_preempt_wakeup <-try_to_wake_up after this patch: # tracer: wakeup # # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip # -------------------------------------------------------------------- # latency: 529 us, #530/530, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: kondemand/0-1641 (uid:0 nice:-5 policy:0 rt_prio:0) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / # ||||| delay # cmd pid ||||| time | caller # \ / ||||| \ | / sshd-2496 0d.s. 0us : 2496:120:R + [000] 1641:115:S kondemand/0 sshd-2496 0d.s. 1us : default_wake_function <-autoremove_wake_function sshd-2496 0d.s. 
1us : check_preempt_wakeup <-try_to_wake_up Signed-off-by: KOSAKI Motohiro Cc: Andrew Morton LKML-Reference: <20090308124421.23C3.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..e5b56199e5e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1466,11 +1466,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) total = entries + ring_buffer_overruns(iter->tr->buffer); - seq_printf(m, "%s latency trace v1.1.5 on %s\n", + seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); - seq_puts(m, "-----------------------------------" + seq_puts(m, "# -----------------------------------" "---------------------------------\n"); - seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", nsecs_to_usecs(data->saved_latency), entries, @@ -1492,24 +1492,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) #else seq_puts(m, ")\n"); #endif - seq_puts(m, " -----------------\n"); - seq_printf(m, " | task: %.16s-%d " + seq_puts(m, "# -----------------\n"); + seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", data->comm, data->pid, data->uid, data->nice, data->policy, data->rt_priority); - seq_puts(m, " -----------------\n"); + seq_puts(m, "# -----------------\n"); if (data->critical_start) { - seq_puts(m, " => started at: "); + seq_puts(m, "# => started at: "); seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "\n => ended at: "); + seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "\n"); + seq_puts(m, "#\n"); } - seq_puts(m, "\n"); + seq_puts(m, "#\n"); } static void test_cpu_buff_start(struct trace_iterator *iter) -- cgit v1.2.3-59-g8ed1b From c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 9 Mar 2009 18:15:34 +0900 Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer Impact: improve workqueue tracer output Currently, /sys/kernel/debug/tracing/trace_stat/workqueues can display wrong and strange thread names. Why? Currently, ftrace has tracing_record_cmdline()/trace_find_cmdline() convenience function that implements a task->comm string cache. This can avoid unnecessary memcpy overhead and the workqueue tracer uses it. However, in general, any trace statistics feature shouldn't use tracing_record_cmdline() because trace statistics can display very old process. Then comm cache can return wrong string because recent process overrides the cache. Fortunately, workqueue trace guarantees that displayed processes are live. Thus we can search comm string from PID at display time. % cat workqueues # CPU INSERTED EXECUTED NAME # | | | | 7 431913 431913 kondemand/7 7 0 0 tail 7 21 21 git 7 0 0 ls 7 9 9 cat 7 832632 832632 unix_chkpwd 7 236292 236292 ls Note: tail, git, ls, cat unix_chkpwd are obiously not workqueue thread. 
% cat workqueues # CPU INSERTED EXECUTED NAME # | | | | 7 510 510 kondemand/7 7 0 0 kmpathd/7 7 15 15 ata/7 7 0 0 aio/7 7 11 11 kblockd/7 7 1063 1063 work_on_cpu/7 7 167 167 events/7 Signed-off-by: KOSAKI Motohiro Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..46c8dc896bd3 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -99,8 +99,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) pr_warning("trace_workqueue: not enough memory\n"); return; } - tracing_record_cmdline(wq_thread); - INIT_LIST_HEAD(&cws->list); cws->cpu = cpu; @@ -195,11 +193,12 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct cpu_workqueue_stats *cws = p; unsigned long flags; int cpu = cws->cpu; + struct task_struct *tsk = find_task_by_vpid(cws->pid); seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, atomic_read(&cws->inserted), cws->executed, - trace_find_cmdline(cws->pid)); + tsk ? tsk->comm : "<...>"); spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (&cws->list == workqueue_cpu_stat(cpu)->list.next) -- cgit v1.2.3-59-g8ed1b From 156b5f172a64103bcb13b6d26288388b9019caa3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Mar 2009 10:50:53 -0500 Subject: tracing: typecast sizeof and offsetof to unsigned int Impact: fix compiler warnings On x86_64 sizeof and offsetof are treated as long, where as on x86_32 they are int. This patch typecasts them to unsigned int to avoid one arch giving warnings while the other does not. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 15 ++++++++------- kernel/trace/trace_export.c | 10 +++++----- kernel/trace/trace_format.h | 12 ++++++------ 3 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4488d90e75ef..fa32ca320767 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -448,8 +448,9 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, } #undef FIELD -#define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) +#define FIELD(type, name) \ + #type, #name, (unsigned int)offsetof(typeof(field), name), \ + (unsigned int)sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -457,11 +458,11 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" "\n", FIELD(unsigned char, type), FIELD(unsigned char, flags), diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0fb7be73e31c..7162ab49d05d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,11 +18,11 @@ #include "trace_format.h" #undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, 
"\tfield: char " #item ";\t" \ - "offset:%lu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ - if (!ret) \ +#define TRACE_FIELD_ZERO_CHAR(item) \ + ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + "offset:%u;\tsize:0;\n", \ + (unsigned int)offsetof(typeof(field), item)); \ + if (!ret) \ return 0; diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h index 03f9a4c165ca..97e59a9c82ea 100644 --- a/kernel/trace/trace_format.h +++ b/kernel/trace/trace_format.h @@ -22,9 +22,9 @@ #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ if (!ret) \ return 0; @@ -32,9 +32,9 @@ #undef TRACE_FIELD_SPECIAL #define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ if (!ret) \ return 0; -- cgit v1.2.3-59-g8ed1b From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 15:47:18 -0400 Subject: tracing: replace TP with TP_ Impact: clean up The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit ugly. This patch adds an underscore to their names. Signed-off-by: Steven Rostedt --- Documentation/tracepoints.txt | 8 +-- include/linux/tracepoint.h | 6 +-- include/trace/block.h | 70 ++++++++++++------------ include/trace/irq_event_types.h | 16 +++--- include/trace/lockdep_event_types.h | 24 ++++----- include/trace/power.h | 12 ++--- include/trace/sched_event_types.h | 98 +++++++++++++++++----------------- include/trace/workqueue.h | 16 +++--- kernel/trace/trace_event_types.h | 28 +++++----- kernel/trace/trace_events_stage_2.h | 6 +-- kernel/trace/trace_events_stage_3.h | 8 +-- kernel/trace/trace_export.c | 8 +-- kernel/trace/trace_format.h | 55 ------------------- samples/tracepoints/tp-samples-trace.h | 8 +-- 14 files changed, 154 insertions(+), 209 deletions(-) delete mode 100644 kernel/trace/trace_format.h (limited to 'kernel') diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 6f0a044f5b5e..4ff43c6de299 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,8 +45,8 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPROTO(int firstarg, struct task_struct *p), - TPARGS(firstarg, p)); + TP_PROTO(int firstarg, struct task_struct *p), + TP_ARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,10 +66,10 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the +- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the +- TP_ARGS(firstarg, p) are the parameters names, same as found in the prototype. 
Connecting a function (probe) to a tracepoint is done by providing a diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 152b2f03fb86..3bcc3e171443 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,8 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ -#define TPPROTO(args...) args -#define TPARGS(args...) args +#define TP_PROTO(args...) args +#define TP_ARGS(args...) args #ifdef CONFIG_TRACEPOINTS @@ -65,7 +65,7 @@ struct tracepoint { { \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ - TPPROTO(proto), TPARGS(args)); \ + TP_PROTO(proto), TP_ARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ diff --git a/include/trace/block.h b/include/trace/block.h index 25c6a1fd5b77..25b7068b819e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -5,72 +5,72 @@ #include DECLARE_TRACE(block_rq_abort, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_insert, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_issue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_requeue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_complete, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_bio_bounce, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_complete, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_backmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_frontmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_queue, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_getrq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_sleeprq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_plug, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_timer, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_io, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_split, - 
TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); + TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TP_ARGS(q, bio, pdu)); DECLARE_TRACE(block_remap, - TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + TP_ARGS(q, bio, dev, from, to)); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 65850bc5ea06..0147d9eef5f4 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -9,25 +9,25 @@ #define TRACE_SYSTEM irq TRACE_EVENT_FORMAT(irq_handler_entry, - TPPROTO(int irq, struct irqaction *action), - TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name), + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) ), - TPRAWFMT("irq %d") + TP_RAW_FMT("irq %d") ); TRACE_EVENT_FORMAT(irq_handler_exit, - TPPROTO(int irq, struct irqaction *action, int ret), - TPARGS(irq, action, ret), - TPFMT("irq=%d handler=%s return=%s", + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), + TP_FMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled"), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("irq %d ret %d") + TP_RAW_FMT("irq %d ret %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index f713d74a82b4..1f00e8b3543e 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -10,32 +10,32 @@ #ifdef CONFIG_LOCKDEP TRACE_FORMAT(lock_acquire, - TPPROTO(struct lockdep_map *lock, unsigned int subclass, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TPARGS(lock, subclass, trylock, read, check, next_lock, ip), - TPFMT("%s%s%s", trylock ? "try " : "", + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", read ? 
"read " : "", lock->name) ); TRACE_FORMAT(lock_release, - TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TPARGS(lock, nested, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) ); #ifdef CONFIG_LOCK_STAT TRACE_FORMAT(lock_contended, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); TRACE_FORMAT(lock_acquired, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); #endif diff --git a/include/trace/power.h b/include/trace/power.h index 38aca537e497..ef204666e983 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -18,15 +18,15 @@ struct power_trace { }; DECLARE_TRACE(power_start, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_mark, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_end, - TPPROTO(struct power_trace *it), - TPARGS(it)); + TP_PROTO(struct power_trace *it), + TP_ARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a6de5c1601a0..71b14828a957 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -9,143 +9,143 @@ #define TRACE_SYSTEM sched TRACE_EVENT_FORMAT(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid), + TP_PROTO(struct task_struct *t), + TP_ARGS(t), + TP_FMT("task %s:%d", t->comm, t->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, t->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret), - TPFMT("ret=%d", ret), + TP_PROTO(int ret), + TP_ARGS(ret), + TP_FMT("ret=%d", ret), TRACE_STRUCT( TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("ret=%d") + TP_RAW_FMT("ret=%d") ); TRACE_EVENT_FORMAT(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d %s", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d %s", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d", p->comm, p->pid, success ? 
"succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), - TPARGS(rq, prev, next), - TPFMT("task %s:%d ==> %s:%d", + TP_ARGS(rq, prev, next), + TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, - TPCMD(memcpy(TRACE_ENTRY->next_comm, + TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %s:%d:%d") + TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu), - TPFMT("task %s:%d from: %d to: %d", + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), + TP_FMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, orig_cpu, orig_cpu) TRACE_FIELD(int, dest_cpu, dest_cpu) ), - TPRAWFMT("task %d from: %d to: %d") + TP_RAW_FMT("task %d from: %d to: %d") ); TRACE_EVENT_FORMAT(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid), - TPFMT("pid %d", pid_nr(pid)), + TP_PROTO(struct pid *pid), + TP_ARGS(pid), + TP_FMT("pid %d", pid_nr(pid)), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, pid_nr(pid)) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child), - TPFMT("parent %s:%d child %s:%d", + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), + TP_FMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, parent, parent->pid) TRACE_FIELD(pid_t, child, child->pid) ), - TPRAWFMT("parent %d child %d") + TP_RAW_FMT("parent %d child %d") ); TRACE_EVENT_FORMAT(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), + TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(int, sig, sig) TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("sig: %d task %d") + TP_RAW_FMT("sig: %d task %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h index 867829df4571..7626523deeba 100644 --- a/include/trace/workqueue.h +++ b/include/trace/workqueue.h @@ -6,20 +6,20 @@ 
#include DECLARE_TRACE(workqueue_insertion, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); DECLARE_TRACE(workqueue_execution, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); /* Trace the creation of one workqueue thread on a cpu */ DECLARE_TRACE(workqueue_creation, - TPPROTO(struct task_struct *wq_thread, int cpu), - TPARGS(wq_thread, cpu)); + TP_PROTO(struct task_struct *wq_thread, int cpu), + TP_ARGS(wq_thread, cpu)); DECLARE_TRACE(workqueue_destruction, - TPPROTO(struct task_struct *wq_thread), - TPARGS(wq_thread)); + TP_PROTO(struct task_struct *wq_thread), + TP_ARGS(wq_thread)); #endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fb4eba166433..d94179aa1fc2 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned long, parent_ip, parent_ip) ), - TPRAWFMT(" %lx <-- %lx") + TP_RAW_FMT(" %lx <-- %lx") ); TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, @@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, TRACE_FIELD(unsigned long, graph_ent.func, func) TRACE_FIELD(int, graph_ent.depth, depth) ), - TPRAWFMT("--> %lx (%d)") + TP_RAW_FMT("--> %lx (%d)") ); TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, @@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, TRACE_FIELD(unsigned long, ret.func, func) TRACE_FIELD(int, ret.depth, depth) ), - TPRAWFMT("<-- %lx (%d)") + TP_RAW_FMT("<-- %lx (%d)") ); TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, @@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, @@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, @@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, TRACE_FIELD(unsigned long, arg2, arg2) TRACE_FIELD(unsigned long, arg3, arg3) ), - TPRAWFMT("(%08lx) (%08lx) (%08lx)") + TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ); /* @@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> 
(%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), - TPRAWFMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, @@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) TRACE_FIELD(char, correct, correct) ), - TPRAWFMT("%u:%s:%s (%u)") + TP_RAW_FMT("%u:%s:%s (%u)") ); TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, @@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_FIELD(u64, from, from) TRACE_FIELD(u64, to, to) ), - TPRAWFMT("from: %llx to: %llx") + TP_RAW_FMT("from: %llx to: %llx") ); TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, @@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), - TPRAWFMT("%llx->%llx type:%u state:%u") + TP_RAW_FMT("%llx->%llx type:%u state:%u") ); TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, @@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) TRACE_FIELD(int, node, node) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" " flags:%x node:%d") ); @@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TRACE_FIELD(unsigned long, call_site, call_site) TRACE_FIELD(const void *, ptr, ptr) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p") + TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d24a97e74aea..8e2e0f56c2a8 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "%s", "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -44,8 +44,8 @@ field->item, -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2c8d76c7dbed..557ca52bcdcd 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -106,8 +106,8 @@ * */ -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args +#undef TP_FMT +#define TP_FMT(fmt, args...) fmt "\n", ##args #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ @@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7162ab49d05d..e62bc10f8103 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,8 +26,8 @@ return 0; -#undef TPRAWFMT -#define TPRAWFMT(args...) 
args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ @@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h deleted file mode 100644 index 97e59a9c82ea..000000000000 --- a/kernel/trace/trace_format.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 01724e04c556..dffdc49878af 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -5,9 +5,9 @@ #include DECLARE_TRACE(subsys_event, - TPPROTO(struct inode *inode, struct file *file), - TPARGS(inode, file)); + TP_PROTO(struct inode *inode, struct file *file), + TP_ARGS(inode, file)); DECLARE_TRACE(subsys_eventb, - TPPROTO(void), - TPARGS()); + TP_PROTO(void), + TP_ARGS()); #endif -- cgit v1.2.3-59-g8ed1b From 9cc26a261d43e5898287a1f5808132f8f05ceb1c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 16:00:22 -0400 Subject: tracing: use generic __stringify Impact: clean up This removes the custom made STR(x) macros in the tracer and uses the generic __stringify macro instead. Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 4 +--- kernel/trace/trace_events_stage_3.h | 4 ++-- kernel/trace/trace_selftest.c | 6 ++---- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/events.c b/kernel/trace/events.c index f2509cbaacea..9fc918da404f 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -2,9 +2,7 @@ * This is the place to register all trace points as events. 
*/ -/* someday this needs to go in a generic header */ -#define __STR(x) #x -#define STR(x) __STR(x) +#include #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 557ca52bcdcd..41b82b93c9c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -139,7 +139,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .system = STR(TRACE_SYSTEM), \ + .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } @@ -225,7 +225,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .system = STR(TRACE_SYSTEM), \ + .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 7238646b8723..f907a2b29028 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include #include #include @@ -100,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) #ifdef CONFIG_DYNAMIC_FTRACE -#define __STR(x) #x -#define STR(x) __STR(x) - /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, @@ -130,7 +128,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, * start of the function names. We simply put a '*' to * accommodate them. */ - func_name = "*" STR(DYN_FTRACE_TEST_NAME); + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); /* filter only on our function */ ftrace_set_filter(func_name, strlen(func_name), 1); -- cgit v1.2.3-59-g8ed1b From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 17:14:30 -0400 Subject: tracing: new format for specialized trace points Impact: clean up and enhancement The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its ability to save data as well as to print the record out. Working with Ingo Molnar, we came up with a new format that is much more pleasing to the eye of C developers. This new macro is more C style than the old macro, and is more obvious to what it does. Here's the example. The only updated macro in this patch is the sched_switch trace point. The old method looked like this: TRACE_EVENT_FORMAT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); The above method is hard to read and requires two format fields. The new method: /* * Tracepoint for task switches, performed by the scheduler: * * (NOTE: the 'rq' argument is not used by generic trace events, * but used by the latency tracer plugin. 
) */ TRACE_EVENT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_printk("task %s:%d [%d] ==> %s:%d [%d]", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->next_comm, __entry->next_pid, __entry->next_prio), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; ) ); This macro is called TRACE_EVENT, it is broken up into 5 parts: TP_PROTO: the proto type of the trace point TP_ARGS: the arguments of the trace point TP_STRUCT_entry: the structure layout of the entry in the ring buffer TP_printk: the printk format TP_fast_assign: the method used to write the entry into the ring buffer The structure is the definition of how the event will be saved in the ring buffer. The printk is used by the internal tracing in case of an oops, and the kernel needs to print out the format of the record to the console. This the TP_printk gives a means to show the records in a human readable format. It is also used to print out the data from the trace file. The TP_fast_assign is executed directly. It is basically like a C function, where the __entry is the handle to the record. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 48 +++++++---- kernel/trace/trace.h | 5 -- kernel/trace/trace_event_types.h | 3 +- kernel/trace/trace_events.c | 159 +----------------------------------- kernel/trace/trace_events_stage_1.h | 28 ++++--- kernel/trace/trace_events_stage_2.h | 89 ++++++++++++++++---- kernel/trace/trace_events_stage_3.h | 34 +++----- kernel/trace/trace_export.c | 23 +++++- 9 files changed, 159 insertions(+), 233 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3bcc3e171443..6b4f1bb3701e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -160,4 +160,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) +#define TRACE_EVENT(name, proto, args, struct, print, assign) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 71b14828a957..aa77fb754038 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -62,25 +62,41 @@ TRACE_EVENT_FORMAT(sched_wakeup_new, TP_RAW_FMT("task %d success=%d") ); -TRACE_EVENT_FORMAT(sched_switch, +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_switch, + TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), + struct task_struct *next), + TP_ARGS(rq, prev, next), - TP_FMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, prev_pid, prev->pid) - TRACE_FIELD(int, prev_prio, prev->prio) - TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], - next_comm, - TP_CMD(memcpy(TRACE_ENTRY->next_comm, - next->comm, - TASK_COMM_LEN))) - TRACE_FIELD(pid_t, next_pid, next->pid) - TRACE_FIELD(int, next_prio, next->prio) + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) ), - TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") - ); + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ) +); TRACE_EVENT_FORMAT(sched_migrate_task, TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bfb7d11fc17..c5e1d8865fe4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -751,12 +751,7 @@ struct ftrace_event_call { int (*regfunc)(void); void (*unregfunc)(void); int id; - struct dentry *raw_dir; - int raw_enabled; - int type; int (*raw_init)(void); - int (*raw_reg)(void); - void (*raw_unreg)(void); int (*show_format)(struct trace_seq *s); }; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d94179aa1fc2..5cca4c978bde 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), - TP_RAW_FMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fa32ca320767..1880a6438097 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, call->enabled = 0; call->unregfunc(); } - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - } break; case 1: - if (!call->enabled && - (call->type & TRACE_EVENT_TYPE_PRINTF)) { + if (!call->enabled) { call->enabled = 1; call->regfunc(); } - if (!call->raw_enabled && - (call->type & TRACE_EVENT_TYPE_RAW)) { - call->raw_enabled = 1; - call->raw_reg(); - } break; } } @@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled || call->raw_enabled) + if (call->enabled) buf = "1\n"; else buf = "0\n"; @@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -event_type_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = 
filp->private_data; - char buf[16]; - int r = 0; - - if (call->type & TRACE_EVENT_TYPE_PRINTF) - r += sprintf(buf, "printf\n"); - - if (call->type & TRACE_EVENT_TYPE_RAW) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[64]; - - /* - * If there's only one type, we can't change it. - * And currently we always have printf type, and we - * may or may not have raw type. - * - * This is a redundant check, the file should be read - * only if this is the case anyway. - */ - - if (!call->raw_init) - return -EPERM; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (!strncmp(buf, "printf", 6) && - (!buf[6] || isspace(buf[6]))) { - - call->type = TRACE_EVENT_TYPE_PRINTF; - - /* - * If raw enabled, the disable it and enable - * printf type. - */ - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - - call->enabled = 1; - call->regfunc(); - } - - } else if (!strncmp(buf, "raw", 3) && - (!buf[3] || isspace(buf[3]))) { - - call->type = TRACE_EVENT_TYPE_RAW; - - /* - * If printf enabled, the disable it and enable - * raw type. - */ - if (call->enabled) { - call->enabled = 0; - call->unregfunc(); - - call->raw_enabled = 1; - call->raw_reg(); - } - } else - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - r += sprintf(buf, "printf\n"); - - if (call->raw_init) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - #undef FIELD #define FIELD(type, name) \ #type, #name, (unsigned int)offsetof(typeof(field), name), \ @@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(int, pid), FIELD(int, tgid)); } + static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; -static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; -static const struct file_operations ftrace_type_fops = { - .open = tracing_open_generic, - .read = event_type_read, - .write = event_type_write, -}; - -static const struct file_operations ftrace_available_types_fops = { - .open = tracing_open_generic, - .read = event_available_types_read, -}; - static const struct file_operations ftrace_event_format_fops = { .open = tracing_open_generic, .read = event_format_read, @@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - /* default the output to printf */ - call->type = TRACE_EVENT_TYPE_PRINTF; - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } - /* Only let type 
be writable, if we can change it */ - entry = debugfs_create_file("type", - call->raw_init ? 0644 : 0444, - call->dir, call, - &ftrace_type_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/type' entry\n", call->name); - - entry = debugfs_create_file("available_types", 0444, call->dir, call, - &ftrace_available_types_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/available_types' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; @@ -704,13 +558,6 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 3830a731424c..edfcbd3a0d1b 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -18,19 +18,23 @@ #define TRACE_FORMAT(call, proto, args, fmt) #undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) + +#undef __array +#define __array(type, item, len) type item[len]; -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __field +#define __field(type, item) type item; -#define TRACE_FIELD(type, item, assign) \ - type item; -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - type_item; +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 8e2e0f56c2a8..d91bf4c56661 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -32,23 +32,14 @@ * in binary. */ -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __entry +#define __entry field -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - field->item, +#undef TP_printk +#define TP_printk(fmt, args...) fmt "\n", args -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - field->item, - - -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + ret = trace_seq_printf(s, print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ return TRACE_TYPE_HANDLED; \ } - + #include -#include "trace_format.h" +/* + * Setup the showing format of trace point. 
+ * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry "REC" + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 41b82b93c9c7..8e398d864096 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TP_CMD -#define TP_CMD(cmd...) 
cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ + TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - cmd; +#undef __entry +#define __entry entry -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ \ static struct ftrace_event_call event_##call; \ \ @@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - tstruct; \ + assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ } \ @@ -226,10 +214,8 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ - .raw_reg = ftrace_raw_reg_event_##call, \ - .raw_unreg = ftrace_raw_unreg_event_##call, \ + .regfunc = ftrace_raw_reg_event_##call, \ + .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e62bc10f8103..23ae78430d58 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,7 +15,28 @@ #include "trace_output.h" -#include "trace_format.h" + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ -- cgit v1.2.3-59-g8ed1b From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 00:15:34 -0400 Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro Impact: clean up The TRACE_EVENT_FORMAT macro is no longer used by trace points and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should be used by them. Although the TRACE_EVENT_FORMAT macro is still used by the internal tracing utility, it should not be used in core kernel code. 
Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 --- include/trace/lockdep_event_types.h | 2 +- include/trace/sched_event_types.h | 2 +- kernel/trace/trace_events_stage_1.h | 3 --- kernel/trace/trace_events_stage_3.h | 6 +----- 5 files changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6b4f1bb3701e..69b56988813d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,9 +157,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ - TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #define TRACE_EVENT(name, proto, args, struct, print, assign) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index 1f00e8b3543e..adccfcd2ec8f 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -1,5 +1,5 @@ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 0bbbf410e01f..fb37af672c88 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_EVENT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index edfcbd3a0d1b..15e9bf965a18 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -17,9 +17,6 @@ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) - #undef __array #define __array(type, item, len) type item[len]; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 8e398d864096..3ba55d4ab073 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -35,7 +35,7 @@ * } * * - * For those macros defined with TRACE_EVENT_FORMAT: + * For those macros defined with TRACE_EVENT: * * static struct ftrace_event_call event_; * @@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ - TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - -- cgit v1.2.3-59-g8ed1b From bbcd3063597a3824357cd83c501c2a2aa21ef37b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Mar 2009 10:49:53 +0900 Subject: tracing: Don't assume possible cpu list have continuous numbers The "for (++cpu ; cpu < num_possible_cpus(); cpu++)" statement assumes that possible CPUs are numbered contiguously - but that's a wrong assumption. Instead, cpumask_next() should be used. 
Signed-off-by: KOSAKI Motohiro Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20090310104437.A480.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 46c8dc896bd3..739fdacf873b 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -91,7 +91,7 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) struct cpu_workqueue_stats *cws; unsigned long flags; - WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + WARN_ON(cpu < 0); /* Workqueues are sometimes created in atomic context */ cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); @@ -175,12 +175,12 @@ static void *workqueue_stat_next(void *prev, int idx) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - for (++cpu ; cpu < num_possible_cpus(); cpu++) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; + do { + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + return NULL; + } while (!(ret = workqueue_stat_start_cpu(cpu))); + return ret; } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -- cgit v1.2.3-59-g8ed1b From bbb76b552a9cef86777181c8577acc907b122b41 Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Tue, 10 Mar 2009 17:34:47 +0800 Subject: ptrace: remove a useless goto Impact: cleanup Obviously, this goto is useless. Remove it. Signed-off-by: WANG Cong Cc: Andrew Morton Cc: Roland McGrath LKML-Reference: <20090310093447.GC3179@hack> Signed-off-by: Ingo Molnar --- kernel/ptrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c9cf48b21f05..137955913e7e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,8 +612,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); - if (ret < 0) - goto out_put_task_struct; out_put_task_struct: put_task_struct(child); -- cgit v1.2.3-59-g8ed1b From ce8eb2bf05042452107e489782105d2e235cbdd0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 10:14:35 -0400 Subject: tracing: fix printk format specifier Impact: clean up The offsetof and sizeof are of type size_t, and instead of typecasting them to unsigned int for printk formatting, one could just use %zu. 
Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1880a6438097..a0b41cc26f26 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -338,8 +338,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, (unsigned int)offsetof(typeof(field), name), \ - (unsigned int)sizeof(field.name) + #type, #name, offsetof(typeof(field), name), sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -347,11 +346,11 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", FIELD(unsigned char, type), FIELD(unsigned char, flags), -- cgit v1.2.3-59-g8ed1b From 40e26815fafd3b8c4aced17b1f22e68ef33eb8db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 11:32:40 -0400 Subject: tracing: do not allow modifying the ftrace events via the event files Impact: fix to prevent crash on calling NULL function pointer The ftrace internal records have their format exported via the event system under the ftrace subsystem. These are only for exporting the format to allow binary readers to be able to parse them in a binary output. The ftrace subsystem events can only be enabled via the ftrace tracers and do not have a registering function. The event files expect the event record to have registering function and will call it directly. Passing in a ftrace subsystem event will cause the kernel to crash because it will execute a NULL pointer. This patch prevents the ftrace subsystem from being viewable to the event enabling files. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a0b41cc26f26..85ec10fbb38d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -102,7 +102,7 @@ static int ftrace_set_clr_event(char *buf, int set) mutex_lock(&event_mutex); events_for_each(call) { - if (!call->name) + if (!call->name || !call->regfunc) continue; if (match && @@ -207,8 +207,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) - return NULL; + for (;;) { + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + /* + * The ftrace subsystem is for showing formats only. + * They can not be enabled or disabled via the event files. 
+ */ + if (call->regfunc) + break; + + call++; + next = call; + } m->private = ++next; -- cgit v1.2.3-59-g8ed1b From 2314c4ae1461c9e8b26cf8b9a851f280bc5769e1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:04:02 -0400 Subject: tracing: add back the available_events file The event directory files type and available_types were no longer needed with the new TRACE_EVENT_FORMAT macros, they were deleted. But by accident the available_events file was also removed. This patch brings it back. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 85ec10fbb38d..769dfd00fc85 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -428,6 +428,13 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -569,6 +576,13 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; + entry = debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); -- cgit v1.2.3-59-g8ed1b From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:41:38 -0400 Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT macro Impact: clean up In trying to stay consistant with the C style format in the TRACE_EVENT macro, it makes more sense to do the printk after the assigning of the variables. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/irq_event_types.h | 8 +-- include/trace/sched_event_types.h | 102 ++++++++++++++++++------------------ kernel/trace/trace_events_stage_1.h | 2 +- kernel/trace/trace_events_stage_2.h | 4 +- kernel/trace/trace_events_stage_3.h | 2 +- 6 files changed, 60 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 69b56988813d..c7b09452514b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,7 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT(name, proto, args, struct, print, assign) \ +#define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 43bcb74dd49f..214bb928fe9e 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -31,13 +31,13 @@ TRACE_EVENT(irq_handler_exit, __field( int, ret ) ), - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled"), - TP_fast_assign( __entry->irq = irq; __entry->ret = ret; - ) + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? 
"handled" : "unhandled") ); #undef TRACE_SYSTEM diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index fb37af672c88..63547dc1125f 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -22,12 +22,12 @@ TRACE_EVENT(sched_kthread_stop, __field( pid_t, pid ) ), - TP_printk("task %s:%d", __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; - ) + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) ); /* @@ -43,11 +43,11 @@ TRACE_EVENT(sched_kthread_stop_ret, __field( int, ret ) ), - TP_printk("ret %d", __entry->ret), - TP_fast_assign( __entry->ret = ret; - ) + ), + + TP_printk("ret %d", __entry->ret) ); /* @@ -68,14 +68,14 @@ TRACE_EVENT(sched_wait_task, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -97,16 +97,16 @@ TRACE_EVENT(sched_wakeup, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -128,16 +128,16 @@ TRACE_EVENT(sched_wakeup_new, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -162,10 +162,6 @@ TRACE_EVENT(sched_switch, __field( int, next_prio ) ), - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio), - TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; @@ -173,7 +169,11 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; - ) + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* @@ -193,17 +193,17 @@ TRACE_EVENT(sched_migrate_task, __field( int, dest_cpu ) ), - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = orig_cpu; __entry->dest_cpu = dest_cpu; - ) + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) ); /* @@ -221,14 +221,14 @@ TRACE_EVENT(sched_process_free, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d 
[%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -246,14 +246,14 @@ TRACE_EVENT(sched_process_exit, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -271,14 +271,14 @@ TRACE_EVENT(sched_process_wait, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -297,16 +297,16 @@ TRACE_EVENT(sched_process_fork, __field( pid_t, child_pid ) ), - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid), - TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; - ) + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) ); /* @@ -324,14 +324,14 @@ TRACE_EVENT(sched_signal_send, __field( pid_t, pid ) ), - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->sig = sig; - ) + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 15e9bf965a18..82f68443c556 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -27,7 +27,7 @@ #define TP_STRUCT__entry(args...) args #undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d91bf4c56661..1ad9f8d2fe45 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,7 +39,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #define TP_fast_assign(args...) 
args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 3ba55d4ab073..d6de06b9201a 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define __entry entry #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ static struct ftrace_event_call event_##call; \ \ -- cgit v1.2.3-59-g8ed1b From 0e3d0f0566f3fcf664782f597070bbc669d78454 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 13:12:58 -0400 Subject: tracing: update comments to match event code macros Impact: clean up / comments The comments that described the ftrace macros to manipulate the TRACE_EVENT and TRACE_FORMAT macros no longer match the code. This patch updates them. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_1.h | 6 ++++-- kernel/trace/trace_events_stage_2.h | 9 ++++----- kernel/trace/trace_events_stage_3.h | 8 ++++---- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 82f68443c556..38985f9b379c 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -6,11 +6,13 @@ * struct ftrace_raw_ { * struct trace_entry ent; * ; + * []; * [...] * }; * - * The is created by the TRACE_FIELD(type, item, assign) - * macro. We simply do "type item;", and that will create the fields + * The is created by the __field(type, item) macro or + * the __array(type2, item2, len) macro. + * We simply do "type item;", and that will create the fields * in the structure. */ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 1ad9f8d2fe45..ca347afd6aa0 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -76,10 +76,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ * int ret; * * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); * * } */ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d6de06b9201a..6ee1de59f19d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -56,7 +56,8 @@ * return; * entry = ring_buffer_event_data(event); * - * ; <-- Here we assign the entries by the TRACE_FIELD. + * ; <-- Here we assign the entries by the __field and + * __array macros. 
* * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } @@ -96,11 +97,10 @@ * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { * .name = "", + * .system = "", + * .raw_init = ftrace_raw_init_event_, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, - * .raw_init = ftrace_raw_init_event_, - * .raw_reg = ftrace_raw_reg_event_, - * .raw_unreg = ftrace_raw_unreg_event_, * .show_format = ftrace_format_, * } * -- cgit v1.2.3-59-g8ed1b From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 14:10:56 -0400 Subject: tracing: remove funky whitespace in the trace code Impact: clean up There existed a lot of 's in the tracing code. This patch removes them. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 16 +++--- kernel/trace/blktrace.c | 10 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_events_stage_3.h | 98 ++++++++++++++++++------------------ kernel/trace/trace_export.c | 2 +- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_output.c | 14 +++--- kernel/trace/trace_workqueue.c | 6 +-- 9 files changed, 78 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 119ece224c21..d35a7ee7611f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,8 +178,8 @@ static inline void tracepoint_synchronize_unregister(void) * * prototype, declare it via TP_PROTO(): * * * - * TP_PROTO(struct rq *rq, struct task_struct *prev, - * struct task_struct *next), + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), * * * * * Define the call signature of the 'function'. @@ -187,7 +187,7 @@ static inline void tracepoint_synchronize_unregister(void) * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) * * * - * TP_ARGS(rq, prev, next), + * TP_ARGS(rq, prev, next), * * * * * Fast binary tracing: define the trace record via @@ -229,13 +229,13 @@ static inline void tracepoint_synchronize_unregister(void) * * happens, on an active tracepoint. 
* * * - * TP_fast_assign( - * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - * __entry->prev_pid = prev->pid; - * __entry->prev_prio = prev->prio; + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; - * __entry->next_prio = next->prio; + * __entry->next_prio = next->prio; * ) * * * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e39679a72a3b..bec69d3678c1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -33,7 +33,7 @@ static struct trace_array *blk_tr; static int __read_mostly blk_tracer_enabled; /* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CLASSIC 0x1 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ @@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device - * @cmd: the ioctl cmd + * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ @@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr) static struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + int (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, @@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = { }; static struct trace_event trace_blk_event = { - .type = TRACE_BLK, + .type = TRACE_BLK, .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..8c6a902db40a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aaa0755268b9..ad8c22efff41 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { - .type = TRACE_BRANCH, + .type = TRACE_BRANCH, .trace = trace_branch_print, }; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6ee1de59f19d..ae2e323df0c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,23 +5,23 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, ": " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_event_); + * unregister_trace_(ftrace_event_); * } * * For those macros defined with TRACE_FORMAT: @@ -29,9 +29,9 @@ * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * } * * @@ -41,66 +41,66 @@ * * static void ftrace_raw_event_(proto) * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the __field and * __array macros. 
* - * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_raw_event_); + * unregister_trace_(ftrace_raw_event_); * } * * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 + * .trace = ftrace_raw_output_, <-- stage 2 * }; * * static int ftrace_raw_init_event_(void) * { - * int id; + * int id; * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; * } * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", + * .name = "", * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * .show_format = ftrace_format_, * } * @@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ @@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto) \ pc = preempt_count(); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ + sizeof(struct ftrace_raw_##call), \ irq_flags, pc); \ if (!event) \ return; \ @@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 23ae78430d58..4d9952d3df50 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 453ebd3b636e..d1493b853e41 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function_graph", + .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, .wait_pipe 
= poll_wait_pipe, - .init = graph_trace_init, - .reset = graph_trace_reset, + .init = graph_trace_init, + .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ef8fd661b217..491832af9ba1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) } static struct trace_event trace_fn_event = { - .type = TRACE_FN, + .type = TRACE_FN, .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, @@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, } static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, + .type = TRACE_CTX, .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, @@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = { }; static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, + .type = TRACE_WAKE, .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, @@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, } static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, + .type = TRACE_SPECIAL, .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, } static struct trace_event trace_stack_event = { - .type = TRACE_STACK, + .type = TRACE_STACK, .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, } static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, + .type = TRACE_USER_STACK, .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..e542483df623 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -19,14 +19,14 @@ struct cpu_workqueue_stats { /* Useful to know if we print the cpu headers */ bool first_entry; int cpu; - pid_t pid; + pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; + atomic_t inserted; /* * Don't need to be atomic, works are serialized in a single workqueue thread * on a single CPU. */ - unsigned int executed; + unsigned int executed; }; /* List of workqueue threads on one cpu */ -- cgit v1.2.3-59-g8ed1b From 6cc3c6e12bb039047974ad2e7e2d46d15a1b762f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Mar 2009 19:03:43 +0100 Subject: trace_clock: fix preemption bug Using the function_graph tracer in recent kernels generates a spew of preemption BUGs. Fix this by not requiring trace_clock_local() users to disable preemption themselves. 
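For context, a sketch of how a preemptible caller changes with this fix; sample_timestamp() is a made-up helper, shown only to illustrate the calling convention:

        static u64 sample_timestamp(void)
        {
                u64 ts;

                /*
                 * Before this fix, a preemptible caller had to pin itself
                 * for the duration of the read:
                 *
                 *      preempt_disable();
                 *      ts = trace_clock_local();
                 *      preempt_enable();
                 *
                 * With the fix, trace_clock_local() masks interrupts around
                 * its sched_clock() call internally, so a plain call is
                 * safe from preemptible context:
                 */
                ts = trace_clock_local();

                return ts;
        }
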
Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_clock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 2d4953f93560..05b176abfd30 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -27,12 +27,19 @@ */ u64 notrace trace_clock_local(void) { + unsigned long flags; + u64 clock; + /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - return sched_clock(); + raw_local_irq_save(flags); + clock = sched_clock(); + raw_local_irq_restore(flags); + + return clock; } /* -- cgit v1.2.3-59-g8ed1b From 80370cb758e7ca2692cd9fb5e413d970b1f4b2b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 17:16:35 -0400 Subject: tracing: use raw spinlocks for trace_vprintk Impact: prevent locking up by lockdep tracer The lockdep tracer uses trace_vprintk and thus trace_vprintk can not call back into lockdep without locking up. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8c6a902db40a..4c97947650ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1176,7 +1176,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) */ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + static raw_spinlock_t trace_buf_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -1201,7 +1202,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + /* Lockdep uses trace_printk for lock tracing */ + local_irq_save(flags); + __raw_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) @@ -1220,7 +1223,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); + __raw_spin_unlock(&trace_buf_lock); + local_irq_restore(flags); out: ftrace_preempt_enable(resched); -- cgit v1.2.3-59-g8ed1b From 73c5162aa362a543793f4a957c6c536dcbaa89ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 13:42:01 -0400 Subject: tracing: keep ring buffer to minimum size till used Impact: less memory impact on systems not using tracer When the kernel boots up that has tracing configured, it allocates the default size of the ring buffer. This currently happens to be 1.4Megs per possible CPU. This is quite a bit of wasted memory if the system is never using the tracer. The current solution is to keep the ring buffers to a minimum size until the user uses them. Once a tracer is piped into the current_tracer the ring buffer will be expanded to the default size. If the user changes the size of the ring buffer, it will take the size given by the user immediately. If the user adds a "ftrace=" to the kernel command line, then the ring buffers will be set to the default size on initialization. 
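A simplified sketch of the policy described above; the names are abbreviated and expand_to_default() stands in for the real resize path:

        static int ring_buffer_expanded;        /* stays 0 until tracing is first used */

        static int maybe_expand(unsigned long default_size)
        {
                /* boot keeps the buffers minimal; first real use grows them */
                if (ring_buffer_expanded)
                        return 0;

                ring_buffer_expanded = 1;
                return expand_to_default(default_size); /* made-up helper */
        }
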
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 79 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c97947650ae..0c1dc1850858 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,6 +44,12 @@ unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +static int ring_buffer_expanded; + /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -128,6 +134,8 @@ static int __init set_ftrace(char *str) { strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; + /* We are using ftrace early, expand it */ + ring_buffer_expanded = 1; return 1; } __setup("ftrace=", set_ftrace); @@ -2315,6 +2323,40 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } +static int tracing_resize_ring_buffer(unsigned long size) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * it get completed. + */ + ring_buffer_expanded = 1; + + ret = ring_buffer_resize(global_trace.buffer, size); + if (ret < 0) + return ret; + + ret = ring_buffer_resize(max_tr.buffer, size); + if (ret < 0) { + int r; + + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; + } + return ret; + } + + global_trace.entries = size; + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * @@ -2330,6 +2372,13 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + if (!ring_buffer_expanded) { + ret = tracing_resize_ring_buffer(trace_buf_size); + if (ret < 0) + return ret; + ret = 0; + } + mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) @@ -2903,28 +2952,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, val <<= 10; if (val != global_trace.entries) { - ret = ring_buffer_resize(global_trace.buffer, val); + ret = tracing_resize_ring_buffer(val); if (ret < 0) { cnt = ret; goto out; } - - ret = ring_buffer_resize(max_tr.buffer, val); - if (ret < 0) { - int r; - cnt = ret; - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); - if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! 
*/ - WARN_ON(1); - tracing_disabled = 1; - } - goto out; - } - - global_trace.entries = val; } filp->f_pos += cnt; @@ -3916,6 +3948,7 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; + int ring_buf_size; int i; int ret = -ENOMEM; @@ -3928,12 +3961,18 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(trace_buf_size, + global_trace.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); @@ -3944,7 +3983,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(trace_buf_size, + max_tr.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -- cgit v1.2.3-59-g8ed1b From 1852fcce181faa237c010a3dbedb473cf9d4555f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 14:33:00 -0400 Subject: tracing: expand the ring buffers when an event is activated To save memory, the tracer ring buffers are set to a minimum. The activating of a trace expands the ring buffer size. This patch adds this expanding, when an event is activated. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 8 ++++++++ 3 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c1dc1850858..35ee63ae4122 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2357,6 +2357,26 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. 
+ */ +int tracing_update_buffers(void) +{ + int ret = 0; + + if (!ring_buffer_expanded) + ret = tracing_resize_ring_buffer(trace_buf_size); + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..336324d717f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -737,6 +737,9 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + /* trace event type bit fields, not numeric */ enum { TRACE_EVENT_TYPE_PRINTF = 1, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 769dfd00fc85..ca624df73591 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -141,6 +141,10 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + ret = get_user(ch, ubuf++); if (ret) return ret; @@ -331,6 +335,10 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + switch (val) { case 0: case 1: -- cgit v1.2.3-59-g8ed1b From 9aba60fe6eb20453de53a572143bef22fa929fba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 19:52:30 -0400 Subject: tracing: fix trace_wait to know to wait on all cpus or just one Impact: fix to task live locking on reading trace_pipe on one CPU The same code is used for both trace_pipe (all CPUS) and the per_cpu trace_pipe file. When there is no data to read, it will check for signals and wait on the trace wait queue. The problem happens with the per_cpu wait. The trace_wait code checks all CPUs. Thus, if there's data in another CPU buffer, then it will exit the wait, without checking for signals or waiting on the wait queue. It would then try to read the empty buffer, and since that will just return nothing, then it will try to wait again. Unfortunately, that will again fail due to there still being data in the other buffers. This ends up with a live lock for the task. This patch fixes the trace_wait to be aware that the iterator may only be waiting on a single buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 35ee63ae4122..e60f4be10d64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1666,6 +1666,19 @@ static int trace_empty(struct trace_iterator *iter) { int cpu; + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + cpu = iter->cpu_file; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } + return 1; + } + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) { if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) -- cgit v1.2.3-59-g8ed1b From 554f786e284a6ce859d51f62240d615603944c8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 22:00:13 -0400 Subject: ring-buffer: only allocate buffers for online cpus Impact: save on memory Currently, a ring buffer was allocated for each "possible_cpus". On some systems, this is the same as NR_CPUS. 
Thus, if a system defined NR_CPUS = 64 but it only had 1 CPU, we could have possibly 63 useless ring buffers taking up space. With a default buffer of 3 megs, this could be quite drastic. This patch changes the ring buffer code to only allocate ring buffers for online CPUs. If a CPU goes off line, we do not free the buffer. This is because the user may still have trace data in that buffer that they would like to look at. Perhaps in the future we could add code to delete a ring buffer if the CPU is offline and the ring buffer becomes empty. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.c | 6 - 2 files changed, 216 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 178858492a89..d07c2888396f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "trace.h" @@ -301,6 +302,10 @@ struct ring_buffer { struct mutex mutex; struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG + struct notifier_block cpu_notify; +#endif }; struct ring_buffer_iter { @@ -459,6 +464,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif + /** * ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -496,7 +506,8 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - cpumask_copy(buffer->cpumask, cpu_possible_mask); + get_online_cpus(); + cpumask_copy(buffer->cpumask, cpu_online_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -512,6 +523,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } +#ifdef CONFIG_HOTPLUG + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + register_cpu_notifier(&buffer->cpu_notify); +#endif + + put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -525,6 +543,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) fail_free_cpumask: free_cpumask_var(buffer->cpumask); + put_online_cpus(); fail_free_buffer: kfree(buffer); @@ -541,9 +560,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; + get_online_cpus(); + +#ifdef CONFIG_HOTPLUG + unregister_cpu_notifier(&buffer->cpu_notify); +#endif + for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + put_online_cpus(); + free_cpumask_var(buffer->cpumask); kfree(buffer); @@ -649,16 +676,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; mutex_lock(&buffer->mutex); + get_online_cpus(); nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); if (size < buffer_size) { /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) + goto out_fail; rm_pages = buffer->pages - nr_pages; @@ -677,10 +703,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. 
Otherwise we just free * them all and return -ENOMEM; */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) + goto out_fail; new_pages = nr_pages - buffer->pages; @@ -705,13 +729,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - if (RB_WARN_ON(buffer, !list_empty(&pages))) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, !list_empty(&pages))) + goto out_fail; out: buffer->pages = nr_pages; + put_online_cpus(); mutex_unlock(&buffer->mutex); return size; @@ -721,8 +744,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) list_del_init(&bpage->list); free_buffer_page(bpage); } + put_online_cpus(); mutex_unlock(&buffer->mutex); return -ENOMEM; + + /* + * Something went totally wrong, and we are too paranoid + * to even clean up the mess. + */ + out_fail: + put_online_cpus(); + mutex_unlock(&buffer->mutex); + return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1528,11 +1561,15 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1548,11 +1585,15 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1564,12 +1605,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->entries; + ret = cpu_buffer->entries; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -1581,12 +1629,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->overrun; + ret = cpu_buffer->overrun; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); @@ -1603,12 +1658,16 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } + put_online_cpus(); + return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1626,12 +1685,16 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; 
} + put_online_cpus(); + return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -1663,9 +1726,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -1900,9 +1968,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - cpu_buffer = buffer->buffers[cpu]; again: @@ -2028,13 +2093,21 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_event *event = NULL; unsigned long flags; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return event; } @@ -2071,24 +2144,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event = NULL; unsigned long flags; + /* might be called in atomic */ + preempt_disable(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; + cpu_buffer = buffer->buffers[cpu]; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); if (!event) - goto out; + goto out_unlock; rb_advance_reader(cpu_buffer); - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + preempt_enable(); + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2109,15 +2189,17 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter; + struct ring_buffer_iter *iter = NULL; unsigned long flags; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return NULL; + goto out; cpu_buffer = buffer->buffers[cpu]; @@ -2132,6 +2214,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2224,9 +2309,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; + int resched; + + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2237,6 +2326,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + 
ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2246,10 +2337,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { + int resched; int cpu; + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); + for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); + + ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2262,12 +2359,17 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + get_online_cpus(); + /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!rb_per_cpu_empty(cpu_buffer)) return 0; } + + put_online_cpus(); + return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2280,12 +2382,20 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + int ret = 1; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; + goto out; cpu_buffer = buffer->buffers[cpu]; - return rb_per_cpu_empty(cpu_buffer); + ret = rb_per_cpu_empty(cpu_buffer); + + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); @@ -2304,32 +2414,37 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - return -EINVAL; + goto out; /* At least make sure the two buffers are somewhat the same */ if (buffer_a->pages != buffer_b->pages) - return -EINVAL; + goto out; + + ret = -EAGAIN; if (ring_buffer_flags != RB_BUFFERS_ON) - return -EAGAIN; + goto out; if (atomic_read(&buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&buffer_b->record_disabled)) - return -EAGAIN; + goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; if (atomic_read(&cpu_buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) - return -EAGAIN; + goto out; /* * We can't do a synchronize_sched here because this @@ -2349,7 +2464,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - return 0; + ret = 0; +out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2464,27 +2583,32 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + /* * If len is not big enough to hold the page header, then * we can not copy anything. 
*/ if (len <= BUF_PAGE_HDR_SIZE) - return -1; + goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page) - return -1; + goto out; bpage = *data_page; if (!bpage) - return -1; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out; + goto out_unlock; event = rb_reader_event(cpu_buffer); @@ -2506,7 +2630,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int size; if (full) - goto out; + goto out_unlock; if (len > (commit - read)) len = (commit - read); @@ -2514,7 +2638,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, size = rb_event_length(event); if (len < size) - goto out; + goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -2553,9 +2677,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } ret = read; - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return ret; } @@ -2629,3 +2756,42 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); + +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpu_isset(cpu, *buffer->cpumask)) + return NOTIFY_OK; + + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpu_set(cpu, *buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. + */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e60f4be10d64..14c98f6a47bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1805,17 +1805,11 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail_buffer; } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail; } /* TODO stop tracer */ -- cgit v1.2.3-59-g8ed1b From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 01:30:40 +0100 Subject: locking: rename trace_softirq_[enter|exit] => lockdep_softirq_[enter|exit] Impact: cleanup The naming clashes with upcoming softirq tracepoints, so rename the APIs to lockdep_*(). 
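For context, DECLARE_TRACE(name, ...) generates a trace_<name>() helper for each tracepoint, which is why annotation macros spelled trace_softirq_*() would clash with softirq tracepoints; a brief sketch follows (the tracepoint name used here is an assumption, for illustration only):

        /* A tracepoint declaration like this ... */
        DECLARE_TRACE(softirq_exit, TP_PROTO(int nr), TP_ARGS(nr));

        /* ... provides a trace_softirq_exit() call, colliding with the old
         * macro name.  The lockdep context annotations therefore become: */
        lockdep_softirq_enter();
        /* ... run the pending softirq handlers ... */
        lockdep_softirq_exit();
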
Requested-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 8 ++++---- kernel/softirq.c | 4 ++-- lib/locking-selftest.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c9..b02a3f1d46a0 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -24,8 +24,8 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define trace_softirq_enter() do { current->softirq_context++; } while (0) -# define trace_softirq_exit() do { current->softirq_context--; } while (0) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -38,8 +38,8 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) -# define trace_softirq_enter() do { } while (0) -# define trace_softirq_exit() do { } while (0) +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) # define INIT_TRACE_IRQFLAGS #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 9041ea7948fe..08a030f85416 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: @@ -220,7 +220,7 @@ restart: if (pending) wakeup_softirqd(); - trace_softirq_exit(); + lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 280332c1827c..619313ed6c46 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -157,11 +157,11 @@ static void init_shared_classes(void) #define SOFTIRQ_ENTER() \ local_bh_disable(); \ local_irq_disable(); \ - trace_softirq_enter(); \ + lockdep_softirq_enter(); \ WARN_ON(!in_softirq()); #define SOFTIRQ_EXIT() \ - trace_softirq_exit(); \ + lockdep_softirq_exit(); \ local_irq_enable(); \ local_bh_enable(); -- cgit v1.2.3-59-g8ed1b From a123c52b46a1f84bcec3dc963351896c6d6afaf7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:21:08 -0400 Subject: tracing: fix comments about trace buffer resizing Impact: cleanup Some of the comments about the trace buffer resizing is gobbledygook. And I wonder why people question if I'm a native English speaker. This patch makes the comments make a bit more sense. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c3946a6df34e..c61ee85c50bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2336,7 +2336,8 @@ static int tracing_resize_ring_buffer(unsigned long size) /* * If kernel or user changes the size of the ring buffer - * it get completed. + * we use the size that was given, and we can forget about + * expanding it later. 
*/ ring_buffer_expanded = 1; @@ -2351,8 +2352,20 @@ static int tracing_resize_ring_buffer(unsigned long size) r = ring_buffer_resize(global_trace.buffer, global_trace.entries); if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! */ + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ WARN_ON(1); tracing_disabled = 1; } -- cgit v1.2.3-59-g8ed1b From 1027fcb206a0fb8348e63aff078c74bdee1c2698 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:33:20 -0400 Subject: tracing: protect ring_buffer_expanded with trace_types_lock Impact: prevent races with ring_buffer_expanded This patch places the expanding of the tracing buffer under the protection of the trace_types_lock mutex. It is highly unlikely that there would be any contention, but better safe than sorry. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c61ee85c50bb..04ab8243a13d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2391,8 +2391,10 @@ int tracing_update_buffers(void) { int ret = 0; + mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) ret = tracing_resize_ring_buffer(trace_buf_size); + mutex_unlock(&trace_types_lock); return ret; } @@ -2412,6 +2414,8 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) @@ -2419,7 +2423,6 @@ static int tracing_set_tracer(const char *buf) ret = 0; } - mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; -- cgit v1.2.3-59-g8ed1b From 59222efe2d184956464abe5b637bc842ff053b93 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:46:03 -0400 Subject: ring-buffer: use CONFIG_HOTPLUG_CPU not CONFIG_HOTPLUG The hotplug code in the ring buffers is for use with CPU hotplug, not generic hotplug. 
Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d07c2888396f..035b56c3a6c9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -303,7 +303,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif }; @@ -464,7 +464,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); #endif @@ -523,7 +523,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; register_cpu_notifier(&buffer->cpu_notify); @@ -562,7 +562,7 @@ ring_buffer_free(struct ring_buffer *buffer) get_online_cpus(); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU unregister_cpu_notifier(&buffer->cpu_notify); #endif @@ -2757,7 +2757,7 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { -- cgit v1.2.3-59-g8ed1b From 8aabee573dff131a085c63de7667eacd94ba4ccb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:13:49 -0400 Subject: ring-buffer: remove unneeded get_online_cpus Impact: speed up and remove possible races The get_online_cpus was added to the ring buffer because the original design would free the ring buffer on a CPU that was being taken off line. The final design kept the ring buffer around even when the CPU was taken off line. This is to allow a user to still read the information on that ring buffer. Most of the get_online_cpus are no longer needed since the ring buffer will not disappear from the use cases. 
Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 90 ++++++++-------------------------------------- 1 file changed, 14 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 035b56c3a6c9..2c36be9fac2e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1561,15 +1561,11 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1585,15 +1581,11 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1605,17 +1597,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->entries; - out: - put_online_cpus(); return ret; } @@ -1629,17 +1617,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->overrun; - out: - put_online_cpus(); return ret; } @@ -1658,16 +1642,12 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } - put_online_cpus(); - return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1685,16 +1665,12 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; } - put_online_cpus(); - return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -2093,21 +2069,16 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event = NULL; + struct ring_buffer_event *event; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return event; } @@ -2189,17 +2160,15 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter 
*iter = NULL; + struct ring_buffer_iter *iter; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - goto out; + return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2214,9 +2183,6 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2309,13 +2275,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - int resched; - - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2326,8 +2288,6 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2337,16 +2297,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { - int resched; int cpu; - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); - for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); - - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2359,8 +2313,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; - get_online_cpus(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -2368,8 +2320,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) return 0; } - put_online_cpus(); - return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2382,18 +2332,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - int ret = 1; - - get_online_cpus(); + int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 1; cpu_buffer = buffer->buffers[cpu]; ret = rb_per_cpu_empty(cpu_buffer); - out: - put_online_cpus(); return ret; } @@ -2416,8 +2362,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; @@ -2466,8 +2410,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = 0; out: - put_online_cpus(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2583,8 +2525,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; @@ -2681,8 +2621,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: - put_online_cpus(); - return ret; } -- cgit v1.2.3-59-g8ed1b From db526ca329f855510e8ce672332eba3304aed590 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:53:25 -0400 Subject: tracing: show that buffer size is not expanded Impact: do not confuse user on small trace buffer sizes When the system boots up, the trace 
buffer is small to conserve memory. It is only two pages per online CPU. When the tracer is used, it expands to the default value. This can confuse the user if they look at the buffer size and see only 7, but then later they see 1408. # cat /debug/tracing/buffer_size_kb 7 # echo sched_switch > /debug/tracing/current_tracer # cat /debug/tracing/buffer_size_kb 1408 This patch tries to help remove this confustion by showing that the buffer has not been expanded. # cat /debug/tracing/buffer_size_kb 7 (expanded: 1408) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 04ab8243a13d..62a63b2b33dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2948,10 +2948,18 @@ tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; + char buf[96]; int r; - r = sprintf(buf, "%lu\n", tr->entries >> 10); + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + tr->entries >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", tr->entries >> 10); + mutex_unlock(&trace_types_lock); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -- cgit v1.2.3-59-g8ed1b From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Mar 2009 18:24:49 +0100 Subject: tracing/core: bring back raw trace_printk for dynamic formats strings Impact: fix callsites with dynamic format strings Since its new binary implementation, trace_printk() internally uses static containers for the format strings on each callsites. But the value is assigned once at build time, which means that it can't take dynamic formats. So this patch unearthes the raw trace_printk implementation for the callers that will need trace_printk to be able to carry these dynamic format strings. The trace_printk() macro will use the appropriate implementation for each callsite. Most of the time however, the binary implementation will still be used. The other impact of this patch is that mmiotrace_printk() will use the old implementation because it calls the low level trace_vprintk and we can't guess here whether the format passed in it is dynamic or not. Some parts of this patch have been written by Steven Rostedt (most notably the part that chooses the appropriate implementation for each callsites). Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 40 +++++++++++------ kernel/trace/trace.c | 85 +++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 13 +++++- kernel/trace/trace_event_types.h | 11 ++++- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_mmiotrace.c | 7 +-- kernel/trace/trace_output.c | 57 +++++++++++++++++++++--- kernel/trace/trace_printk.c | 33 +++++++++++--- 8 files changed, 213 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7742798c9208..1daca3b062bb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -452,31 +452,45 @@ do { \ #define trace_printk(fmt, args...) 
\ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ __trace_printk_check_format(fmt, ##args); \ - __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) +extern int +__trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ #define ftrace_vprintk(fmt, vargs) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ \ - __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ } while (0) +extern int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62a63b2b33dd..dbb077d8a172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace) /** - * trace_vprintk - write binary msg to tracing buffer + * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; + struct bprint_entry *entry; unsigned long flags; int resched; int cpu, len = 0, size, pc; @@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1240,6 +1240,60 @@ out: return len; } +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + 
preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event); + + out_unlock: + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + preempt_enable_notrace(); + + return len; +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_bprintk_msg_only(iter); + if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 336324d717f8..cede1ab49d07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -117,7 +118,7 @@ struct userstack_entry { /* * trace_printk entry: */ -struct print_entry { +struct bprint_entry { struct trace_entry ent; unsigned long ip; int depth; @@ -125,6 +126,13 @@ struct print_entry { u32 buf[]; }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + int depth; + char buf[]; +}; + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int +trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +extern int 
trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5cca4c978bde..d0907d746425 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) @@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TP_RAW_FMT("%08lx (%d) fmt:%p %s") +); + TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8566c14b3e9a..4c388607ed67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct print_entry *trace, struct trace_seq *s, +print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { int i; @@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_PRINT: { - struct print_entry *field; + case TRACE_BPRINT: { + struct bprint_entry *field; trace_assign_type(field, entry); return print_graph_comment(field, s, entry, iter); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 23e346a734ca..f095916e477f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); @@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. 
*/ - ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_bprintf(s, print->fmt, print->buf); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 491832af9ba1..ea9d3b410c7a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = { .binary = trace_special_bin, }; -/* TRACE_PRINT */ +/* TRACE_BPRINT */ static enum print_line_t -trace_print_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct print_entry *field; + struct bprint_entry *field; trace_assign_type(field, entry); @@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags) } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags) { - struct print_entry *field; + struct bprint_entry *field; struct trace_seq *s = &iter->seq; trace_assign_type(field, iter->ent); @@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) } +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = { &trace_special_event, &trace_stack_event, &trace_user_stack_event, + &trace_bprint_event, &trace_print_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a50aea22e929..f307a11e2332 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; -int __trace_printk(unsigned long ip, const char *fmt, ...) +int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) 
return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__trace_printk); +EXPORT_SYMBOL_GPL(__trace_bprintk); -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) { if (unlikely(!fmt)) return 0; @@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; + return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); - static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3-59-g8ed1b From 828275574e0161bdddb5817d4bd76a0265ef0470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:14:31 -0400 Subject: tracing: make bprint event use the proper event id The bprint record is using TRACE_PRINT when it should be TRACE_BPRINT. Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d0907d746425..019915063fe6 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) -- cgit v1.2.3-59-g8ed1b From e9fb2b6d5845e24f104713591286b6f39761c027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:19:25 -0400 Subject: tracing: have event_trace_printk use static tracer Impact: speed up on event tracing The event_trace_printk is currently a wrapper function that calls trace_vprintk. Because it uses a variable for the fmt it misses out on the optimization of using the binary printk. This patch makes event_trace_printk into a macro wrapper to use the fmt as the same as the trace_printks. 
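To illustrate what the constant-format check buys (the caller below is hypothetical; only event_trace_printk() itself comes from this patch): a string literal makes __builtin_constant_p(fmt) true, so only the format pointer and the raw arguments are recorded, while a format built at run time falls back to the slower string path.

	/* Sketch: my_event_handler() and dyn_fmt are illustrative only. */
	static void my_event_handler(int irq, const char *dyn_fmt)
	{
		/* literal format: binary __trace_bprintk() path */
		event_trace_printk(_THIS_IP_, "irq %d fired\n", irq);

		/* run-time format: formatted and copied via __trace_printk() */
		event_trace_printk(_THIS_IP_, dyn_fmt, irq);
	}
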
Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 17 +++++++++++++++++ kernel/trace/trace_events.c | 10 ---------- 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cede1ab49d07..35cfa7bbaf38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,4 +773,21 @@ void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ca624df73591..238ea95a4115 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,16 +24,6 @@ static DEFINE_MUTEX(event_mutex); (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -void event_trace_printk(unsigned long ip, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - tracing_record_cmdline(current); - trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); -} - static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; -- cgit v1.2.3-59-g8ed1b From 7975a2be16dd42df2cab80c80cb6ece382edb6ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:23:17 -0400 Subject: tracing: export trace formats to user space The binary printk saves a pointer to the format string in the ring buffer. On output, the format is processed. But if the user is reading the ring buffer through a binary interface, the pointer is meaningless. This patch creates a file called printk_formats that maps the pointers to the formats. 
# cat /debug/tracing/printk_formats 0xffffffff80713d40 : "irq_handler_entry: irq=%d handler=%s\n" 0xffffffff80713d48 : "lock_acquire: %s%s%s\n" 0xffffffff80713d50 : "lock_release: %s\n" Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f307a11e2332..486785214e3e 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -4,18 +4,19 @@ * Copyright (C) 2008 Lai Jiangshan * */ +#include +#include +#include #include #include #include +#include +#include +#include #include #include -#include #include -#include -#include #include -#include -#include #include "trace.h" @@ -153,6 +154,114 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + const char **fmt = m->private; + const char **next = fmt; + + (*pos)++; + + if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) + return NULL; + + next = fmt; + m->private = ++next; + + return fmt; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + return t_next(m, NULL, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + const char **fmt = v; + const char *str = *fmt; + int i; + + seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + + /* + * Tabs and new lines need to be converted. + */ + for (i = 0; str[i]; i++) { + switch (str[i]) { + case '\n': + seq_puts(m, "\\n"); + break; + case '\t': + seq_puts(m, "\\t"); + break; + case '\\': + seq_puts(m, "\\"); + break; + case '"': + seq_puts(m, "\\\""); + break; + default: + seq_putc(m, str[i]); + } + } + seq_puts(m, "\"\n"); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations show_format_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static int +ftrace_formats_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &show_format_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = __start___trace_bprintk_fmt; + } + return ret; +} + +static const struct file_operations ftrace_formats_fops = { + .open = ftrace_formats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int init_trace_printk_function_export(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("printk_formats", 0444, d_tracer, + NULL, &ftrace_formats_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'printk_formats' entry\n"); + + return 0; +} + +fs_initcall(init_trace_printk_function_export); + static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3-59-g8ed1b From 2da03ecee6308ea174e8a02b92a3c4ec92e886c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 18:57:51 -0400 Subject: tracing: fix stack tracer header The stack tracer use to look like this: # cat /debug/tracing/stack_trace Depth Size Location (57 entries) ----- ---- -------- 0) 5088 16 mempool_alloc_slab+0x16/0x18 1) 5072 144 mempool_alloc+0x4d/0xfe 2) 4928 16 scsi_sg_alloc+0x48/0x4a [scsi_mod] Now it looks like this: # cat /debug/tracing/stack_trace Depth Size Location (57 entries) ----- ---- 
-------- 0) 5088 16 mempool_alloc_slab+0x16/0x18 1) 5072 144 mempool_alloc+0x4d/0xfe 2) 4928 16 scsi_sg_alloc+0x48/0x4a [scsi_mod] Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d0871bc0aca5..4564fd94b0cd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -251,9 +251,9 @@ static int t_show(struct seq_file *m, void *v) int size; if (v == SEQ_START_TOKEN) { - seq_printf(m, " Depth Size Location" + seq_printf(m, " Depth Size Location" " (%d entries)\n" - " ----- ---- --------\n", + " ----- ---- --------\n", max_stack_trace.nr_entries); return 0; } -- cgit v1.2.3-59-g8ed1b From e447e1df2e568cd43d1918963c9f09fae85aea57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 19:42:29 -0400 Subject: tracing: explain why stack tracer is empty If the stack tracing is disabled (by default) the stack_trace file will only contain the header: # cat /debug/tracing/stack_trace Depth Size Location (0 entries) ----- ---- -------- This can be frustrating to a developer that does not realize that the stack tracer is disabled. This patch adds the following text: # cat /debug/tracing/stack_trace Depth Size Location (0 entries) ----- ---- -------- # # Stack tracer disabled # # To enable the stack tracer, either add 'stacktrace' to the # kernel command line # or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled' # Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4564fd94b0cd..91ccbf396c9a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -245,6 +245,17 @@ static int trace_lookup_stack(struct seq_file *m, long i) #endif } +static void print_disabled(struct seq_file *m) +{ + seq_puts(m, "#\n" + "# Stack tracer disabled\n" + "#\n" + "# To enable the stack tracer, either add 'stacktrace' to the\n" + "# kernel command line\n" + "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" + "#\n"); +} + static int t_show(struct seq_file *m, void *v) { long i; @@ -255,6 +266,10 @@ static int t_show(struct seq_file *m, void *v) " (%d entries)\n" " ----- ---- --------\n", max_stack_trace.nr_entries); + + if (!stack_tracer_enabled && !max_stack_size) + print_disabled(m); + return 0; } -- cgit v1.2.3-59-g8ed1b From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:33:36 -0400 Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name array Create a 'softirq_to_name' array, which is indexed by softirq #, so that we can easily convert between the softirq index # and its name, in order to get more meaningful output messages. LKML-Reference: <20090312183336.GB3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 5 +++++ kernel/softirq.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 472f11765f60..9b7e9d743476 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -258,6 +258,11 @@ enum NR_SOFTIRQS }; +/* map softirq index to softirq name. update 'softirq_to_name' in + * kernel/softirq.c when adding a new softirq. 
+ */ +extern char *softirq_to_name[NR_SOFTIRQS]; + /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 7571bcb71be4..9f90fdc039f4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +char *softirq_to_name[NR_SOFTIRQS] = { + "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", + "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", + "RCU_SOFTIRQ" +}; + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -209,9 +215,10 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %p" + printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, + softirq_to_name[h - softirq_vec], h->action, prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3-59-g8ed1b From 39842323ceb368d2ea36ab7696aedbe296e13b61 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:36:03 -0400 Subject: tracing: tracepoints for softirq entry/exit - tracepoints Introduce softirq entry/exit tracepoints. These are useful for augmenting existing tracers, and to figure out softirq frequencies and timings. [ s/irq_softirq_/softirq_/ for trace point names and Fixed printf format in TRACE_FORMAT macro - Steven Rostedt ] LKML-Reference: <20090312183603.GC3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/trace/irq_event_types.h | 12 ++++++++++++ kernel/softirq.c | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 214bb928fe9e..85964ebd47ec 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -40,4 +40,16 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? 
"handled" : "unhandled") ); +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + #undef TRACE_SYSTEM diff --git a/kernel/softirq.c b/kernel/softirq.c index 9f90fdc039f4..a5e81231ca7a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -186,6 +187,9 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 +DEFINE_TRACE(softirq_entry); +DEFINE_TRACE(softirq_exit); + asmlinkage void __do_softirq(void) { struct softirq_action *h; @@ -212,8 +216,9 @@ restart: if (pending & 1) { int prev_count = preempt_count(); + trace_softirq_entry(h, softirq_vec); h->action(h); - + trace_softirq_exit(h, softirq_vec); if (unlikely(prev_count != preempt_count())) { printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," -- cgit v1.2.3-59-g8ed1b From 889a6c367283709a80dad9413488472596a1a1d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 13 Mar 2009 09:03:04 +0900 Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer fix commit c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 "Don't use tracing_record_cmdline() in workqueue tracer" has a race window. find_task_by_vpid() requires task_list_lock(). LKML-Reference: <20090313090042.43CD.A69D9226@jp.fujitsu.com> Signed-off-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_workqueue.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index fb5ccac8bbc0..9ab035b58cf1 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -193,12 +193,20 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct cpu_workqueue_stats *cws = p; unsigned long flags; int cpu = cws->cpu; - struct task_struct *tsk = find_task_by_vpid(cws->pid); - - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), - cws->executed, - tsk ? tsk->comm : "<...>"); + struct pid *pid; + struct task_struct *tsk; + + pid = find_get_pid(cws->pid); + if (pid) { + tsk = get_pid_task(pid, PIDTYPE_PID); + if (tsk) { + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), cws->executed, + tsk->comm); + put_task_struct(tsk); + } + put_pid(pid); + } spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (&cws->list == workqueue_cpu_stat(cpu)->list.next) -- cgit v1.2.3-59-g8ed1b From f28e55765e40450c127e44d00ae65d0cd1a4efec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 22:00:19 -0400 Subject: tracing: show event name in trace for TRACE_EVENT created events Unlike TRACE_FORMAT() macros, the TRACE_EVENT() macros do not show the event name in the trace file. Knowing the event type in the trace output is very useful. 
Instead of: task swapper:0 [140] ==> ntpd:3308 [120] We now have: sched_switch: task swapper:0 [140] ==> ntpd:3308 [120] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index ca347afd6aa0..5117c43f5c67 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -57,7 +57,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, print); \ + ret = trace_seq_printf(s, #call ": " print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ -- cgit v1.2.3-59-g8ed1b From 5cc985488845ec7227a2c5cfd2fd62cf57fb411a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 22:24:17 -0400 Subject: ring-buffer: document reader page design In a private email conversation I explained how the ring buffer page worked by using silly ASCII art. Ingo suggested that I add that to the comments of the code. Here it is. Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2c36be9fac2e..58128ad2fde0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,74 @@ #include "trace.h" +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. + * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. 
Turning off the ring buffers -- cgit v1.2.3-59-g8ed1b From eb1871f34358024acfa3523ef375ef14b7527173 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:00:58 -0400 Subject: tracing: left align location header in stack_trace Ingo Molnar suggested, instead of: Depth Size Location (27 entries) ----- ---- -------- 0) 2880 48 lock_timer_base+0x2b/0x4f 1) 2832 80 __mod_timer+0x33/0xe0 2) 2752 16 __ide_set_handler+0x63/0x65 To have it be: Depth Size Location (27 entries) ----- ---- -------- 0) 2880 48 lock_timer_base+0x2b/0x4f 1) 2832 80 __mod_timer+0x33/0xe0 2) 2752 16 __ide_set_handler+0x63/0x65 Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 91ccbf396c9a..c750f65f9661 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -262,9 +262,9 @@ static int t_show(struct seq_file *m, void *v) int size; if (v == SEQ_START_TOKEN) { - seq_printf(m, " Depth Size Location" + seq_printf(m, " Depth Size Location" " (%d entries)\n" - " ----- ---- --------\n", + " ----- ---- --------\n", max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) -- cgit v1.2.3-59-g8ed1b From bdc067582b8b71c7771bab076bbc51569c594fb4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:12:52 -0400 Subject: tracing: add comment for use of double __builtin_consant_p Impact: documentation The use of the double __builtin_contant_p checks in the event_trace_printk can be confusing to developers and reviewers. This patch adds a comment to explain why it is there. Requested-by: KOSAKI Motohiro LKML-Reference: <20090313122235.43EB.A69D9226@jp.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 35cfa7bbaf38..67595b8f0f15 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,6 +776,11 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ #define event_trace_printk(ip, fmt, args...) \ do { \ __trace_printk_check_format(fmt, ##args); \ -- cgit v1.2.3-59-g8ed1b From c69fc56de1df5769f2ec69c915c7ad5afe63804c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: use topology_core_cpumask/topology_thread_cpumask instead of cpu_core_map/cpu_sibling_map Impact: cleanup This is presumably what those definitions are for, and while all archs define cpu_core_map/cpu_sibling map, that's changing (eg. x86 wants to change it to a pointer). 
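As a usage illustration (the helper name is made up; topology_thread_cpumask() and cpumask_first() are the APIs the patch switches to): code that wants a CPU's SMT siblings now asks the topology layer instead of touching the per-CPU map directly.

	#include <linux/topology.h>
	#include <linux/cpumask.h>

	/*
	 * Sketch: pick the first sibling thread of @cpu as its group
	 * representative, the same idiom blk_cpu_to_group() uses after
	 * this change.
	 */
	static int first_sibling_of(int cpu)
	{
		return cpumask_first(topology_thread_cpumask(cpu));
	}
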
Signed-off-by: Rusty Russell --- block/blk.h | 2 +- kernel/sched.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/block/blk.h b/block/blk.h index 0dce92c37496..3ee94358b43d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -102,7 +102,7 @@ static inline int blk_cpu_to_group(int cpu) const struct cpumask *mask = cpu_coregroup_mask(cpu); return cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(per_cpu(cpu_sibling_map, cpu)); + return cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 0a76d0b6f215..5dabd80c3c15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7249,7 +7249,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, { int group; - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; @@ -7278,7 +7278,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); #else group = cpu; @@ -7621,7 +7621,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7632,7 +7632,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); if (i != cpumask_first(this_sibling_map)) continue; -- cgit v1.2.3-59-g8ed1b From 7f96f93f02b7637491a1637dee12dcdcd40b9802 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:37:42 -0400 Subject: tracing: move binary buffers into per cpu directory The binary_buffers directory in /debugfs/tracing held the files to read the trace buffers in a binary format. This held one file per CPU buffer. But we also have a per_cpu directory that holds a way to read the pretty-print formats. This patch moves the binary buffers into the per_cpu_directory: # ls /debug/tracing/per_cpu/cpu1/ trace trace_pipe trace_pipe_raw The new name is called "trace_pipe_raw". The binary buffers always acted similar to trace_pipe, except that they produce raw data. 
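A rough user-space sketch of reading the relocated file (the mount path, CPU number and 4096-byte read size are assumptions; a real consumer should parse the ring buffer page header rather than just dumping bytes):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char page[4096];
		ssize_t n;
		int fd = open("/debug/tracing/per_cpu/cpu1/trace_pipe_raw", O_RDONLY);

		if (fd < 0)
			return 1;

		while ((n = read(fd, page, sizeof(page))) > 0)
			fwrite(page, 1, n, stdout);	/* raw binary page data */

		close(fd);
		return 0;
	}
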
Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbb077d8a172..efe3202c0209 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3543,6 +3543,11 @@ static void tracing_init_debugfs_percpu(long cpu) (void *) cpu, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3826,7 +3831,6 @@ static __init void create_trace_options_dir(void) static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *buffers; struct dentry *entry; int cpu; @@ -3899,26 +3903,6 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_marker' entry\n"); - buffers = debugfs_create_dir("binary_buffers", d_tracer); - - if (!buffers) - pr_warning("Could not create buffers directory\n"); - else { - int cpu; - char buf[64]; - - for_each_tracing_cpu(cpu) { - sprintf(buf, "%d", cpu); - - entry = debugfs_create_file(buf, 0444, buffers, - (void *)(long)cpu, - &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs buffers " - "'%s' entry\n", buf); - } - } - #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, -- cgit v1.2.3-59-g8ed1b From 899039e8746bb9a09b6487ddb8ab2275ce9d0256 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:43:33 -0400 Subject: softirq: no need to have SOFTIRQ in softirq name Impact: clean up It is redundant to have 'SOFTIRQ' in the softirq names. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/softirq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index a5e81231ca7a..65ff3e3961b4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -55,9 +55,8 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", - "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", - "RCU_SOFTIRQ" + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "TASKLET", "SCHED", "HRTIMER", "RCU" }; /* -- cgit v1.2.3-59-g8ed1b From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:52:59 +0100 Subject: tracing/ftrace: syscall tracing infrastructure, basics Provide basic callbacks to do syscall tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com> [ simplified it to a trace_printk() for now. 
] Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 21 ++++++++ kernel/trace/Kconfig | 10 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 2 + kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 kernel/trace/trace_syscalls.c (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e1583f2639b0..c146c1021a29 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +/* + * A syscall entry in the ftrace syscalls array. + * + * @syscall_nr: syscall number + */ +struct syscall_trace_entry { + int syscall_nr; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e4a2a61cd75..95a0ad191f19 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool +config HAVE_FTRACE_SYSCALLS + bool + config TRACER_MAX_TRACE bool @@ -175,6 +178,13 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_FTRACE_SYSCALLS + select TRACING + help + Basic tracer to catch the syscall entry and exit events. 
+ config BOOT_TRACER bool "Trace boot initcalls" select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7a2943796eb..c3feea01c3e0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..3d49daae47dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 000000000000..66cf97449af3 --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,113 @@ +#include +#include + +#include + +#include "trace_output.h" +#include "trace.h" + +static atomic_t refcount; + +void start_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_inc_return(&refcount) != 1) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_dec(&refcount); +} + +void stop_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_dec_return(&refcount)) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_inc(&refcount); +} + +void ftrace_syscall_enter(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d enter\n", syscall_nr); +} + +void ftrace_syscall_exit(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d exit\n", syscall_nr); +} + +static int init_syscall_tracer(struct trace_array *tr) +{ + start_ftrace_syscalls(); + + return 0; +} + +static void reset_syscall_tracer(struct trace_array *tr) +{ + stop_ftrace_syscalls(); +} + +static struct trace_event syscall_enter_event = { + .type = TRACE_SYSCALL_ENTER, +}; + +static struct trace_event syscall_exit_event = { + .type = TRACE_SYSCALL_EXIT, +}; + +static struct tracer syscall_tracer __read_mostly = { + .name = "syscall", + .init = init_syscall_tracer, + .reset = reset_syscall_tracer +}; + +__init int register_ftrace_syscalls(void) +{ + int ret; + + ret = register_ftrace_event(&syscall_enter_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_enter_event.type); + WARN_ON_ONCE(1); + } + + ret = register_ftrace_event(&syscall_exit_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_exit_event.type); + WARN_ON_ONCE(1); + } + + return register_tracer(&syscall_tracer); +} +device_initcall(register_ftrace_syscalls); -- cgit v1.2.3-59-g8ed1b From b00f0b6dc1773b4c8f538503247da050b5ea631b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 13 Mar 2009 17:14:01 +0800 Subject: ftrace: avoid double-free of dyn_ftrace If dyn_ftrace is freed before ftrace_release(), ftrace_release() will free it again and make 
ftrace_free_records wrong. Signed-off-by: Zhao Lei Cc: "Steven Rostedt ;" LKML-Reference: <49BA23D9.1050900@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d33d306bdcf4..26c45aaf6805 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -356,7 +356,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) + if ((rec->ip >= s) && (rec->ip < e) && + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); -- cgit v1.2.3-59-g8ed1b From fa9d13cf135efbd454453a53b6299976bea245a9 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 13 Mar 2009 17:16:34 +0800 Subject: ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec Do __ftrace_replace_code for !FTRACE_FL_CONVERTED rec will always fail, we should ignore this rec. Signed-off-by: Zhao Lei Cc: "Steven Rostedt ;" LKML-Reference: <49BA2472.4060206@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c45aaf6805..08f4a624e31f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -532,11 +532,12 @@ static void ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { /* - * Skip over free records and records that have - * failed. + * Skip over free records, records that have + * failed and not converted. */ if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED) + rec->flags & FTRACE_FL_FAILED || + rec->flags & FTRACE_FL_CONVERTED) continue; /* ignore updates to this record's mcount site */ @@ -548,7 +549,7 @@ static void ftrace_replace_code(int enable) } failed = __ftrace_replace_code(rec, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + if (failed) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { -- cgit v1.2.3-59-g8ed1b From 641cd4cfcdc71ce01535b31cc4d57d59a1fae1fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:47:34 +0100 Subject: generic-ipi: eliminate WARN_ON()s during oops/panic Do not output smp-call related warnings in the oops/panic codepath. 
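The resulting check, condensed from the hunks below, is simply:

    /* Can deadlock when called with interrupts disabled */
    WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);

so the debug warning still fires during normal operation but is suppressed once oops/panic handling has raised oops_in_progress, and WARN_ON_ONCE keeps it from being repeated endlessly.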
Reported-by: Jan Beulich Acked-by: Peter Zijlstra LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/smp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7ad2262d2eca..858baac568ee 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -285,7 +286,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, this_cpu = get_cpu(); /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,7 +330,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, csd_lock(data); /* Can deadlock when called with interrupts disabled */ - WARN_ON(wait && irqs_disabled()); + WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,7 +366,7 @@ void smp_call_function_many(const struct cpumask *mask, int cpu, next_cpu, this_cpu = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3-59-g8ed1b From ffd71da4e3f323b7673b061e6f7e0d0c12dc2b49 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:54:24 +0100 Subject: panic: decrease oops_in_progress only after having done the panic Impact: eliminate secondary warnings during panic() We can panic() in a number of difficult, atomic contexts, hence we use bust_spinlocks(1) in panic() to increase oops_in_progress, which prevents various debug checks we have in place. But in practice this protection only covers the first few printk's done by panic() - it does not cover the later attempt to stop all other CPUs and kexec(). If a secondary warning triggers in one of those facilities that can make the panic message scroll off. So do bust_spinlocks(0) only much later in panic(). (which code is only reached if panic policy is relaxed that it can return after a warning message) Reported-by: Jan Beulich LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 32fe4eff1b89..57fb005de546 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -77,7 +77,6 @@ NORET_TYPE void panic(const char * fmt, ...) #ifdef CONFIG_DEBUG_BUGVERBOSE dump_stack(); #endif - bust_spinlocks(0); /* * If we have crashed and we have a crash kernel loaded let it handle @@ -136,6 +135,7 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } + bust_spinlocks(0); } EXPORT_SYMBOL(panic); -- cgit v1.2.3-59-g8ed1b From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic, smp: provide smp_send_stop() wrapper on UP too Impact: cleanup, no code changed Remove an ugly #ifdef CONFIG_SMP from panic(), by providing an smp_send_stop() wrapper on UP too. 
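A minimal sketch of the idea, matching the include/linux/smp.h hunk below:

    #else /* !SMP */

    static inline void smp_send_stop(void) { }

    #endif

With the empty UP stub in place, panic() can call smp_send_stop() unconditionally and the #ifdef CONFIG_SMP around the call goes away.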
LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- include/linux/smp.h | 4 +++- kernel/panic.c | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 2d3bcb6b37ff..a69db820eed6 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): - */ + */ /* * stops all CPUs but the current one: @@ -122,6 +122,8 @@ extern unsigned int setup_max_cpus; #else /* !SMP */ +static inline void smp_send_stop(void) { } + /* * These macros fold the SMP functionality into a single CPU system */ diff --git a/kernel/panic.c b/kernel/panic.c index 57fb005de546..ca75e819d0ea 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); -#ifdef CONFIG_SMP /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); -#endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); -- cgit v1.2.3-59-g8ed1b From c95dbf27e201587b2a6cf63f8501a0313d9b4801 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic: clean up kernel/panic.c Impact: cleanup, no code changed Clean up kernel/panic.c some more and make it more consistent. LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 111 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 59 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index ca75e819d0ea..3fd8c5bf8b39 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -8,19 +8,19 @@ * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. */ +#include +#include +#include +#include #include -#include -#include +#include #include -#include -#include +#include +#include +#include #include -#include +#include #include -#include -#include -#include -#include #include int panic_on_oops; @@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ - NORET_TYPE void panic(const char * fmt, ...) { - long i; static char buf[1024]; va_list args; -#if defined(CONFIG_S390) - unsigned long caller = (unsigned long) __builtin_return_address(0); -#endif + long i; /* - * It's possible to come here directly from a panic-assertion and not - * have preempt disabled. Some functions called from here want + * It's possible to come here directly from a panic-assertion and + * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... */ preempt_disable(); @@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...) if (panic_timeout > 0) { /* - * Delay timeout seconds before rebooting the machine. - * We can't use the "normal" timers since we just panicked.. - */ - printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked. + */ + printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + for (i = 0; i < panic_timeout*1000; ) { touch_nmi_watchdog(); i += panic_blink(i); mdelay(1); i++; } - /* This will not be a clean reboot, with everything - * shutting down. 
But if there is a chance of - * rebooting the system it will be rebooted. + /* + * This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. */ emergency_restart(); } @@ -124,10 +122,15 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif #if defined(CONFIG_S390) - disabled_wait(caller); + { + unsigned long caller; + + caller = (unsigned long)__builtin_return_address(0); + disabled_wait(caller); + } #endif local_irq_enable(); - for (i = 0;;) { + for (i = 0; ; ) { touch_softlockup_watchdog(); i += panic_blink(i); mdelay(1); @@ -140,23 +143,23 @@ EXPORT_SYMBOL(panic); struct tnt { - u8 bit; - char true; - char false; + u8 bit; + char true; + char false; }; static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, }; /** @@ -193,7 +196,8 @@ const char *print_tainted(void) *s = 0; } else snprintf(buf, sizeof(buf), "Not tainted"); - return(buf); + + return buf; } int test_taint(unsigned flag) @@ -209,7 +213,8 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks = 0; set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); @@ -264,8 +269,8 @@ static void do_oops_enter_exit(void) } /* - * Return true if the calling CPU is allowed to print oops-related info. This - * is a bit racy.. + * Return true if the calling CPU is allowed to print oops-related info. + * This is a bit racy.. */ int oops_may_print(void) { @@ -274,20 +279,22 @@ int oops_may_print(void) /* * Called when the architecture enters its oops handler, before it prints - * anything. If this is the first CPU to oops, and it's oopsing the first time - * then let it proceed. + * anything. If this is the first CPU to oops, and it's oopsing the first + * time then let it proceed. * - * This is all enabled by the pause_on_oops kernel boot option. We do all this - * to ensure that oopses don't scroll off the screen. It has the side-effect - * of preventing later-oopsing CPUs from mucking up the display, too. + * This is all enabled by the pause_on_oops kernel boot option. We do all + * this to ensure that oopses don't scroll off the screen. It has the + * side-effect of preventing later-oopsing CPUs from mucking up the display, + * too. * - * It turns out that the CPU which is allowed to print ends up pausing for the - * right duration, whereas all the other CPUs pause for twice as long: once in - * oops_enter(), once in oops_exit(). + * It turns out that the CPU which is allowed to print ends up pausing for + * the right duration, whereas all the other CPUs pause for twice as long: + * once in oops_enter(), once in oops_exit(). 
*/ void oops_enter(void) { - debug_locks_off(); /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks_off(); do_oops_enter_exit(); } -- cgit v1.2.3-59-g8ed1b From 850a80cfaa5aec3e626eb3736eff890a80e4fa77 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:47:23 +0800 Subject: ftrace: use seq_read Impact: cleanup VFS layer has tested the file mode, we do not need test it. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2BAB.6010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08f4a624e31f..bf78a4c75c67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1120,16 +1120,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) return ftrace_regex_open(inode, file, 0); } -static ssize_t -ftrace_regex_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { @@ -1882,7 +1872,7 @@ static const struct file_operations ftrace_failures_fops = { static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, @@ -1890,7 +1880,7 @@ static const struct file_operations ftrace_filter_fops = { static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, .release = ftrace_notrace_release, @@ -1992,16 +1982,6 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } -static ssize_t -ftrace_graph_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2132,7 +2112,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, - .read = ftrace_graph_read, + .read = seq_read, .write = ftrace_graph_write, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3-59-g8ed1b From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:51:27 +0800 Subject: ftrace: remove struct list_head from struct dyn_ftrace Impact: save memory The struct dyn_ftrace table is very large, this patch will save about 50%. 
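The idea in condensed form (the full diff follows): a freshly recorded entry has no meaningful flags yet, so its flags word is temporarily reused as the "next" link of a singly linked list of new records,

    rec->ip = ip;
    rec->flags = (unsigned long)ftrace_new_addrs;  /* link to previous head */
    ftrace_new_addrs = rec;

and ftrace_update_code() later unlinks each record and clears the field before converting it,

    p = ftrace_new_addrs;
    ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
    p->flags = 0L;

which is what makes the struct list_head member removable from every dyn_ftrace entry.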
Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..9d598bbf28a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,7 +145,6 @@ enum { }; struct dyn_ftrace { - struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; struct dyn_arch_ftrace arch; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf78a4c75c67..90d5729afeff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -272,7 +272,7 @@ enum { static int ftrace_filtered; -static LIST_HEAD(ftrace_new_addrs); +static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - - list_add(&rec->list, &ftrace_new_addrs); + rec->flags = (unsigned long)ftrace_new_addrs; + ftrace_new_addrs = rec; return rec; } @@ -716,19 +716,21 @@ unsigned long ftrace_update_tot_cnt; static int ftrace_update_code(struct module *mod) { - struct dyn_ftrace *p, *t; + struct dyn_ftrace *p; cycle_t start, stop; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { + while (ftrace_new_addrs) { /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; - list_del_init(&p->list); + p = ftrace_new_addrs; + ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(mod, p)) { -- cgit v1.2.3-59-g8ed1b From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:11 +0100 Subject: tracing/syscalls: core infrastructure for syscalls tracing, enhancements Impact: new feature This adds the generic support for syscalls tracing. This is currently exploited through a devoted tracer but other tracing engines can use it. (They just have to play with {start,stop}_ftrace_syscalls() and use the display callbacks unless they want to override them.) The syscalls prototypes definitions are abused here to steal some metadata informations: - syscall name, param types, param names, number of params The syscall addr is not directly saved during this definition because we don't know if its prototype is available in the namespace. But we don't really need it. The arch has just to build a function able to resolve the syscall number to its metadata struct. The current tracer prints the syscall names, parameters names and values (and their types optionally). Currently the value is a raw hex but higher level values diplaying is on my TODO list. 
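As a concrete but simplified example of what the macro changes below produce: a definition such as SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) roughly expands to

    static const char *types_kill[] = { "pid_t", "int" };
    static const char *args_kill[]  = { "pid", "sig" };
    static const struct syscall_metadata __used
    __attribute__((section("__syscalls_metadata")))
    __syscall_meta_kill = {
            .name       = "sys_kill",
            .nb_args    = 2,
            .types      = types_kill,
            .args       = args_kill,
    };
    asmlinkage long sys_kill(pid_t pid, int sig);

(the real expansion also carries the __aligned__ attribute and the extra underscore added by the name-prefixing macros), so the tracer can map a syscall number back to its name and argument layout via the __syscalls_metadata section.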
Signed-off-by: Frederic Weisbecker LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/ftrace.h | 14 +++- include/linux/syscalls.h | 60 +++++++++++++++- kernel/trace/trace.h | 17 +++++ kernel/trace/trace_syscalls.c | 146 +++++++++++++++++++++++++++++++++++--- 5 files changed, 234 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0e0f39be6c8b..d3bc3c86df6a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -77,6 +77,14 @@ #define TRACE_PRINTKS() #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \ + *(__syscalls_metadata) \ + VMLINUX_SYMBOL(__stop_syscalls_metadata) = .; +#else +#define TRACE_SYSCALLS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -99,7 +107,8 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ - FTRACE_EVENTS() + FTRACE_EVENTS() \ + TRACE_SYSCALLS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..6dc1c652447e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -506,13 +506,21 @@ static inline void trace_hw_branch_oops(void) {} /* * A syscall entry in the ftrace syscalls array. * - * @syscall_nr: syscall number + * @name: name of the syscall + * @nb_args: number of parameters it takes + * @types: list of types as strings + * @args: list of args as strings (args[i] matches types[i]) */ -struct syscall_trace_entry { - int syscall_nr; +struct syscall_metadata { + const char *name; + int nb_args; + const char **types; + const char **args; }; #ifdef CONFIG_FTRACE_SYSCALLS +extern void arch_init_ftrace_syscalls(void); +extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); extern void ftrace_syscall_enter(struct pt_regs *regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..0cff9bb80b02 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; #include #include #include +#include #define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) @@ -95,7 +96,46 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_FTRACE_SYSCALLS +#define __SC_STR_ADECL1(t, a) #a +#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) +#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) +#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) +#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) +#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) + +#define __SC_STR_TDECL1(t, a) #t +#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) +#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) +#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) +#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) +#define __SC_STR_TDECL6(t, a, ...) 
#t, __SC_STR_TDECL5(__VA_ARGS__) + +#define SYSCALL_METADATA(sname, nb) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys"#sname, \ + .nb_args = nb, \ + .types = types_##sname, \ + .args = args_##sname, \ + } + +#define SYSCALL_DEFINE0(sname) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys_"#sname, \ + .nb_args = 0, \ + }; \ + asmlinkage long sys_##sname(void) + +#else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#endif + #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) @@ -117,10 +157,26 @@ struct old_linux_dirent; #endif #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define SYSCALL_DEFINEx(x, sname, ...) \ + static const char *types_##sname[] = { \ + __SC_STR_TDECL##x(__VA_ARGS__) \ + }; \ + static const char *args_##sname[] = { \ + __SC_STR_ADECL##x(__VA_ARGS__) \ + }; \ + SYSCALL_METADATA(sname, x); \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#else +#define SYSCALL_DEFINEx(x, sname, ...) \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#endif + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS #define SYSCALL_DEFINE(name) static inline long SYSC_##name -#define SYSCALL_DEFINEx(x, name, ...) \ + +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ @@ -134,7 +190,7 @@ struct old_linux_dirent; #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name -#define SYSCALL_DEFINEx(x, name, ...) \ +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3d49daae47dc..d80ca0d464d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,19 @@ struct kmemtrace_free_entry { const void *ptr; }; +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + unsigned long ret; +}; + + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. 
These are: @@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct syscall_trace_enter, \ + TRACE_SYSCALL_ENTER); \ + IF_ASSIGN(var, ent, struct syscall_trace_exit, \ + TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 66cf97449af3..c72e599230ff 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,5 @@ -#include #include - +#include #include #include "trace_output.h" @@ -8,6 +7,90 @@ static atomic_t refcount; +/* Our two options */ +enum { + TRACE_SYSCALLS_OPT_TYPES = 0x1, +}; + +static struct tracer_opt syscalls_opts[] = { + { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, + { } +}; + +static struct tracer_flags syscalls_flags = { + .val = 0, /* By default: no args types */ + .opts = syscalls_opts +}; + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, ret, syscall; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + goto end; + + ret = trace_seq_printf(s, "%s(", entry->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + for (i = 0; i < entry->nb_args; i++) { + /* parameter types */ + if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + ret = trace_seq_printf(s, "%s ", entry->types[i]); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* parameter values */ + ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? 
")" : ","); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +end: + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + int ret; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) { + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; + } + + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + void start_ftrace_syscalls(void) { unsigned long flags; @@ -16,6 +99,7 @@ void start_ftrace_syscalls(void) if (atomic_inc_return(&refcount) != 1) goto out; + arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -48,20 +132,63 @@ out: void ftrace_syscall_enter(struct pt_regs *regs) { + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + int size; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d enter\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } void ftrace_syscall_exit(struct pt_regs *regs) { + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d exit\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + sizeof(*entry), 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } static int init_syscall_tracer(struct trace_array *tr) @@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr) } static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, + .type = TRACE_SYSCALL_ENTER, + .trace = print_syscall_enter, }; static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, + .type = TRACE_SYSCALL_EXIT, + .trace = print_syscall_exit, }; static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", + .name = "syscall", .init = init_syscall_tracer, - .reset = reset_syscall_tracer + .reset = reset_syscall_tracer, + .flags = &syscalls_flags, }; __init int register_ftrace_syscalls(void) -- cgit v1.2.3-59-g8ed1b From ac99c58c9e56967037382e31f865b72b10127965 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:35 +0100 Subject: tracing/syscalls: fix missing release of tracing Impact: fix 'stuck' syscall tracer The syscall tracer uses a refcounter to enable several users simultaneously. 
But the refcounter did not behave correctly and always restored its value to 0 after calling start_syscall_tracing(). Therefore, stop_syscall_tracing() couldn't release correctly the tasks from tracing. Also the tracer forgot to reset the buffer when it is released. Drop the pointless refcount decrement on start_syscall_tracing() and reset the buffer when we release the tracer. This fixes two reported issue: - when we switch from syscall tracer to another tracer, syscall tracing continued. - incorrect use of the refcount. Reported-by: Andrew Morton Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c72e599230ff..c5fc1d8880f6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -96,8 +96,9 @@ void start_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + /* Don't enable the flag on the tasks twice */ if (atomic_inc_return(&refcount) != 1) - goto out; + return; arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); @@ -107,8 +108,6 @@ void start_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_dec(&refcount); } void stop_ftrace_syscalls(void) @@ -116,8 +115,9 @@ void stop_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + /* There are perhaps still some users */ if (atomic_dec_return(&refcount)) - goto out; + return; read_lock_irqsave(&tasklist_lock, flags); @@ -126,8 +126,6 @@ void stop_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_inc(&refcount); } void ftrace_syscall_enter(struct pt_regs *regs) @@ -201,6 +199,7 @@ static int init_syscall_tracer(struct trace_array *tr) static void reset_syscall_tracer(struct trace_array *tr) { stop_ftrace_syscalls(); + tracing_reset_online_cpus(tr); } static struct trace_event syscall_enter_event = { -- cgit v1.2.3-59-g8ed1b From 6404434525bb9f8f2239998f30fd7c93f2efa5b3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:36 +0100 Subject: tracing/syscalls: various cleanups Impact: cleanup - Drop unused cpu variable - Fix some errors on comments Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c5fc1d8880f6..26f9a8679d3d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,7 +7,7 @@ static atomic_t refcount; -/* Our two options */ +/* Option to display the parameters types */ enum { TRACE_SYSCALLS_OPT_TYPES = 0x1, }; @@ -18,7 +18,7 @@ static struct tracer_opt syscalls_opts[] = { }; static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no args types */ + .val = 0, /* By default: no parameters types */ .opts = syscalls_opts }; @@ -135,12 +135,9 @@ void ftrace_syscall_enter(struct pt_regs *regs) struct ring_buffer_event *event; int size; int syscall_nr; - int cpu; syscall_nr = syscall_get_nr(current, regs); - cpu = raw_smp_processor_id(); - sys_data = syscall_nr_to_meta(syscall_nr); if 
(!sys_data) return; @@ -166,12 +163,9 @@ void ftrace_syscall_exit(struct pt_regs *regs) struct syscall_metadata *sys_data; struct ring_buffer_event *event; int syscall_nr; - int cpu; syscall_nr = syscall_get_nr(current, regs); - cpu = raw_smp_processor_id(); - sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; -- cgit v1.2.3-59-g8ed1b From 5be71b61f17b0e3bc8ad0b1a1b7b53ab7d574ebb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:37 +0100 Subject: tracing/syscalls: protect thread flag toggling from races Impact: fix syscall tracer enable/disable race The current thread flag toggling is racy as shown in the following scenario: - task A is the last user of syscall tracing, it releases the TIF_SYSCALL_FTRACE on each tasks - at the same time task B start syscall tracing. refcount == 0 so it sets up TIF_SYSCALL_FTRACE on each tasks. The effect of the mixup is unpredictable. So this fix adds a mutex on {start,stop}_syscall_tracing(). Reported-by: Andrew Morton Reported-by: Ingo Molnar LKML-Reference: <1237151439-6755-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 26f9a8679d3d..a2a3af29c943 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -5,7 +5,11 @@ #include "trace_output.h" #include "trace.h" -static atomic_t refcount; +/* Keep a counter of the syscall tracing users */ +static int refcount; + +/* Prevent from races on thread flags toggling */ +static DEFINE_MUTEX(syscall_trace_lock); /* Option to display the parameters types */ enum { @@ -96,9 +100,11 @@ void start_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + mutex_lock(&syscall_trace_lock); + /* Don't enable the flag on the tasks twice */ - if (atomic_inc_return(&refcount) != 1) - return; + if (++refcount != 1) + goto unlock; arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); @@ -108,6 +114,9 @@ void start_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); + +unlock: + mutex_unlock(&syscall_trace_lock); } void stop_ftrace_syscalls(void) @@ -115,9 +124,11 @@ void stop_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + mutex_lock(&syscall_trace_lock); + /* There are perhaps still some users */ - if (atomic_dec_return(&refcount)) - return; + if (--refcount) + goto unlock; read_lock_irqsave(&tasklist_lock, flags); @@ -126,6 +137,9 @@ void stop_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); + +unlock: + mutex_unlock(&syscall_trace_lock); } void ftrace_syscall_enter(struct pt_regs *regs) -- cgit v1.2.3-59-g8ed1b From 0ea1c4156bf9e2eb370cc5c6fa6eb112bd844dec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:38 +0100 Subject: tracing/syscalls: select kallsysms Syscall tracing must select kallsysms. The arch code builds a table to find the syscall metadata by syscall number. It needs the syscalls names resolution from the symbol table to know which name found on the syscalls metadatas match a function pointer from the arch sys_call_table. 
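A rough sketch of why kallsyms is needed, as arch-side pseudo-code (not part of this patch; find_syscall_meta_by_name() is an illustrative helper, not an existing kernel API):

    for (i = 0; i < NR_syscalls; i++) {
            char namebuf[KSYM_NAME_LEN];

            kallsyms_lookup((unsigned long)sys_call_table[i],
                            NULL, NULL, NULL, namebuf);
            /* match against the names recorded in __syscalls_metadata */
            syscalls_metadata[i] = find_syscall_meta_by_name(namebuf);
    }

Without kallsyms there is no way to resolve a sys_call_table entry back to a symbol name that can be compared with the recorded metadata names.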
Reported-by: Andrew Morton Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95a0ad191f19..b0a46f889659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -182,6 +182,7 @@ config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS select TRACING + select KALLSYMS help Basic tracer to catch the syscall entry and exit events. -- cgit v1.2.3-59-g8ed1b From 59f586db98919d7d9c43527b26c8de1cdf9ed912 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:39 +0100 Subject: tracing/core: fix missing mutex unlock on tracing_set_tracer() Impact: fix possible locking imbalance In case of ring buffer resize failure, tracing_set_tracer forgot to release trace_types_lock. Fix it. Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index efe3202c0209..c0cf946d42f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2494,7 +2494,7 @@ static int tracing_set_tracer(const char *buf) if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) - return ret; + goto out; ret = 0; } -- cgit v1.2.3-59-g8ed1b From ac1d52d0b85854958c7e78c8006e39aadb6ce4b8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 00:32:41 +0100 Subject: tracing/ftrace: fix double calls to tracing_start() Impact: fix a warning during preemptirqsoff selftests When the preemptirqsoff selftest fails, we see the following warning: [ 6.050000] Testing tracer preemptirqsoff: .. no entries found .. ------------[ cut here ]------------ [ 6.060000] WARNING: at kernel/trace/trace.c:688 tracing_start+0x67/0xd3() [ 6.060000] Modules linked in: [ 6.060000] Pid: 1, comm: swapper Tainted: G [ 6.060000] Call Trace: [ 6.060000] [] warn_slowpath+0xb1/0x100 [ 6.060000] [] ? trace_preempt_on+0x35/0x4b [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] ? __lock_acquired+0xe6/0x1f2 [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] tracing_start+0x67/0xd3 [ 6.060000] [] ? irqsoff_tracer_reset+0x2d/0x57 [ 6.060000] [] trace_selftest_startup_preemptirqsoff+0x1c8/0x1f1 [ 6.060000] [] register_tracer+0x12f/0x241 [ 6.060000] [] ? init_irqsoff_tracer+0x0/0x53 [ 6.060000] [] init_irqsoff_tracer+0x3b/0x53 This is because in fail case, the preemptirqsoff tracer selftest calls twice the tracing_start() function: int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; tracing_start(); <----- goto out; } [...] out: trace->reset(tr); tracing_start(); <------ tracing_max_latency = save_max; return ret; } Since it is well handled in the out path, we don't need the conditional one. 
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237159961-7447-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b29028..a2ca6f0fef9b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -414,7 +414,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); - goto out; + goto out_no_start; } /* reset the max latency */ @@ -432,21 +432,16 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) { - tracing_start(); + if (ret) goto out; - } ret = trace_test_buffer(&max_tr, &count); - if (ret) { - tracing_start(); + if (ret) goto out; - } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; - tracing_start(); goto out; } @@ -475,9 +470,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * goto out; } - out: - trace->reset(tr); +out: tracing_start(); +out_no_start: + trace->reset(tr); tracing_max_latency = save_max; return ret; -- cgit v1.2.3-59-g8ed1b From 2fc1dfbe17e7705c55b7a99da995fa565e26f151 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 01:45:03 +0100 Subject: tracing/core: fix early free of cpumasks Impact: fix crashes when tracing cpumasks While ring-buffer allocation, the cpumasks are allocated too, including the tracing cpumask and the per-cpu file mask handler. But these cpumasks are freed accidentally just after. Fix it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237164303-11476-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0cf946d42f5..ae32d3b99b4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4125,7 +4125,8 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); - ret = 0; + + return 0; out_free_cpumask: free_cpumask_var(tracing_reader_cpumask); -- cgit v1.2.3-59-g8ed1b From 03303549b1695dc024d4a653cc16bd79f78f9750 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 22:41:00 +0100 Subject: tracing/ftrace: fix the check on nopped sites Impact: fix a dynamic tracing failure Recently, the function and function graph tracers failed to use dynamic tracing after the following commit: fa9d13cf135efbd454453a53b6299976bea245a9 (ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec) The patch is right except a mistake on the check for the FTRACE_FL_CONVERTED flag. The code patching is aborted in case of successfully nopped sites. What we want is the opposite: ignore the callsites that haven't been nopped. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90d5729afeff..7847806eefef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -537,7 +537,7 @@ static void ftrace_replace_code(int enable) */ if (rec->flags & FTRACE_FL_FREE || rec->flags & FTRACE_FL_FAILED || - rec->flags & FTRACE_FL_CONVERTED) + !(rec->flags & FTRACE_FL_CONVERTED)) continue; /* ignore updates to this record's mcount site */ -- cgit v1.2.3-59-g8ed1b From 4ca530852346be239b7c19e7bec5d2b78855bebe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2009 19:20:15 -0400 Subject: tracing: protect reader of cmdline output Impact: fix to one cause of incorrect comm outputs in trace The spinlock only protected the creation of a comm <=> pid pair. But it was possible that a reader could look up a pid, and get the wrong comm because it had no locking. This also required changing trace_find_cmdline to copy the comm cache and not just send back a pointer to it. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 23 ++++++++++++++++++----- kernel/trace/trace.c | 20 ++++++++++++-------- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 12 ++++++------ kernel/trace/trace_output.c | 18 ++++++++++++------ 5 files changed, 49 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1f32e4edf490..b171778e3863 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1027,7 +1027,9 @@ static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { - const char *cmd = trace_find_cmdline(ent->pid); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%s]\n", @@ -1057,19 +1059,30 @@ static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s]\n", cmd); } static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); + get_pdu_int(ent), cmd); } /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index efe3202c0209..2796bd2b17e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -770,25 +770,29 @@ static void trace_save_cmdline(struct task_struct *tsk) __raw_spin_unlock(&trace_cmdline_lock); } -char *trace_find_cmdline(int pid) +void trace_find_cmdline(int pid, char comm[]) { - char *cmdline = "<...>"; unsigned map; - if (!pid) - return ""; + if (!pid) { + strcpy(comm, ""); + return; + } - if (pid > PID_MAX_DEFAULT) - goto out; + if (pid > PID_MAX_DEFAULT) { + strcpy(comm, "<...>"); + return; + } 
+ __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map >= SAVED_CMDLINES) goto out; - cmdline = saved_cmdlines[map]; + strcpy(comm, saved_cmdlines[map]); out: - return cmdline; + __raw_spin_unlock(&trace_cmdline_lock); } void tracing_record_cmdline(struct task_struct *tsk) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b03..b0ecad8ecc34 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -547,7 +547,7 @@ struct tracer_switch_ops { }; #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -extern char *trace_find_cmdline(int pid); +extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4c388607ed67..6004ccac2dd7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -190,15 +190,15 @@ print_graph_cpu(struct trace_seq *s, int cpu) static enum print_line_t print_graph_proc(struct trace_seq *s, pid_t pid) { - int i; - int ret; - int len; - char comm[8]; - int spaces = 0; + char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; + int spaces = 0; + int ret; + int len; + int i; - strncpy(comm, trace_find_cmdline(pid), 7); + trace_find_cmdline(pid, comm); comm[7] = '\0'; sprintf(pid_str, "%d", pid); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ea9d3b410c7a..6a4c9dea191e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -309,9 +309,9 @@ static int lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; - char *comm; + char comm[TASK_COMM_LEN]; - comm = trace_find_cmdline(entry->pid); + trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -346,10 +346,12 @@ int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - char *comm = trace_find_cmdline(entry->pid); unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", comm, entry->pid, iter->cpu, secs, usec_rem); @@ -372,7 +374,10 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { - char *comm = trace_find_cmdline(entry->pid); + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, iter->cpu, entry->flags, @@ -577,14 +582,15 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) { struct ctx_switch_entry *field; - char *comm; + char comm[TASK_COMM_LEN]; int S, T; + trace_assign_type(field, iter->ent); T = task_state_char(field->next_state); S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); + trace_find_cmdline(field->next_pid, comm); if (!trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, -- cgit v1.2.3-59-g8ed1b From 6adaad14d7d4d3ef31b4e2dc992b18b5da7c4eb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2009 21:57:17 -0400 Subject: tracing: stop comm recording on tracing off Impact: fix for losing comms in trace 
The command lines of tasks are cached at sched switch to not need to record them at every trace point. Disabling the tracing on stops the recording of traces, but does not stop the caching of command lines. When the tracing is off the cache may overflow and cause the tracing to show incorrect tasks matching the PIDs. This patch disables prevents updates to the comm cache when the ring buffer is off. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2796bd2b17e4..8f89690230e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -797,7 +797,7 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled)) + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) return; trace_save_cmdline(tsk); -- cgit v1.2.3-59-g8ed1b From c269fc8c537d761f36cb98e637ae934d9331a9d5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 17 Mar 2009 01:20:59 -0500 Subject: tracing: fix leak in event_format_read() Impact: fix memory leak If event_format_read() exits early due to nonzero ppos, the previous kmalloc doesn't get freed - might as well do the check before the kmalloc and avoid the problem. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237270859.8033.141.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 238ea95a4115..c88227b3b9db 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -378,15 +378,15 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, char *buf; int r; + if (*ppos) + return 0; + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); - if (*ppos) - return 0; - /* If any of the first writes fail, so will the show_format. */ trace_seq_printf(s, "name: %s\n", call->name); -- cgit v1.2.3-59-g8ed1b From f2d28a2ebcb525a6ec7e2152106ddb385ef52b73 Mon Sep 17 00:00:00 2001 From: Guillaume Knispel Date: Tue, 17 Mar 2009 16:18:42 +0100 Subject: printk: correct the behavior of printk_timed_ratelimit() Impact: fix jiffies-comparison sign-wrap behavior The behavior provided by printk_timed_ratelimit() is, in some situations, probably not what a caller would reasonably expect: bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); return true; } return false; } On a 32 bit computer, if printk_timed_ratelimit() is initially called at time jiffies == Ja, *caller_jiffies is set to Ja + msecs_to_jiffies(interval_msecs): let's say Ja + 42 for this example. If this caller then doesn't call printk_timed_ratelimit() until jiffies == Ja + (1 << 31) + 42 (which can happen as soon as ~ 25 days later on a 1000 HZ system), printk_timed_ratelimit() will then always return false to this caller until jiffies loops completely (1 << 31 more ticks). Ths change makes it only return false if jiffies is in the small time window starting at the previous call when true was returned and ending interval_msecs later. 
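The wrap is easier to see in a compilable model. The sketch below is illustrative only: a 32-bit counter stands in for jiffies and the helper names (time_after32(), ratelimit_old(), ratelimit_new()) are invented; the two functions reproduce the before and after logic quoted above.

/* Userspace model of the ratelimit change; a 32-bit "jiffies" makes the
 * sign wrap easy to trigger on any host.  Not the kernel implementation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t jiffies;  /* stand-in for the kernel's jiffies counter */

/* 32-bit variants of the kernel's wrap-safe comparison helpers */
static bool time_after32(uint32_t a, uint32_t b)     { return (int32_t)(b - a) < 0; }
static bool time_after_eq32(uint32_t a, uint32_t b)  { return (int32_t)(a - b) >= 0; }
static bool time_before_eq32(uint32_t a, uint32_t b) { return time_after_eq32(b, a); }
static bool time_in_range32(uint32_t a, uint32_t b, uint32_t c)
{
    return time_after_eq32(a, b) && time_before_eq32(a, c);
}

/* Old behaviour: remembers "when to fire next"; breaks once the caller
 * stays quiet for more than half the counter range. */
static bool ratelimit_old(uint32_t *caller, uint32_t interval)
{
    if (*caller == 0 || time_after32(jiffies, *caller)) {
        *caller = jiffies + interval;
        return true;
    }
    return false;
}

/* New behaviour: only suppress while still inside the window that started
 * at the previous successful call. */
static bool ratelimit_new(uint32_t *caller, uint32_t interval)
{
    if (*caller == 0 ||
        !time_in_range32(jiffies, *caller, *caller + interval)) {
        *caller = jiffies;
        return true;
    }
    return false;
}

int main(void)
{
    uint32_t old_state = 0, new_state = 0, interval = 42;

    jiffies = 1000;
    printf("first call : old=%d new=%d\n",
           ratelimit_old(&old_state, interval),
           ratelimit_new(&new_state, interval));

    /* caller stays quiet for a bit more than 1 << 31 ticks (~25 days at 1000 HZ) */
    jiffies += (1u << 31) + 100;
    printf("late call  : old=%d new=%d\n",
           ratelimit_old(&old_state, interval),
           ratelimit_new(&new_state, interval));
    return 0;
}

Built with any C99 compiler, the second line prints old=0 new=1: the old check wrongly suppresses the message and keeps doing so for the next 1 << 31 ticks, which is the case this patch removes.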
Note that if jiffies loops completely between two calls to printk_timed_ratelimit(), it will obviously still wrongly return false, but this is something with a low probability. If something completely reliable is needed I guess jiffies_64 must be used (which this change does not do). Signed-off-by: Guillaume Knispel Cc: Ulrich Drepper Cc: Rusty Russell Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20090317161842.0059096b@xilun.lan.proformatique.com> Signed-off-by: Ingo Molnar --- kernel/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b0..2be719908d1f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1292,8 +1292,11 @@ EXPORT_SYMBOL(printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { - *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + if (*caller_jiffies == 0 + || !time_in_range(jiffies, *caller_jiffies, + *caller_jiffies + + msecs_to_jiffies(interval_msecs))) { + *caller_jiffies = jiffies; return true; } return false; -- cgit v1.2.3-59-g8ed1b From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 17:22:06 -0400 Subject: ring-buffer: add api to allow a tracer to change clock source This patch adds a new function called ring_buffer_set_clock that allows a tracer to assign its own clock source to the buffer. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++-- kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++------------------- kernel/trace/trace.c | 21 ++++++++++----- 3 files changed, 57 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1a0068a5557..9e6052bd1a1c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -118,8 +118,11 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(int cpu); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)); size_t ring_buffer_page_len(void *page); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58128ad2fde0..bbf51922a8ca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 - -u64 ring_buffer_time_stamp(int cpu) -{ - u64 time; - - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = trace_clock_local() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); - - return time; -} -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); - -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -{ - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; -} -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 
28 @@ -374,6 +351,7 @@ struct ring_buffer { #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -394,6 +372,30 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + /* shift to debug/test normalization and TIME_EXTENTS */ + time = buffer->clock() << DEBUG_SHIFT; + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; /* need at least two pages */ if (buffer->pages == 1) @@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -1191,7 +1200,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f89690230e6..3be2f788e10d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec) return nsec; } -cycle_t ftrace_now(int cpu) -{ - u64 ts = ring_buffer_time_stamp(cpu); - ring_buffer_normalize_time_stamp(cpu, &ts); - return ts; -} - /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. 
For each CPU, it contains @@ -178,6 +171,20 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + /* * The max_tr is used to snapshot the global_trace when a maximum * latency is reached. Some tracers will use this to store a maximum -- cgit v1.2.3-59-g8ed1b From af4617bdba34aa556272b34c3986b0a4d588f568 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 18:09:55 -0400 Subject: tracing: add global-clock option to provide cross CPU clock to traces Impact: feature to allow better serialized clock This patch adds an option called "global-clock" that will allow the tracer to switch to a slower but more accurate (across CPUs) clock. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3be2f788e10d..2f994caab0b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -315,6 +315,7 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", + "global-clock", NULL }; @@ -2251,6 +2252,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return 0; } +static void set_tracer_flags(unsigned int mask, int enabled) +{ + /* do nothing if flag is already set */ + if (!!(trace_flags & mask) == !!enabled) + return; + + if (enabled) + trace_flags |= mask; + else + trace_flags &= ~mask; + + if (mask == TRACE_ITER_GLOBAL_CLK) { + u64 (*func)(void); + + if (enabled) + func = trace_clock_global; + else + func = trace_clock_local; + + mutex_lock(&trace_types_lock); + ring_buffer_set_clock(global_trace.buffer, func); + + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, func); + mutex_unlock(&trace_types_lock); + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2278,10 +2307,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, int len = strlen(trace_options[i]); if (strncmp(cmp, trace_options[i], len) == 0) { - if (neg) - trace_flags &= ~(1 << i); - else - trace_flags |= (1 << i); + set_tracer_flags(1 << i, !neg); break; } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0ecad8ecc34..26a7a28ca110 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,6 +667,7 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, + TRACE_ITER_GLOBAL_CLK = 0x80000, }; /* -- cgit v1.2.3-59-g8ed1b From 5fec6ddcb43a91aa9a254c8ecf174c803de6f07e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 19:59:53 -0400 Subject: tracing: make sched_switch stop/start light weight The stopping and starting of a tracer should be light weight and be able to be called in all contexts. The sched_switch grabbed mutexes in the start/stop functions. This patch changes it to a simple variable, on/off. 
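A minimal userspace sketch of the resulting shape, with invented signatures and printf standing in for the ring buffer write; the point is only that start and stop reduce to flipping an int which the already-registered probe checks.

/* Sketch only, not the kernel code.  sched_ref says whether the probe is
 * registered (handled at init/reset, where sleeping is fine); sched_stopped
 * is what the lightweight start/stop methods flip. */
#include <stdio.h>

static int sched_ref;      /* probe registered, managed at init/reset */
static int sched_stopped;  /* flipped by start/stop from any context */

static void probe_sched_switch(const char *prev, const char *next)
{
    if (!sched_ref || sched_stopped)
        return;
    printf("switch: %s -> %s\n", prev, next);  /* stands in for recording */
}

static void tracer_start(void) { sched_stopped = 0; }  /* no mutex, no sleep */
static void tracer_stop(void)  { sched_stopped = 1; }

int main(void)
{
    sched_ref = 1;                            /* pretend registration happened */
    probe_sched_switch("bash", "kworker");    /* recorded */
    tracer_stop();
    probe_sched_switch("kworker", "bash");    /* silently dropped */
    tracer_start();
    probe_sched_switch("bash", "sshd");       /* recorded again */
    return 0;
}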
Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 77132c2cf3d9..de35f200abd3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,6 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); +static int sched_stopped; static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref) + if (!sched_ref || sched_stopped) return; tracing_record_cmdline(prev); @@ -193,6 +194,7 @@ static void stop_sched_trace(struct trace_array *tr) static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); return 0; } @@ -205,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); - tracing_start_sched_switch(); + sched_stopped = 0; } static void sched_switch_trace_stop(struct trace_array *tr) { - tracing_stop_sched_switch(); + sched_stopped = 1; } static struct tracer sched_switch_trace __read_mostly = -- cgit v1.2.3-59-g8ed1b From 62524d55e5b9ffe36e3bf3dd7a594114f150b449 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 20:58:00 -0400 Subject: tracing: make power tracer start/stop methods lighter weight The start/stop methods of a tracer should be able to be executed in all contexts. This patch converts the power tracer to do so. Signed-off-by: Steven Rostedt --- kernel/trace/trace_power.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 91ce672fb037..bae791ebcc51 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -122,10 +122,14 @@ fail_start: static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; - tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + +static void power_trace_reset(struct trace_array *tr) { trace_power_enabled = 0; unregister_trace_power_start(probe_power_start); @@ -188,7 +192,7 @@ static struct tracer power_tracer __read_mostly = .init = power_trace_init, .start = start_power_trace, .stop = stop_power_trace, - .reset = stop_power_trace, + .reset = power_trace_reset, .print_line = power_print_line, }; -- cgit v1.2.3-59-g8ed1b From 18aecd362a1c991fbf5f7919ae051a77532ba2f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 08:56:58 +0100 Subject: tracing: stop command line recording when tracing is disabled Impact: prevent overwrite of command line entries When the tracer is stopped the command line recording continues to record. The check for tracing_is_on() is not sufficient here as the ringbuffer status is not affected by setting debug/tracing/tracing_enabled to 0. On a non idle system this can result in the loss of the command line information for the stopped trace, which makes the trace harder to read and analyse. Check tracer_enabled to allow further recording. 
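The overflow described above can be modelled in a few lines of userspace C. The cache below is a toy (4 slots instead of 128, a linear lookup instead of the pid map, invented names), but it shows why pids that only appear in the stopped trace can end up unresolvable once enough other tasks have been scheduled.

#include <stdio.h>
#include <string.h>

#define SLOTS 4           /* the kernel keeps 128 saved cmdlines */

static int  slot_pid[SLOTS];
static char slot_comm[SLOTS][16];
static int  next_slot;

static void save_cmdline(int pid, const char *comm)
{
    int idx = next_slot++ % SLOTS;   /* round-robin reuse evicts the oldest */
    slot_pid[idx] = pid;
    snprintf(slot_comm[idx], sizeof(slot_comm[idx]), "%s", comm);
}

static const char *find_cmdline(int pid)
{
    for (int i = 0; i < SLOTS; i++)
        if (slot_pid[i] == pid)
            return slot_comm[i];
    return "<...>";                  /* mapping was evicted */
}

int main(void)
{
    memset(slot_pid, -1, sizeof(slot_pid));

    save_cmdline(100, "bash");       /* pid 100 is in the stopped trace */

    /* tracing is off, but without this patch sched switches keep feeding
     * the cache and push the interesting entry out */
    for (int pid = 200; pid < 200 + SLOTS; pid++)
        save_cmdline(pid, "noise");

    printf("pid 100 resolves to: %s\n", find_cmdline(100));  /* "<...>" */
    return 0;
}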
Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce6208fd727..7b6043ea256e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -797,7 +797,8 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || + !tracing_is_on()) return; trace_save_cmdline(tsk); -- cgit v1.2.3-59-g8ed1b From 2c7eea4c62ba090b7f4583c3d7337ea0019be900 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 09:03:19 +0100 Subject: tracing: replace the crude (unsigned) -1 hackery Impact: cleanup The command line recorder uses (unsigned) -1 to mark non mapped entries in the pid to command line maps. The validity check is completely unintuitive: idx >= SAVED_CMDLINES There is no need for such casting games. Use a constant to mark unmapped entries and check for that constant to make the code readable and understandable. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7b6043ea256e..ca673c475687 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -633,6 +633,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) } #define SAVED_CMDLINES 128 +#define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; @@ -644,8 +645,8 @@ static atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { - memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); cmdline_idx = 0; } @@ -753,12 +754,12 @@ static void trace_save_cmdline(struct task_struct *tsk) return; idx = map_pid_to_cmdline[tsk->pid]; - if (idx >= SAVED_CMDLINES) { + if (idx == NO_CMDLINE_MAP) { idx = (cmdline_idx + 1) % SAVED_CMDLINES; map = map_cmdline_to_pid[idx]; - if (map <= PID_MAX_DEFAULT) - map_pid_to_cmdline[map] = (unsigned)-1; + if (map != NO_CMDLINE_MAP) + map_pid_to_cmdline[map] = NO_CMDLINE_MAP; map_pid_to_cmdline[tsk->pid] = idx; @@ -786,7 +787,7 @@ void trace_find_cmdline(int pid, char comm[]) __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; - if (map >= SAVED_CMDLINES) + if (map == NO_CMDLINE_MAP) goto out; strcpy(comm, saved_cmdlines[map]); -- cgit v1.2.3-59-g8ed1b From 50d88758a3f9787cbdbdbc030560b815721eab4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 08:58:44 +0100 Subject: tracing: fix trace_find_cmdline() Impact: prevent stale command line output In case there is no valid command line mapping for a pid trace_find_cmdline() returns without updating the comm buffer. The trace dump keeps the previous entry which results in confusing trace output: -0 [000] 280.702056 .... -23456 [000] 280.702080 .... 
Update the comm buffer with "<...>" when no mapping is found. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca673c475687..06c69a260328 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -787,12 +787,11 @@ void trace_find_cmdline(int pid, char comm[]) __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; - if (map == NO_CMDLINE_MAP) - goto out; - - strcpy(comm, saved_cmdlines[map]); + if (map != NO_CMDLINE_MAP) + strcpy(comm, saved_cmdlines[map]); + else + strcpy(comm, "<...>"); - out: __raw_spin_unlock(&trace_cmdline_lock); } -- cgit v1.2.3-59-g8ed1b From a635cf0497342978d417cae19d4a4823932977ff Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Wed, 18 Mar 2009 09:00:41 +0100 Subject: tracing: fix command line to pid reverse map Impact: fix command line to pid mapping map_cmdline_to_pid[] is checked in trace_save_cmdline(), but never updated. This results in stale pid to command line mappings and the tracer output will associate the wrong comm string. Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06c69a260328..305c562dae2a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -738,8 +738,7 @@ void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) { - unsigned map; - unsigned idx; + unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return; @@ -757,10 +756,17 @@ static void trace_save_cmdline(struct task_struct *tsk) if (idx == NO_CMDLINE_MAP) { idx = (cmdline_idx + 1) % SAVED_CMDLINES; - map = map_cmdline_to_pid[idx]; - if (map != NO_CMDLINE_MAP) - map_pid_to_cmdline[map] = NO_CMDLINE_MAP; + /* + * Check whether the cmdline buffer at idx has a pid + * mapped. We are going to overwrite that entry so we + * need to clear the map_pid_to_cmdline. Otherwise we + * would read the new comm for the old pid. + */ + pid = map_cmdline_to_pid[idx]; + if (pid != NO_CMDLINE_MAP) + map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + map_cmdline_to_pid[idx] = tsk->pid; map_pid_to_cmdline[tsk->pid] = idx; cmdline_idx = idx; -- cgit v1.2.3-59-g8ed1b From 490362003457f8d387f6f6e73e3a7efbf56c3314 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Mar 2009 22:38:58 +0100 Subject: tracing/ftrace: stop {irqs, preempt}soff tracers when tracing is stopped Impact: fix a selftest warning In some cases, it's possible to see the following warning on irqsoff tracer selftest: [ 4.640003] Testing tracer irqsoff: <4>------------[ cut here ]------------ [ 4.653562] WARNING: at kernel/trace/trace.c:458 update_max_tr_single+0x9a/0xc4() [ 4.660000] Hardware name: System Product Name [ 4.660000] Modules linked in: [ 4.660000] Pid: 301, comm: kstop/1 Not tainted 2.6.29-rc8-tip #35837 [ 4.660000] Call Trace: [ 4.660000] [<4014b588>] warn_slowpath+0x79/0x8f [ 4.660000] [<402d6949>] ? put_dec+0x64/0x6b [ 4.660000] [<40162b56>] ? getnstimeofday+0x58/0xdd [ 4.660000] [<40162210>] ? clocksource_read+0x3/0xf [ 4.660000] [<4015eb44>] ? ktime_set+0x8/0x34 [ 4.660000] [<4014101a>] ? 
balance_runtime+0x8/0x56 [ 4.660000] [<405f6f11>] ? _spin_lock+0x3/0x10 [ 4.660000] [<4011f643>] ? ftrace_call+0x5/0x8 [ 4.660000] [<4015d0f1>] ? task_cputime_zero+0x3/0x27 [ 4.660000] [<40190ee7>] ? cpupri_set+0x90/0xcb [ 4.660000] [<405f7208>] ? _spin_lock_irqsave+0x22/0x34 [ 4.660000] [<40190f12>] ? cpupri_set+0xbb/0xcb [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<4018493f>] ? ring_buffer_reset_cpu+0x27/0x51 [ 4.660000] [<405f7208>] ? _spin_lock_irqsave+0x22/0x34 [ 4.660000] [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51 [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<4018cc29>] ? trace_hardirqs_off+0x1a/0x1c [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51 [ 4.660000] [<401850f3>] ? cpumask_next+0x15/0x18 [ 4.660000] [<4018a41f>] update_max_tr_single+0x9a/0xc4 [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4018cd13>] check_critical_timing+0xcc/0x11e [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4018cdf1>] stop_critical_timing+0x8c/0x9f [ 4.660000] [<4014e5c4>] ? forget_original_parent+0xac/0xd0 [ 4.660000] [<4018ce3a>] trace_hardirqs_on+0x1a/0x1c [ 4.660000] [<4014e5c4>] forget_original_parent+0xac/0xd0 [ 4.660000] [<4014e5fe>] exit_notify+0x16/0xf2 [ 4.660000] [<4014e8a5>] do_exit+0x1cb/0x225 [ 4.660000] [<4015c72b>] ? kthread+0x0/0x69 [ 4.660000] [<4011f61d>] kernel_thread_helper+0xd/0x10 [ 4.660000] ---[ end trace a7919e7f17c0a725 ]--- [ 4.660164] .. no entries found ..FAILED! During the selftest of irqsoff tracer, we do that: /* disable interrupts for a bit */ local_irq_disable(); udelay(100); local_irq_enable(); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); If a callsite performs a new max delay with irqs off just after tracing_stop, update_max_tr_single() -> ring_buffer_swap_cpu() will be called with the buffers disabled by tracing_stop(), hence the warning, then ring_buffer_swap_cpu() return -EAGAIN and update_max_tr_single() complains. Fix it by also stopping the tracer before stopping the tracing globally. A similar situation can happen with preemptoff and preemptirqsoff tracers where we apply the same fix. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1237325938-5240-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2ca6f0fef9b..38856ba78a92 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -315,6 +315,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) local_irq_disable(); udelay(100); local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); /* stop the tracing. 
*/ tracing_stop(); /* check both trace buffers */ @@ -369,6 +377,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) preempt_disable(); udelay(100); preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -428,6 +444,13 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -448,6 +471,8 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; tracing_start(); + trace->start(tr); + preempt_disable(); local_irq_disable(); udelay(100); @@ -455,6 +480,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ -- cgit v1.2.3-59-g8ed1b From f02b8624fedca39886b0eef770dca70c2f0749b3 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Wed, 18 Mar 2009 17:06:21 +0530 Subject: kprobes: Fix locking imbalance in kretprobes Fix locking imbalance in kretprobes: ===================================== [ BUG: bad unlock balance detected! ] ------------------------------------- kthreadd/2 is trying to release lock (&rp->lock) at: [] pre_handler_kretprobe+0xea/0xf4 but there are no more locks to release! other info that might help us debug this: 1 lock held by kthreadd/2: #0: (rcu_read_lock){..--}, at: [] __atomic_notifier_call_chain+0x0/0x5a stack backtrace: Pid: 2, comm: kthreadd Not tainted 2.6.29-rc8 #1 Call Trace: [] ? printk+0xf/0x17 [] ? pre_handler_kretprobe+0xea/0xf4 [] print_unlock_inbalance_bug+0xc3/0xce [] ? clocksource_read+0x7/0xa [] ? getnstimeofday+0x5f/0xf6 [] ? register_lock_class+0x17/0x293 [] ? mark_lock+0x1e/0x30b [] ? tick_dev_program_event+0x4a/0xbc [] ? __slab_alloc+0xa5/0x415 [] ? pre_handler_kretprobe+0x28/0xf4 [] ? pre_handler_kretprobe+0xea/0xf4 [] lock_release_non_nested+0xa4/0x1a5 [] ? pre_handler_kretprobe+0xea/0xf4 [] lock_release+0x141/0x166 [] _spin_unlock_irqrestore+0x19/0x50 [] pre_handler_kretprobe+0xea/0xf4 [] kprobe_exceptions_notify+0x1c9/0x43e [] notifier_call_chain+0x26/0x48 [] __atomic_notifier_call_chain+0x37/0x5a [] ? __atomic_notifier_call_chain+0x0/0x5a [] atomic_notifier_call_chain+0xc/0xe [] notify_die+0x2d/0x2f [] do_int3+0x1f/0x71 [] int3+0x2c/0x34 [] ? do_fork+0x1/0x288 [] ? kernel_thread+0x71/0x79 [] ? kthread+0x0/0x60 [] ? kthread+0x0/0x60 [] ? kernel_thread_helper+0x0/0x10 [] kthreadd+0xac/0x148 [] ? 
kthreadd+0x0/0x148 [] kernel_thread_helper+0x7/0x10 Signed-off-by: Ananth N Mavinakayanahalli Tested-by: Bharata B Rao Cc: Masami Hiramatsu Cc: Jim Keniston Cc: Linus Torvalds Cc: Andrew Morton Cc: [2.6.29.x, 2.6.28.x, 2.6.27.x] LKML-Reference: <20090318113621.GB4129@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 479d4d5672f9..5016bfb682b9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -919,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - spin_unlock_irqrestore(&rp->lock, flags); + if (rp->entry_handler && rp->entry_handler(ri, regs)) return 0; - } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3-59-g8ed1b From 4acd4d00f716873e27e7b60ae292cbdbfae674dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Mar 2009 10:40:24 -0400 Subject: tracing: give easy way to clear trace buffer There is currently no easy way to clear the trace buffer. Currently the only way is to change the current tracer. This patch lets the user clear the trace buffer by simply writing into the trace files. echo > /debug/tracing/trace or to clear a single cpu (i.e. for CPU 1): echo > /debug/tracing/per_cpu/cpu1/trace Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a2d13e8c8fd8..8d981ababc45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1941,9 +1941,14 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct trace_iterator *iter = m->private; + struct trace_iterator *iter; int cpu; + if (!(file->f_mode & FMODE_READ)) + return 0; + + iter = m->private; + mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) @@ -1969,12 +1974,24 @@ static int tracing_open(struct inode *inode, struct file *file) struct trace_iterator *iter; int ret = 0; - iter = __tracing_open(inode, file); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) - iter->iter_flags |= TRACE_FILE_LAT_FMT; + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + long cpu = (long) inode->i_private; + + if (cpu == TRACE_PIPE_ALL_CPU) + tracing_reset_online_cpus(&global_trace); + else + tracing_reset(&global_trace, cpu); + } + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } return ret; } @@ -2049,9 +2066,17 @@ static int show_traces_open(struct inode *inode, struct file *file) return ret; } +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, + .write = tracing_write_stub, .llseek = seq_lseek, .release = tracing_release, }; @@ -3576,7 +3601,7 @@ static void tracing_init_debugfs_percpu(long cpu) pr_warning("Could not create debugfs 
'trace_pipe' entry\n"); /* per cpu trace */ - entry = debugfs_create_file("trace", 0444, d_cpu, + entry = debugfs_create_file("trace", 0644, d_cpu, (void *) cpu, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -3890,7 +3915,7 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - entry = debugfs_create_file("trace", 0444, d_tracer, + entry = debugfs_create_file("trace", 0644, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); -- cgit v1.2.3-59-g8ed1b From 09933a108e6730a464a1ab676c9decc11aee0edc Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 22:18:56 +0530 Subject: tracing: fix oops in tracepoint_update_probe_range() Change this crash: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] tracepoint_update_probe_range+0x1f/0x9b PGD 13d5fb067 PUD 13d688067 PMD 0 Oops: 0000 [#1] SMP To a more debuggable WARN_ONCE(). Signed-off-by: Jaswinder Singh Rajput Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain> [ moved the check outside the lock and added a WARN_ON(). ] Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 79602740bbb5..adf28734fe2d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -272,12 +272,17 @@ static void disable_tracepoint(struct tracepoint *elem) * * Updates the probe callback corresponding to a range of tracepoints. */ -void tracepoint_update_probe_range(struct tracepoint *begin, - struct tracepoint *end) +void +tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { struct tracepoint *iter; struct tracepoint_entry *mark_entry; + if (!begin) { + WARN_ON_ONCE(1); + return; + } + mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint(iter->name); -- cgit v1.2.3-59-g8ed1b From ec625cb29e66824f7ce41082617aeb93fa4e42e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 19:54:04 +0100 Subject: tracepoints: dont update zero-sized tracepoint sections Zero-sized tracepoint sections can occur if tracing is enabled but no tracepoint is defined. Do not emit a warning in that case. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Jaswinder Singh Rajput LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index adf28734fe2d..1ef5d3a601c7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -278,10 +278,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) struct tracepoint *iter; struct tracepoint_entry *mark_entry; - if (!begin) { - WARN_ON_ONCE(1); + if (!begin) return; - } mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { -- cgit v1.2.3-59-g8ed1b From df7c8e845e8e2030e8ae947e0ace56d184d0e9a0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance Impact: fix circular locking Steven reports a circular locking from alloc_cpumask_var doing a wakeup. 
We get rid of this using the tried-and-true technique of using a per-cpu cpumask_var_t rather than doing an alloc every time. Simpler and more robust than a rare, implicit allocation within an atomic codepath. Reported-by: Steven Rostedt Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5dabd80c3c15..48862d418bed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3448,19 +3448,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) + int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3615,8 +3619,7 @@ out: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; struct rq *busiest = NULL; @@ -3624,6 +3627,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3764,10 +3768,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3778,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); + sd); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3793,7 +3793,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - free_cpumask_var(tmpmask); } /* @@ -3943,11 +3942,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. 
*/ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3972,7 +3966,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4006,8 +4000,6 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; - - free_cpumask_var(tmp); } /* @@ -8303,6 +8295,9 @@ void __init sched_init(void) #endif #ifdef CONFIG_USER_SCHED alloc_size *= 2; +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size *= num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, @@ -8341,6 +8336,12 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ } #ifdef CONFIG_SMP -- cgit v1.2.3-59-g8ed1b From 4a44bac1f98223ed77e47bf3b42fcfd10cddd85f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Mar 2009 13:21:44 +0100 Subject: symbols, stacktrace: look up init symbols after module symbols Impact: fix incomplete stacktraces I noticed such weird stacktrace entries in lockdep dumps: [ 0.285956] {HARDIRQ-ON-W} state was registered at: [ 0.285956] [] mark_irqflags+0xbe/0x125 [ 0.285956] [] __lock_acquire+0x674/0x82d [ 0.285956] [] lock_acquire+0xfc/0x128 [ 0.285956] [] rt_spin_lock+0xc8/0xd0 [ 0.285956] [] 0xffffffffffffffff The stacktrace entry is cut off after rt_spin_lock. After much debugging i found out that stacktrace entries that belong to init symbols dont get printed out, due to commit: a2da405: module: Don't report discarded init pages as kernel text. The reason is this check added to core_kernel_text(): - if (addr >= (unsigned long)_sinittext && + if (system_state == SYSTEM_BOOTING && + addr >= (unsigned long)_sinittext && addr <= (unsigned long)_einittext) return 1; This will discard inittext symbols even though their symbol table is still present and even though stacktraces done while the system was booting up might still be relevant. To not reintroduce the (not well-specified) bug addressed in that commit, first do a module symbols lookup, then a final init-symbols lookup. This will work fine on architectures that have separate address spaces for modules (such as x86) - and should not crash any other architectures either. 
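A small userspace model of the lookup order being established here, with made-up address ranges and names: core kernel text is checked first, then module text, and only then init text, so a module that happens to occupy a recycled init-text range wins while init symbols referenced from old backtraces still resolve.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct range { unsigned long start, end; };

static const struct range core_text     = { 0x1000, 0x8000 };
static const struct range init_text     = { 0x9000, 0xa000 };       /* freed after boot */
static const struct range module_text[] = { { 0x9000, 0x9800 } };   /* reuses part of it */

static bool in_range(const struct range *r, unsigned long addr)
{
    return addr >= r->start && addr <= r->end;
}

static const char *classify_text_address(unsigned long addr)
{
    if (in_range(&core_text, addr))
        return "core";
    for (size_t i = 0; i < sizeof(module_text) / sizeof(module_text[0]); i++)
        if (in_range(&module_text[i], addr))
            return "module";          /* checked before the init range */
    if (in_range(&init_text, addr))
        return "init";                /* still printable in saved stacktraces */
    return "unknown";
}

int main(void)
{
    printf("0x9400 -> %s\n", classify_text_address(0x9400));  /* module */
    printf("0x9c00 -> %s\n", classify_text_address(0x9c00));  /* init */
    return 0;
}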
Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/extable.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..c46da6a47036 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } +static inline int init_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext && + addr <= (unsigned long)_einittext) + return 1; + return 0; +} + __notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && @@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 1; if (system_state == SYSTEM_BOOTING && - addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + init_kernel_text(addr)) return 1; return 0; } @@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + if (__module_text_address(addr)) + return 1; + /* + * There might be init symbols in saved stacktraces. + * Give those symbols a chance to be printed in + * backtraces (such as lockdep traces). + * + * Since we are after the module-symbols check, there's + * no danger of address overlap: + */ + if (init_kernel_text(addr)) + return 1; + return 0; } int kernel_text_address(unsigned long addr) -- cgit v1.2.3-59-g8ed1b From 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance, fix Impact: fix boot crash Fix typo in the size calculation. Reported-by: Ingo Molnar Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48862d418bed..11dd52780adb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8297,7 +8297,7 @@ void __init sched_init(void) alloc_size *= 2; #endif #ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size *= num_possible_cpus() * cpumask_size(); + alloc_size += num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, -- cgit v1.2.3-59-g8ed1b From ac5f6c96859e9a664ac05b04bc96ed1caad5fe29 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 11:29:23 -0400 Subject: function-graph: consolidate prologues for output Impact: clean up The prologue of the function graph entry, return and comments all start out pretty much the same. Each of these duplicate code and do so slightly differently. This patch consolidates the printing of the pid, absolute time, cpu and proc (and for entry, the interrupt). 
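Reduced to a userspace toy with invented names and columns, the structure after the consolidation looks like the sketch below: the entry, return and comment printers all delegate the shared columns to one prologue helper, and only the entry path can ask for the extra interrupt marker.

#include <stdio.h>

static void print_prologue(int pid, int cpu, int irq_marker)
{
    printf("%5d) %3d  ", pid, cpu);   /* shared pid/cpu columns */
    if (irq_marker)
        printf("==> ");               /* requested by the entry path only */
}

static void print_entry(int pid, int cpu, const char *func, int in_irq)
{
    print_prologue(pid, cpu, in_irq);
    printf("%s() {\n", func);
}

static void print_return(int pid, int cpu)
{
    print_prologue(pid, cpu, 0);
    printf("}\n");
}

static void print_comment(int pid, int cpu, const char *msg)
{
    print_prologue(pid, cpu, 0);
    printf("/* %s */\n", msg);
}

int main(void)
{
    print_entry(1234, 0, "do_sys_open", 0);
    print_comment(1234, 0, "trace_printk output");
    print_return(1234, 0);
    print_entry(1234, 0, "irq_handler", 1);   /* entry into an interrupt */
    print_return(1234, 0);
    return 0;
}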
Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 98 +++++++++++------------------------- 1 file changed, 29 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6004ccac2dd7..2d4d185cd0d8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -554,24 +554,24 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } static enum print_line_t -print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr) { - int ret; - int cpu = iter->cpu; - pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; - struct ftrace_graph_ent *call = &field->graph_ent; - struct ftrace_graph_ret_entry *leaf_ret; + pid_t *last_pid = iter->private; + int cpu = iter->cpu; + int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (type) { + /* Interrupt */ + ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Absolute time */ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { @@ -598,6 +598,20 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + return 0; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter) +{ + int cpu = iter->cpu; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + return TRACE_TYPE_PARTIAL_LINE; + leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) return print_graph_entry_leaf(iter, field, leaf_ret, s); @@ -613,38 +627,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int i; int ret; int cpu = iter->cpu; - pid_t *last_pid = iter->private, pid = ent->pid; + pid_t pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; - /* Pid */ - if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Overhead */ ret = print_graph_overhead(duration, s); if (!ret) @@ -689,38 +677,10 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, { int i; int ret; - int cpu = iter->cpu; - pid_t *last_pid = iter->private; - /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == 
TRACE_TYPE_PARTIAL_LINE) + if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - /* No overhead */ ret = print_graph_overhead(-1, s); if (!ret) -- cgit v1.2.3-59-g8ed1b From 3bf832ce1fe6988148d392599f34ca0c6a34427d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Mar 2009 14:47:33 +0100 Subject: tracing/ring-buffer: fix non cpu hotplug case Impact: fix warning with irqsoff tracer The ring buffer allocates its buffers on pre-smp time (early_initcall). It means that, at first, only the boot cpu buffer is allocated and the ring-buffer cpumask only has the boot cpu set (cpu_online_mask). Later, the secondary cpu will show up and the ring-buffer will be notified about this event: the appropriate buffer will be allocated and the cpumask will be updated. Unfortunately, if !CONFIG_CPU_HOTPLUG, the ring-buffer will not be notified about the secondary cpus, meaning that the cpumask will have only the cpu boot set, and only one cpu buffer allocated. We fix that by using cpu_possible_mask if !CONFIG_CPU_HOTPLUG. This patch fixes the following warning with irqsoff tracer running: [ 169.317794] WARNING: at kernel/trace/trace.c:466 update_max_tr_single+0xcc/0xf3() [ 169.318002] Hardware name: AMILO Li 2727 [ 169.318002] Modules linked in: [ 169.318002] Pid: 5624, comm: bash Not tainted 2.6.29-rc8-tip-02636-g6aafa6c #11 [ 169.318002] Call Trace: [ 169.318002] [] warn_slowpath+0xea/0x13d [ 169.318002] [] ? ftrace_call+0x5/0x2b [ 169.318002] [] ? ftrace_call+0x5/0x2b [ 169.318002] [] ? ftrace_call+0x0/0x2b [ 169.318002] [] ? ftrace_modify_code+0xa9/0x108 [ 169.318002] [] ? trace_hardirqs_off+0x25/0x27 [ 169.318002] [] ? _spin_unlock_irqrestore+0x1f/0x2d [ 169.318002] [] ? ring_buffer_reset_cpu+0xf6/0xfb [ 169.318002] [] ? ring_buffer_reset+0x36/0x48 [ 169.318002] [] update_max_tr_single+0xcc/0xf3 [ 169.318002] [] ? sysret_check+0x22/0x5d [ 169.318002] [] stop_critical_timing+0x142/0x204 [ 169.318002] [] trace_hardirqs_on_caller+0x23/0x25 [ 169.318002] [] trace_hardirqs_on_thunk+0x3a/0x3c [ 169.318002] [] ? sysret_check+0x22/0x5d [ 169.318002] ---[ end trace db76cbf775a750cf ]--- Because this tracer may try to swap two cpu ring buffers for an unregistered cpu on the ring buffer. This patch might also fix a fair loss of traces due to unallocated buffers for secondary cpus. 
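The failure mode and the fix can be shown with a toy model (two possible CPUs, invented names): sized from the boot-time online mask the secondary CPU never gets a buffer and its events vanish, sized from the possible mask it does.

#include <stdio.h>

#define NR_CPUS 2

static int buffer_present[NR_CPUS];

static void alloc_buffers(const int *mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        buffer_present[cpu] = mask[cpu];
}

static void record(int cpu)
{
    if (!buffer_present[cpu]) {
        printf("cpu%d: event dropped, no buffer\n", cpu);
        return;
    }
    printf("cpu%d: event recorded\n", cpu);
}

int main(void)
{
    int online_at_early_init[NR_CPUS] = { 1, 0 };  /* boot CPU only */
    int possible_cpus[NR_CPUS]        = { 1, 1 };

    /* old behaviour when no hotplug notifier will ever fire */
    alloc_buffers(online_at_early_init);
    record(1);                                     /* dropped */

    /* behaviour after this patch for !CONFIG_HOTPLUG_CPU */
    alloc_buffers(possible_cpus);
    record(1);                                     /* recorded */
    return 0;
}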
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Acked-b: Steven Rostedt LKML-Reference: <1237470453-5427-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bbf51922a8ca..384ca5d9d729 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -577,8 +577,17 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU get_online_cpus(); cpumask_copy(buffer->cpumask, cpu_online_mask); +#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; -- cgit v1.2.3-59-g8ed1b From 5ef841f6f32dce0b752a4fa0622781ee67a0e874 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 12:20:38 -0400 Subject: tracing: make print_(b)printk_msg_only global This patch makes print_printk_msg_only and print_bprintk_msg_only global for other functions to use. It also renames them by adding a "trace_" to the beginning to avoid namespace collisions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 ++---------------------------------- kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 5 +++++ 3 files changed, 39 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d981ababc45..c637cb687cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1694,38 +1694,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } -static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct bprint_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct print_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%s", field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1787,12 +1755,12 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_bprintk_msg_only(iter); + return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_printk_msg_only(iter); + return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6a4c9dea191e..b45141748af5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 
+19,38 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%s", field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 3b90e6ade1aa..35c422fb51a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -15,6 +15,11 @@ struct trace_event { trace_print_func binary; }; +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int -- cgit v1.2.3-59-g8ed1b From 2fbcdb35aca614f9529a0e7d340146cf0b71684f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 13:24:42 -0400 Subject: function-graph: calculate function depth within function graph tracer Currently, the function graph tracer depends on the trace_printk to record the depth. All the information is already there in the trace to calculate function depth, with the exception of having the printk be the first item. But as soon as a entry or exit is reached, then we know the depth. This patch changes the iter->private data from recording a per cpu last_pid, to a structure that holds both the last_pid and the current depth. This data is used to determine the function depth for the printks. 
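How the cached depth drives the comment indentation can be sketched in userspace. The event stream and names below are invented; the real structure lives in per-cpu storage and also carries last_pid for the pid-change annotation, and the depth rules loosely follow the hunks that come next.

#include <stdio.h>

struct fgraph_state {
    int last_pid;   /* pid-change detection, unused in this sketch */
    int depth;      /* depth of the last entry/return seen on this cpu */
};

static void indent(int depth)
{
    for (int i = 0; i < depth; i++)
        printf("  ");
}

static void on_entry(struct fgraph_state *st, const char *func, int call_depth)
{
    st->depth = call_depth;          /* comments inside print at depth + 1 */
    indent(call_depth);
    printf("%s() {\n", func);
}

static void on_comment(struct fgraph_state *st, const char *msg)
{
    indent(st->depth + 1);           /* "comments display at + 1 to depth" */
    printf("/* %s */\n", msg);
}

static void on_return(struct fgraph_state *st, int call_depth)
{
    st->depth = call_depth - 1;      /* back at the closing bracket's level */
    indent(call_depth);
    printf("}\n");
}

int main(void)
{
    struct fgraph_state st = { .last_pid = -1, .depth = -1 };

    on_entry(&st, "sys_read", 0);
    on_entry(&st, "vfs_read", 1);
    on_comment(&st, "trace_printk lands here, two levels in");
    on_return(&st, 1);
    on_comment(&st, "after the return it lines up with the bracket");
    on_return(&st, 0);
    return 0;
}

With the depth recovered from the entry and return records this way, the comment records themselves no longer need to carry it.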
Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 91 +++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d4d185cd0d8..66ea23b64fe6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,11 @@ #include "trace.h" #include "trace_output.h" +struct fgraph_data { + pid_t last_pid; + int depth; +}; + #define TRACE_GRAPH_INDENT 2 /* Flag options */ @@ -231,16 +236,16 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; int ret; - if (!last_pids_cpu) + if (!data) return TRACE_TYPE_HANDLED; - last_pid = per_cpu_ptr(last_pids_cpu, cpu); + last_pid = &(per_cpu_ptr(data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -471,6 +476,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) { + struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -481,6 +487,18 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. 
+ */ + *depth = call->depth - 1; + } + /* Overhead */ ret = print_graph_overhead(duration, s); if (!ret) @@ -512,12 +530,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, } static enum print_line_t -print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, pid_t pid, int cpu) +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu) { - int i; - int ret; struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + int ret; + int i; + + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + *depth = call->depth; + } /* No overhead */ ret = print_graph_overhead(-1, s); @@ -557,13 +584,13 @@ static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, int type, unsigned long addr) { + struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; - pid_t *last_pid = iter->private; int cpu = iter->cpu; int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; if (type) { @@ -616,7 +643,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (leaf_ret) return print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(field, s, iter->ent->pid, cpu); + return print_graph_entry_nested(iter, field, s, cpu); } @@ -624,11 +651,24 @@ static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { - int i; - int ret; - int cpu = iter->cpu; - pid_t pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int ret; + int i; + + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. 
+ */ + *depth = trace->depth - 1; + } if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -675,8 +715,13 @@ static enum print_line_t print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { - int i; + struct fgraph_data *data = iter->private; + int depth = 0; int ret; + int i; + + if (data) + depth = per_cpu_ptr(data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -694,8 +739,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, } /* Indentation */ - if (trace->depth > 0) - for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -780,19 +825,21 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { - /* pid on the last trace processed */ - pid_t *last_pid = alloc_percpu(pid_t); + /* pid and depth on the last trace processed */ + struct fgraph_data *data = alloc_percpu(struct fgraph_data); int cpu; - if (!last_pid) + if (!data) pr_warning("function graph tracer: not enough memory\n"); else for_each_possible_cpu(cpu) { - pid_t *pid = per_cpu_ptr(last_pid, cpu); + pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data, cpu)->depth); *pid = -1; + *depth = 0; } - iter->private = last_pid; + iter->private = data; } static void graph_trace_close(struct trace_iterator *iter) -- cgit v1.2.3-59-g8ed1b From 40ce74f19c28077550646c76d96a075bf312e461 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 14:03:53 -0400 Subject: tracing: remove recording function depth from trace_printk The function depth in trace_printk was to facilitate the function graph output. Now that the function graph calculates the depth within the trace output, we no longer need to record the depth when the trace_printk is called. 
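The externally visible change is just the dropped parameter; a before/after sketch of the prototypes (the full call-site updates are in the diff below):

/* before: callers had to pass the current return-stack depth */
int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);

/* after: depth is gone, the graph tracer derives it from entry/exit events */
int trace_vprintk(unsigned long ip, const char *fmt, va_list args);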
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 6 ++---- kernel/trace/trace_event_types.h | 2 -- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_printk.c | 8 ++++---- 5 files changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c637cb687cf2..f7f359d45823 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1194,7 +1194,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1236,7 +1236,6 @@ int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); @@ -1254,7 +1253,7 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; @@ -1291,7 +1290,6 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; @@ -3140,7 +3138,7 @@ static int mark_printk(const char *fmt, ...) int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, -1, fmt, args); + ret = trace_vprintk(0, fmt, args); va_end(args); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 38276d1638e3..7c9a0cbf5dca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -123,7 +123,6 @@ struct userstack_entry { struct bprint_entry { struct trace_entry ent; unsigned long ip; - int depth; const char *fmt; u32 buf[]; }; @@ -131,7 +130,6 @@ struct bprint_entry { struct print_entry { struct trace_entry ent; unsigned long ip; - int depth; char buf[]; }; @@ -598,9 +596,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int -trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int -trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 019915063fe6..fd78bee71dd7 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -105,7 +105,6 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), @@ -115,7 +114,6 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, 
depth) TRACE_FIELD_ZERO_CHAR(buf) ), TP_RAW_FMT("%08lx (%d) fmt:%p %s") diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f095916e477f..8e37fcddd8b4 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -359,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, -1, fmt, args); + return trace_vprintk(0, fmt, args); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 486785214e3e..eb81556107fe 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -112,7 +112,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -126,7 +126,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vbprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vbprintk); @@ -139,7 +139,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -150,7 +150,7 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); -- cgit v1.2.3-59-g8ed1b From 5087f8d2a2f2daff5a913d72d8ea3ad601948e10 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 15:14:46 -0400 Subject: function-graph: show binary events as comments With the added TRACE_EVENT macro, the events no longer appear in the function graph tracer. This was because the function graph did not know how to display the entries. The graph tracer was only aware of its own entries and the printk entries. By using the event call back feature, the graph tracer can now display the events. # echo irq > /debug/tracing/set_event Which can show: 0) | handle_IRQ_event() { 0) | /* irq_handler_entry: irq=48 handler=eth0 */ 0) | e1000_intr() { 0) 0.926 us | __napi_schedule(); 0) 3.888 us | } 0) | /* irq_handler_exit: irq=48 return=handled */ 0) 0.655 us | runqueue_is_locked(); 0) | __wake_up() { 0) 0.831 us | _spin_lock_irqsave(); The irq entry and exit events show up as comments. 
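The dispatch that makes this possible is roughly the following, a simplified sketch of the switch this patch adds inside print_graph_comment() (ent is iter->ent, declarations and error handling trimmed):

switch (iter->ent->type) {
case TRACE_BPRINT:
        ret = trace_print_bprintk_msg_only(iter);
        break;
case TRACE_PRINT:
        ret = trace_print_printk_msg_only(iter);
        break;
default:
        /* any other event type: let its registered handler render the text */
        event = ftrace_find_event(ent->type);
        if (!event)
                return TRACE_TYPE_UNHANDLED;
        ret = event->trace(iter, sym_flags);
}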
Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 40 +++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66ea23b64fe6..e876816fa8e7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -712,10 +712,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter) { + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; + struct trace_event *event; int depth = 0; int ret; int i; @@ -751,9 +753,26 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_bprintf(s, trace->fmt, trace->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + switch (iter->ent->type) { + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = event->trace(iter, sym_flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } /* Strip ending newline */ if (s->buffer[s->len - 1] == '\n') { @@ -772,8 +791,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, enum print_line_t print_graph_function(struct trace_iterator *iter) { - struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -786,14 +805,11 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_BPRINT: { - struct bprint_entry *field; - trace_assign_type(field, entry); - return print_graph_comment(field, s, entry, iter); - } default: - return TRACE_TYPE_UNHANDLED; + return print_graph_comment(s, entry, iter); } + + return TRACE_TYPE_HANDLED; } static void print_graph_headers(struct seq_file *s) -- cgit v1.2.3-59-g8ed1b From 23725aeeab10ba02bcf10ec49ad73146b54cb52f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:13 +0100 Subject: ftrace: provide an id file for each event Since not every event has a format file to read the id from, expose it explicitly in a separate file. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.372534033@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c88227b3b9db..7763db8fd0b3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -412,6 +412,29 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, return r; } +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + trace_seq_printf(s, "%d\n", call->id); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + kfree(s); + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -452,6 +475,11 @@ static const struct file_operations ftrace_event_format_fops = { .read = event_format_read, }; +static const struct file_operations ftrace_event_id_fops = { + .open = tracing_open_generic, + .read = event_id_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -550,6 +578,14 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } + if (call->id) { + entry = debugfs_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); + if (!entry) + pr_warning("Could not create debugfs '%s/id' entry\n", + call->name); + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; -- cgit v1.2.3-59-g8ed1b From 28bea271e58e429eccfad3d7ee2ad12d6ee015bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:14 +0100 Subject: ftrace: ensure every event gets an id Impact: widen user-space visible event IDs to all events Previously only TRACE_EVENT events got ids, because only they generated raw output which needs to be demuxed from the trace. In order to provide a unique ID for each event, register everybody, regardless.
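In effect every generated event now gets an init step along these lines (a sketch of the macro expansion in the diff below, with event_example standing in for the per-event structure):

static struct ftrace_event_call event_example;

static int ftrace_init_event_example(void)
{
        int id;

        /* passing NULL merely reserves the next free event type id */
        id = register_ftrace_event(NULL);
        if (!id)
                return -ENODEV;
        event_example.id = id;
        return 0;
}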
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.464914218@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_3.h | 15 ++++++++++++++- kernel/trace/trace_output.c | 5 +++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ae2e323df0c7..4c26d97b4508 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -130,7 +130,19 @@ static void ftrace_unreg_event_##call(void) \ { \ unregister_trace_##call(ftrace_event_##call); \ } \ - + \ +static struct ftrace_event_call event_##call; \ + \ +static int ftrace_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(NULL); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) \ @@ -140,6 +152,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_init_event_##call, \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b45141748af5..19261fdd2455 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -481,6 +481,11 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); + if (!event) { + ret = next_event_type++; + goto out; + } + if (!event->type) event->type = next_event_type++; else if (event->type > __TRACE_LAST_TYPE) { -- cgit v1.2.3-59-g8ed1b From ac199db0189c091f2863312061c0575937f68810 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:15 +0100 Subject: ftrace: event profile hooks Impact: new tracing infrastructure feature Provide infrastructure to generate software perf counter events from tracepoints. 
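The resulting interface is deliberately small. A hedged usage sketch from the consumer side (event_id would come from the per-event id file added earlier; error handling kept minimal):

int err;

/* first enable registers the tracepoint probe and starts counting */
err = ftrace_profile_enable(event_id);
if (err)
        return err;

/* ... perf counter lifetime ... */

/* last disable unregisters the probe again */
ftrace_profile_disable(event_id);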
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.557364871@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/events.c | 1 - kernel/trace/trace.h | 11 ++++++++++ kernel/trace/trace_event_profile.c | 31 ++++++++++++++++++++++++++ kernel/trace/trace_events.c | 9 ++------ kernel/trace/trace_events_stage_3.h | 44 +++++++++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 kernel/trace/trace_event_profile.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3feea01c3e0..0e45c206c2f9 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o libftrace-y := ftrace.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 9fc918da404f..246f2aa6dc46 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -12,4 +12,3 @@ #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" -#include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7c9a0cbf5dca..7cfb741be200 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -785,12 +785,23 @@ struct ftrace_event_call { int id; int (*raw_init)(void); int (*show_format)(struct trace_seq *s); + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif }; void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c new file mode 100644 index 000000000000..22cba9970776 --- /dev/null +++ b/kernel/trace/trace_event_profile.c @@ -0,0 +1,31 @@ +/* + * trace event based perf counter profiling + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * + */ + +#include "trace.h" + +int ftrace_profile_enable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_enable(event); + } + + return -EINVAL; +} + +void ftrace_profile_disable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_disable(event); + } +} + diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7763db8fd0b3..3047b56f6637 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,11 +19,6 @@ static DEFINE_MUTEX(event_mutex); -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; @@ -90,7 +85,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - events_for_each(call) { + for_each_event(call) { if (!call->name || !call->regfunc) continue; @@ -628,7 +623,7 @@ 
static __init int event_trace_init(void) if (!d_events) return 0; - events_for_each(call) { + for_each_event(call) { /* The linker may leave blanks */ if (!call->name) continue; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 4c26d97b4508..6b3261ca988c 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -109,6 +109,40 @@ #undef TP_FMT #define TP_FMT(fmt, args...) fmt "\n", ##args +#ifdef CONFIG_EVENT_PROFILE +#define _TRACE_PROFILE(call, proto, args) \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int); \ + perf_tpcounter_event(event_##call.id); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +{ \ + if (atomic_add_negative(-1, &call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#define _TRACE_PROFILE_INIT(call) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = ftrace_profile_enable_##call, \ + .profile_disable = ftrace_profile_disable_##call, + +#else +#define _TRACE_PROFILE(call, proto, args) +#define _TRACE_PROFILE_INIT(call) +#endif + #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ @@ -147,6 +181,7 @@ static int ftrace_init_event_##call(void) \ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) \ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ @@ -155,6 +190,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = ftrace_init_event_##call, \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ + _TRACE_PROFILE_INIT(call) \ } #undef __entry @@ -162,6 +198,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -227,4 +264,11 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + _TRACE_PROFILE_INIT(call) \ } + +#include + +#undef _TRACE_PROFILE +#undef _TRACE_PROFILE_INIT + -- cgit v1.2.3-59-g8ed1b From 505f2b970b2269ce4cb669b3ff4f6479d379cec2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Mar 2009 11:05:04 +0100 Subject: tracing, Text Edit Lock - kprobes architecture independent support, nommu fix Impact: build fix on SH !CONFIG_MMU Stephen Rothwell reported this linux-next build failure on the SH architecture: kernel/built-in.o: In function `disable_all_kprobes': kernel/kprobes.c:1382: undefined reference to `text_mutex' [...] And observed: | Introduced by commit 4460fdad85becd569f11501ad5b91814814335ff ("tracing, | Text Edit Lock - kprobes architecture independent support") from the | tracing tree. text_mutex is defined in mm/memory.c which is only built | if CONFIG_MMU is defined, which is not true for sh allmodconfig. 
Move this lock to kernel/extable.c (which is already home to various kernel text related routines), which file is always built-in. Reported-by: Stephen Rothwell Cc: Paul Mundt Cc: Mathieu Desnoyers LKML-Reference: <20090320110602.86351a91.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- kernel/extable.c | 14 ++++++++++++-- mm/memory.c | 8 -------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 0df6253730be..25d39b0c3a1b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -15,11 +15,21 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include +#include #include -#include -#include + #include +#include + +/* + * mutex protecting text section modification (dynamic code patching). + * some users need to sleep (allocating memory...) while they hold this lock. + * + * NOT exported to modules - patching kernel text is a really delicate matter. + */ +DEFINE_MUTEX(text_mutex); extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; diff --git a/mm/memory.c b/mm/memory.c index 05fab3bc5b4b..dfc9e4ea4e8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -101,14 +101,6 @@ int randomize_va_space __read_mostly = 2; #endif -/* - * mutex protecting text section modification (dynamic code patching). - * some users need to sleep (allocating memory...) while they hold this lock. - * - * NOT exported to modules - patching kernel text is a really delicate matter. - */ -DEFINE_MUTEX(text_mutex); - static int __init disable_randmaps(char *s) { randomize_va_space = 0; -- cgit v1.2.3-59-g8ed1b From 09c9e84d474d917d9de5b9011ed2064b03a19677 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 Mar 2009 04:33:36 +0100 Subject: tracing/ring-buffer: don't annotate rb_cpu_notify with __cpuinit Impact: remove a section warning CONFIG_DEBUG_SECTION_MISMATCH raises the following warning on -tip: WARNING: kernel/trace/built-in.o(.text+0x5bc5): Section mismatch in reference from the function ring_buffer_alloc() to the function .cpuinit.text:rb_cpu_notify() The function ring_buffer_alloc() references the function __cpuinit rb_cpu_notify(). This is actually harmless. The code in the ring buffer don't build rb_cpu_notify and other cpu hotplug stuffs when !CONFIG_HOTPLUG_CPU so we have no risk to reference freed memory here (it would even be harmless if we unconditionally build it because register_cpu_notifier would do nothing when !CONFIG_HOTPLUG_CPU. But since ring_buffer_alloc() can be called everytime, we don't want it to be annotated with __cpuinit so we drop the __cpuinit from rb_cpu_notify. This is not a waste of memory because it is only defined and used on CONFIG_HOTPLUG_CPU. 
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237606416-22268-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..808b14bbf076 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -535,8 +535,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) extern int ring_buffer_page_too_big(void); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #endif /** @@ -2784,8 +2784,8 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); -- cgit v1.2.3-59-g8ed1b From 1a17662ea033674a58bad3603531b0b5d42572f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:47:30 +0800 Subject: blktrace: fix possible memory leak When we failed to create "block" debugfs dir, we should do some cleanups. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5B2.8000800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b171778e3863..fb3bc53835dd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -432,7 +432,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); if (!blk_tree_root) - return -ENOMEM; + goto err; } dir = debugfs_create_dir(buts->name, blk_tree_root); -- cgit v1.2.3-59-g8ed1b From 5006ea73f38caef6065d1136808413813271633f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:03 +0800 Subject: blktrace: make blk_tracer_enabled a bool flag It doesn't have to be a counter, and it can be a bool flag instead. 
Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C2F5D3.8090104@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb3bc53835dd..73845b7968bb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -30,7 +30,7 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; +static bool blk_tracer_enabled __read_mostly; /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -1111,9 +1111,7 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); + blk_tracer_enabled = true; return 0; } @@ -1131,11 +1129,7 @@ static void blk_tracer_reset(struct trace_array *tr) if (!atomic_read(&blk_probes_ref)) return; - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - + blk_tracer_enabled = false; blk_tracer_stop(tr); } -- cgit v1.2.3-59-g8ed1b From 3c289ba7c320560ee74979a8895141c829046a2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:26 +0800 Subject: blktrace: remove blk_probe_mutex blk_register_tracepoints() always returns 0, so make it return void, thus we don't need to use blk_probe_mutex to protect blk_probes_ref. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5EA.8060606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 73845b7968bb..223b92e77b3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); -static int blk_register_tracepoints(void); +static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* @@ -256,10 +255,8 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -471,13 +468,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); @@ -487,9 +479,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); err: if (bt) { if (bt->msg_file) @@ -863,7 +852,7 @@ void blk_add_driver_data(struct request_queue *q, } 
EXPORT_SYMBOL_GPL(blk_add_driver_data); -static int blk_register_tracepoints(void) +static void blk_register_tracepoints(void) { int ret; @@ -901,7 +890,6 @@ static int blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); - return 0; } static void blk_unregister_tracepoints(void) @@ -1099,11 +1087,8 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); + blk_register_tracepoints(); trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1118,10 +1103,8 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } static void blk_tracer_reset(struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From cbe28296eb1ac441b35cf45804d0ae808add7dd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:47 +0800 Subject: blktrace: don't increase blk_probes_ref if failed to setup blk trace do_blk_trace_setup() may return EBUSY, but the current code doesn't decrease blk_probes_ref in this case. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5FF.80002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 223b92e77b3f..11e7c8d9d222 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -468,9 +468,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - if (atomic_add_return(1, &blk_probes_ref) == 1) - blk_register_tracepoints(); - ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -478,6 +475,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; err: if (bt) { -- cgit v1.2.3-59-g8ed1b From 15152e448b693fa41de40f1e40ffbe717a3aab88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:49:08 +0800 Subject: blktrace: report EBUSY correctly blk_trace_remove_queue() returns EINVAL if q->blk_trace == NULL, but blk_trace_setup_queue() doesn't return EBUSY if q->blk_trace != NULL. 
# echo 0 > sdaX/trace/enable # echo 0 > sdaX/trace/enable bash: echo: write error: Invalid argument # echo 1 > sdaX/trace/enable # echo 1 > sdaX/trace/enable (should return EBUSY) Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F614.2010101@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11e7c8d9d222..14986afdbc1c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1260,12 +1260,10 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; - int ret; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; bt->dev = dev; bt->act_mask = (u16)-1; @@ -1276,11 +1274,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); kfree(bt); - ret = -EBUSY; + return -EBUSY; } + return 0; -err: - return ret; } /* -- cgit v1.2.3-59-g8ed1b From cd649b8bb830d65c57c3c8b98d57b5402256d8bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 11:33:55 +0800 Subject: blktrace: remove sysfs_blk_trace_enable_show/store() sysfs_blk_trace_enable_show()/store() share most of code with sysfs_blk_trace_attr_show()/store(). Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30EA3.1060004@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 92 ++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 14986afdbc1c..dfee6f915179 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1284,72 +1284,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) * sysfs interface to enable and configure tracing */ -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - static ssize_t 
sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -1361,8 +1295,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); @@ -1447,6 +1380,12 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) @@ -1457,6 +1396,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); @@ -1499,6 +1440,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + ret = 0; if (q->blk_trace == NULL) ret = blk_trace_setup_queue(q, bdev->bd_dev); @@ -1512,13 +1462,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, q->blk_trace->start_lba = value; else if (attr == &dev_attr_end_lba) q->blk_trace->end_lba = value; - ret = count; } + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); out_unlock_kernel: unlock_kernel(); out: - return ret; + return ret ? ret : count; } + -- cgit v1.2.3-59-g8ed1b From b125130b22d67f249beba10b71a254558b5279d0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 10:34:00 +0800 Subject: blktrace: avoid accessing NULL bdev->bd_disk bdev->bd_disk can be NULL, if the block device is not opened. 
Try this against an unmounted partition, and you'll see NULL dereference: # echo 1 > /sys/block/sda/sda5/enable Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30098.6080107@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dfee6f915179..108f4f7715a5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1362,6 +1362,14 @@ static int blk_str2act_mask(const char *str) return mask; } +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1376,9 +1384,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; + mutex_lock(&bdev->bd_mutex); if (attr == &dev_attr_enable) { @@ -1435,7 +1444,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; -- cgit v1.2.3-59-g8ed1b From cf586b61f80229491127d3c57c06ed93c9f530d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 05:04:35 +0100 Subject: tracing/function-graph-tracer: prevent hangs during self-tests Impact: detect tracing related hangs Sometimes, with some configs, the function graph tracer can make the timer interrupt too much slow, hanging the kernel in an endless loop of timer interrupts servicing. As suggested by Ingo, this patch brings a watchdog which stops the selftest after a defined number of functions traced, definitely disabling this tracer. For those who want to debug the cause of the function graph trace hang, you can pass the ftrace_dump_on_oops kernel parameter to dump the traces after this hang detection. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 +++++++++++++++++++++++--- kernel/trace/trace_selftest.c | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3dfefe69348..e6fac0ffe6f0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4018,11 +4018,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void ftrace_dump(void) +static void __ftrace_dump(bool disable_tracing) { static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + unsigned int old_userobj; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -4034,14 +4035,17 @@ void ftrace_dump(void) dump_ran = 1; - /* No turning back! 
*/ tracing_off(); - ftrace_kill(); + + if (disable_tracing) + ftrace_kill(); for_each_tracing_cpu(cpu) { atomic_inc(&global_trace.data[cpu]->disabled); } + old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; @@ -4086,10 +4090,26 @@ void ftrace_dump(void) else printk(KERN_TRACE "---------------------------------\n"); + /* Re-enable tracing if requested */ + if (!disable_tracing) { + trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + tracing_on(); + } + out: spin_unlock_irqrestore(&ftrace_dump_lock, flags); } +/* By default: disable tracing after the dump */ +void ftrace_dump(void) +{ + __ftrace_dump(true); +} + __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 38856ba78a92..b56dcf7d3566 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,28 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static void __ftrace_dump(bool disable_tracing); +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) + __ftrace_dump(false); + return 0; + } + + return trace_graph_entry(trace); +} + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -259,15 +281,29 @@ trace_selftest_startup_function_graph(struct tracer *trace, int ret; unsigned long count; - ret = tracer_init(trace, tr); + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(tr); + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry_watchdog); if (ret) { warn_failed_init_tracer(trace, ret); goto out; } + tracing_start_cmdline_record(); /* Sleep for a 1/10 of a second */ msleep(100); + /* Have we just recovered from a hang? */ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + trace->reset(tr); + ret = -1; + goto out; + } + tracing_stop(); /* check the trace buffer */ -- cgit v1.2.3-59-g8ed1b From 0cf53ff62b3e9e491ff5e5f05b193fb6ce643047 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 15:13:07 +0100 Subject: tracing: keep the tracing buffer after self-test failure Instead of using ftrace_dump_on_oops, it's far more convenient to have the trace leading up to a self-test failure available in /debug/tracing/trace. 
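Concretely, the self-test failure path now only flags the failure instead of resetting the tracer, so the buffer is left intact for inspection (a sketch of the branch, matching the one-line change below):

/* Have we just recovered from a hang? */
if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
        /* keep the trace buffer: just disable further self-tests */
        tracing_selftest_disabled = true;
        ret = -1;
        goto out;
}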
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b56dcf7d3566..08f4eb2763d1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -299,7 +299,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* Have we just recovered from a hang? */ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - trace->reset(tr); + tracing_selftest_disabled = true; ret = -1; goto out; } -- cgit v1.2.3-59-g8ed1b From cf027f645e6aee4f0ca6197a6b6a57f327fdb13f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:39 -0500 Subject: tracing: add run-time field descriptions for event filtering This patch makes the field descriptions defined for event tracing available at run-time, for the event-filtering mechanism introduced in a subsequent patch. The common event fields are prepended with 'common_' in the format display, allowing them to be distinguished from the other fields that might internally have same name and can therefore be unambiguously used in filters. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 30 +++++++++++++++++-------- kernel/trace/trace_events.c | 40 ++++++++++++++++++++++++++++++++- kernel/trace/trace_events_stage_2.h | 45 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 2 ++ 4 files changed, 107 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..9288dc7ad14f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -775,16 +775,26 @@ enum { TRACE_EVENT_TYPE_RAW = 2, }; +struct ftrace_event_field { + struct list_head link; + char *name; + char *type; + int offset; + int size; +}; + struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -793,6 +803,8 @@ struct ftrace_event_call { #endif }; +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3047b56f6637..961b057da28b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,34 @@ static DEFINE_MUTEX(event_mutex); +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size) +{ + struct ftrace_event_field *field; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + if (!field) + goto err; + field->name = kstrdup(name, GFP_KERNEL); + if (!field->name) + goto err; + field->type = kstrdup(type, GFP_KERNEL); + if (!field->type) + goto err; + 
field->offset = offset; + field->size = size; + list_add(&field->link, &call->fields); + + return 0; +err: + if (field) { + kfree(field->name); + kfree(field->type); + } + kfree(field); + return -ENOMEM; +} + static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; @@ -343,7 +371,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) + #type, "common_" #name, offsetof(typeof(field), name), \ + sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -581,6 +610,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) call->name); } + if (call->define_fields) { + ret = call->define_fields(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 5117c43f5c67..30743f7d4110 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s) \ } #include + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6b3261ca988c..468938f70141 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -252,6 +252,7 @@ static int ftrace_raw_init_event_##call(void) \ if (!id) \ return -ENODEV; \ event_##call.id = id; \ + INIT_LIST_HEAD(&event_##call.fields); \ return 0; \ } \ \ @@ -264,6 +265,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ _TRACE_PROFILE_INIT(call) \ } -- cgit v1.2.3-59-g8ed1b From f80d2d7725b04f8225b11b55e43bb2c77c819926 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:10 +0200 Subject: tracing, Text Edit Lock: Fix one sparse warning in kernel/extable.c Impact: cleanup. The global mutex text_mutex is declared in linux/memory.h, so this file needs to be included into kernel/extable.c, where the same mutex is defined.
This fixes the following sparse warning: kernel/extable.c:32:1: warning: symbol 'text_mutex' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-3-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/extable.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 25d39b0c3a1b..b54a6017b6b5 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -16,6 +16,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From b8b94265337f83b7db9c5f429b1769d463d7da8c Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:11 +0200 Subject: tracing: fix four sparse warnings Impact: cleanup. This patch fixes the following sparse warnings: kernel/trace/trace.c:385:9: warning: symbol 'trace_seq_to_buffer' was not declared. Should it be static? kernel/trace/trace_clock.c:29:13: warning: symbol 'trace_clock_local' was not declared. Should it be static? kernel/trace/trace_clock.c:54:13: warning: symbol 'trace_clock' was not declared. Should it be static? kernel/trace/trace_clock.c:74:13: warning: symbol 'trace_clock_global' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-4-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..ace685c70186 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -382,7 +382,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } -ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; void *ret; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 05b176abfd30..b588fd81f7f9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * trace_clock_local(): the simplest and least coherent tracing clock. -- cgit v1.2.3-59-g8ed1b From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:49 -0500 Subject: tracing: add ring_buffer_event_discard() to ring buffer This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard events from the ring buffer, for the event-filtering mechanism introduced in a subsequent patch. I did the initial version but thanks to Steven Rostedt for adding the parts that actually made it work. 
;-) Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 11 +++-- kernel/trace/ring_buffer.c | 117 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 9e6052bd1a1c..e1b7b2173885 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -18,10 +18,13 @@ struct ring_buffer_event { /** * enum ring_buffer_type - internal ring buffer types * - * @RINGBUF_TYPE_PADDING: Left over page padding - * array is ignored - * size is variable depending on how much + * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event + * If time_delta is 0: + * array is ignored + * size is variable depending on how much * padding is needed + * If time_delta is non zero: + * everything else same as RINGBUF_TYPE_DATA * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +void ring_buffer_event_discard(struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..a09027ec1714 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -189,16 +189,65 @@ enum { RB_LEN_TIME_STAMP = 16, }; -/* inline for ring buffer fast paths */ +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} + +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. 
+ */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + static unsigned -rb_event_length(struct ring_buffer_event *event) +rb_event_data_length(struct ring_buffer_event *event) { unsigned length; + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* inline for ring buffer fast paths */ +static unsigned +rb_event_length(struct ring_buffer_event *event) +{ switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. 
+ */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit v1.2.3-59-g8ed1b From 7ce7e4249921d5073e764f7ff7ad83cfa9894bd7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:04 -0500 Subject: tracing: add per-event filtering This patch adds per-event filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each event directory. This file can be written to to set filters; reading from it will display the current set of filters set for that event. Basically, any field listed in the 'format' file for an event can be filtered on (including strings, but not yet other array types) using either matching ('==') or non-matching ('!=') 'predicates'. A 'predicate' can be either a single expression: # echo pid != 0 > filter # cat filter pid != 0 or a compound expression of up to 8 sub-expressions combined using '&&' or '||': # echo comm == Xorg > filter # echo "&& sig != 29" > filter # cat filter comm == Xorg && sig != 29 Only events having field values matching an expression will be available in the trace output; non-matching events are discarded. 
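Putting it together, a typical session looks like the sketch below (this assumes debugfs is mounted at /sys/kernel/debug; the sched_wakeup event and its pid field are only stand-ins, any field listed in the event's 'format' file works the same way):

  # cd /sys/kernel/debug/tracing/events/sched/sched_wakeup
  # echo 1 > enable
  # echo pid != 0 > filter
  # cat filter
  pid != 0
  # cat /sys/kernel/debug/tracing/trace

Only sched_wakeup records whose pid is non-zero show up in the trace output; everything else is dropped by the filter.
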
Note that a compound expression is built up by echoing each sub-expression separately - it's not the most efficient way to do things, but it keeps the parser simple and assumes that compound expressions will be relatively uncommon. In any case, a subsequent patch introducing a way to set filters for entire subsystems should mitigate any need to do this for lots of events. Setting a filter without an '&&' or '||' clears the previous filter completely and sets the filter to the new expression: # cat filter comm == Xorg && sig != 29 # echo comm != Xorg # cat filter comm != Xorg To clear a filter, echo 0 to the filter file: # echo 0 > filter # cat filter none The limit of 8 predicates for a compound expression is arbitrary - for efficiency, it's implemented as an array of pointers to predicates, and 8 seemed more than enough for any filter... Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710665.7703.48.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 28 ++++ kernel/trace/trace_events.c | 77 +++++++++ kernel/trace/trace_events_filter.c | 326 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 4 + 5 files changed, 436 insertions(+) create mode 100644 kernel/trace/trace_events_filter.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0e45c206c2f9..2630f5121ec1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9288dc7ad14f..d9eb39e4bb38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -795,6 +795,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -803,8 +804,35 @@ struct ftrace_event_call { #endif }; +#define MAX_FILTER_PRED 8 + +struct filter_pred; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + char *str_val; + int str_len; + char *field_name; + int offset; + int not; + int or; + int compound; + int clear; +}; + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern void filter_free_pred(struct filter_pred *pred); +extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern int filter_parse(char **pbuf, struct filter_pred *pred); +extern int filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred); +extern void filter_free_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 961b057da28b..97470c48956e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -459,6 +459,71 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static ssize_t +event_filter_read(struct file *filp, char 
__user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(call->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_preds(call); + return cnt; + } + + if (filter_add_pred(call, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -504,6 +569,12 @@ static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, }; +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -619,6 +690,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } + entry = debugfs_create_file("filter", 0444, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 000000000000..8e8c5fa25be9 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,326 @@ +/* + * trace_events_filter - generic event filtering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) 2009 Tom Zanussi + */ + +#include +#include +#include +#include + +#include "trace.h" + +static int filter_pred_64(struct filter_pred *pred, void *event) +{ + u64 *addr = (u64 *)(event + pred->offset); + u64 val = (u64)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_32(struct filter_pred *pred, void *event) +{ + u32 *addr = (u32 *)(event + pred->offset); + u32 val = (u32)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_16(struct filter_pred *pred, void *event) +{ + u16 *addr = (u16 *)(event + pred->offset); + u16 val = (u16)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_8(struct filter_pred *pred, void *event) +{ + u8 *addr = (u8 *)(event + pred->offset); + u8 val = (u8)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) +{ + int i, matched, and_failed = 0; + struct filter_pred *pred; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; + } else + break; + } + + if (and_failed) + return 0; + + return 1; +} + +int filter_print_preds(struct filter_pred **preds, char *buf) +{ + ssize_t this_len = 0; + char *field_name; + struct filter_pred *pred; + int i; + + if (!preds) { + this_len += sprintf(buf + this_len, "none\n"); + return this_len; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (preds[i]) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + this_len += sprintf(buf + this_len, + pred->or ? "|| " : "&& "); + this_len += sprintf(buf + this_len, + "%s ", field_name); + this_len += sprintf(buf + this_len, + pred->not ? 
"!= " : "== "); + if (pred->str_val) + this_len += sprintf(buf + this_len, + "%s\n", pred->str_val); + else + this_len += sprintf(buf + this_len, + "%llu\n", pred->val); + } else + break; + } + + return this_len; +} + +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *entry, *tmp; + + list_for_each_safe(entry, tmp, &call->fields) { + field = list_entry(entry, struct ftrace_event_field, link); + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +void filter_free_pred(struct filter_pred *pred) +{ + if (!pred) + return; + + kfree(pred->field_name); + kfree(pred->str_val); + kfree(pred); +} + +void filter_free_preds(struct ftrace_event_call *call) +{ + int i; + + if (call->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(call->preds[i]); + kfree(call->preds); + call->preds = NULL; + } +} + +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) +{ + int i; + + if (call->preds && !pred->compound) + filter_free_preds(call); + + if (!call->preds) { + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!call->preds[i]) { + call->preds[i] = pred; + return 0; + } + } + + return -ENOMEM; +} + +static int is_string_field(const char *type) +{ + if (strchr(type, '[') && strstr(type, "char")) + return 1; + + return 0; +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + struct ftrace_event_field *field; + + field = find_event_field(call, pred->field_name); + if (!field) + return -EINVAL; + + pred->offset = field->offset; + + if (is_string_field(field->type)) { + pred->fn = filter_pred_string; + pred->str_len = field->size; + return __filter_add_pred(call, pred); + } + + switch (field->size) { + case 8: + pred->fn = filter_pred_64; + break; + case 4: + pred->fn = filter_pred_32; + break; + case 2: + pred->fn = filter_pred_16; + break; + case 1: + pred->fn = filter_pred_8; + break; + default: + return -EINVAL; + } + + return __filter_add_pred(call, pred); +} + +int filter_parse(char **pbuf, struct filter_pred *pred) +{ + char *tmp, *tok, *val_str = NULL; + int tok_n = 0; + + /* field ==/!= number, or/and field ==/!= number, number */ + while ((tok = strsep(pbuf, " \n"))) { + if (tok_n == 0) { + if (!strcmp(tok, "0")) { + pred->clear = 1; + return 0; + } else if (!strcmp(tok, "&&")) { + pred->or = 0; + pred->compound = 1; + } else if (!strcmp(tok, "||")) { + pred->or = 1; + pred->compound = 1; + } else + pred->field_name = tok; + tok_n = 1; + continue; + } + if (tok_n == 1) { + if (!pred->field_name) + pred->field_name = tok; + else if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + tok_n = 2; + continue; + } + if (tok_n == 2) { + if (pred->compound) { + if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + } else { + val_str = tok; + break; /* done */ + } + tok_n = 3; + continue; + } + if (tok_n == 3) { + val_str = tok; + break; /* done */ + } + } + + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!pred->field_name) + return -ENOMEM; + + pred->val = simple_strtoull(val_str, &tmp, 10); + if (tmp == val_str) { + pred->str_val = kstrdup(val_str, GFP_KERNEL); + if (!pred->str_val) + 
return -ENOMEM; + } + + return 0; +} + + diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 468938f70141..ebf215e87d5e 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -222,6 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + \ + if (call->preds && !filter_match_preds(call, entry)) \ + ring_buffer_event_discard(event); \ } \ \ static int ftrace_raw_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From cfb180f3e71b2a280a254c8646a9ab1beab63f84 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:17 -0500 Subject: tracing: add per-subsystem filtering This patch adds per-subsystem filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each subsystem directory. This file can be written to to set filters; reading from it will display the current set of filters set for that subsystem. Basically what it does is propagate the filter down to each event contained in the subsystem. If a particular event doesn't have a field with the name specified in the filter, it simply doesn't get set for that event. You can verify whether or not the filter was set for a particular event by looking at the filter file for that event. As with per-event filters, compound expressions are supported, echoing '0' to the subsystem's filter file clears all filters in the subsystem, etc. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710677.7703.49.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 15 +++++++ kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++++--- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d9eb39e4bb38..f267723c3c52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,18 @@ struct ftrace_event_call { #endif }; +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; + struct filter_pred **preds; +}; + +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + #define MAX_FILTER_PRED 8 struct filter_pred; @@ -832,6 +844,9 @@ extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_free_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97470c48956e..97d4daaddd9a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -524,6 +524,71 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t 
+subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(system->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_subsystem_preds(system); + return cnt; + } + + if (filter_add_subsystem_pred(system, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -575,6 +640,12 @@ static const struct file_operations ftrace_event_filter_fops = { .write = event_filter_write, }; +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = tracing_open_generic, + .read = subsystem_filter_read, + .write = subsystem_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -595,18 +666,13 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -struct event_subsystem { - struct list_head list; - const char *name; - struct dentry *entry; -}; - static LIST_HEAD(event_subsystems); static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -633,6 +699,14 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->name = name; list_add(&system->list, &event_subsystems); + system->preds = NULL; + + entry = debugfs_create_file("filter", 0444, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8e8c5fa25be9..1ab20cee0e4c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -181,6 +181,27 @@ void filter_free_preds(struct ftrace_event_call *call) } } +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call = __start_ftrace_events; + int i; + + if (system->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(system->preds[i]); + kfree(system->preds); + system->preds = NULL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) + filter_free_preds(call); + } +} + static int __filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { @@ -250,6 +271,65 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return __filter_add_pred(call, pred); } +static struct filter_pred *copy_pred(struct filter_pred 
*pred) +{ + struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); + if (!new_pred) + return NULL; + + memcpy(new_pred, pred, sizeof(*pred)); + if (pred->str_val) { + new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->str_val) { + kfree(new_pred); + return NULL; + } + } + + return new_pred; +} + +int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred) +{ + struct ftrace_event_call *call = __start_ftrace_events; + struct filter_pred *event_pred; + int i; + + if (system->preds && !pred->compound) + filter_free_subsystem_preds(system); + + if (!system->preds) { + system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!system->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!system->preds[i]) { + system->preds[i] = pred; + break; + } + if (i == MAX_FILTER_PRED - 1) + return -EINVAL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) { + event_pred = copy_pred(pred); + if (event_pred) + filter_add_pred(call, event_pred); + } + } + + return 0; +} + int filter_parse(char **pbuf, struct filter_pred *pred) { char *tmp, *tok, *val_str = NULL; -- cgit v1.2.3-59-g8ed1b From fe9f57f250ab4d781b99504caeb218ca2db14c1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Mar 2009 18:41:59 +0100 Subject: tracing: add run-time field descriptions for event filtering, kfree fix Impact: fix potential kfree of random data in (rare) failure path Zero-initialize the field structure. Reported-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97d4daaddd9a..594d78aaa185 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,26 +24,31 @@ int trace_define_field(struct ftrace_event_call *call, char *type, { struct ftrace_event_field *field; - field = kmalloc(sizeof(*field), GFP_KERNEL); + field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) goto err; + field->name = kstrdup(name, GFP_KERNEL); if (!field->name) goto err; + field->type = kstrdup(type, GFP_KERNEL); if (!field->type) goto err; + field->offset = offset; field->size = size; list_add(&field->link, &call->fields); return 0; + err: if (field) { kfree(field->name); kfree(field->type); } kfree(field); + return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 9bd7d099ab3f10dd666da399c064999bae427cd9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:43 +0100 Subject: tracing/events: make the filter files writable We need the filter files to be writable, the current filter file permissions are only set readable. 
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <1237759847-21025-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 594d78aaa185..19f61dd23219 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -706,7 +706,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0444, system->entry, system, + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); if (!entry) pr_warning("Could not create debugfs " @@ -769,7 +769,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - entry = debugfs_create_file("filter", 0444, call->dir, call, + entry = debugfs_create_file("filter", 0644, call->dir, call, &ftrace_event_filter_fops); if (!entry) pr_warning("Could not create debugfs " -- cgit v1.2.3-59-g8ed1b From 07edf7121374609709ef1b0889f6e7b8d6a62ec1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:46 +0100 Subject: tracing/events: don't use wake up for events Impact: fix hard-lockup with sched switch events Some ftrace events, such as sched wakeup, can be traced while the runqueue lock is hold. Since they are using trace_current_buffer_unlock_commit(), they call wake_up() which can try to grab the runqueue lock too, resulting in a deadlock. Now for all event, we call a new helper: trace_nowake_buffer_unlock_commit() which do pretty the same than trace_current_buffer_unlock_commit() except than it doesn't call trace_wake_up(). Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 +++++++++++++++++++++----- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_stage_3.h | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..6bad12819eb6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -860,15 +860,25 @@ static void ftrace_trace_stack(struct trace_array *tr, static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static inline void __trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); - trace_wake_up(); + + if (wake) + trace_wake_up(); +} + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(tr, event, flags, pc, 1); } struct ring_buffer_event * @@ -882,7 +892,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return trace_buffer_unlock_commit(&global_trace, event, flags, pc); + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); +} + +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event 
*event, + unsigned long flags, int pc) +{ + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f267723c3c52..54fd9bcd0a65 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -483,6 +483,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ebf215e87d5e..9a3bd49b52e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,7 +222,7 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ -- cgit v1.2.3-59-g8ed1b From 7e6ea92df3fd7cbe74e7985c6f3e40255c44b201 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:47 +0100 Subject: tracing/ftrace: make nop-tracer use polling wait for events on pipe Impact: display events when they arrive Now that the events don't use wake_up() anymore, we need the nop tracer to poll waiting for events on the pipe. Especially because nop is useful to look at orphan traces types (traces types that don't rely on specific tracers) because it doesn't produce traces itself. And unlike other tracers that trigger specific traces periodically, nop triggers no traces by itself that can wake him. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_nop.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 9aa84bde23cd..394f94417e2f 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif -- cgit v1.2.3-59-g8ed1b From b118415bfad6d75792a85ac999e25149db8e6919 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 00:18:39 +0100 Subject: tracing/events: don't discard an event after commit When we want to filter an event, the filter test is done after the event is commited to the ring-buffer to be discarded later if needed. But a reader could be reading this event while we are trying to discard it. Other kind of racy events can even happen because the event is commited and can be read and/or consumed. What we want is to discard the event before committing it. 
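Expanded out of the generated TRACE_EVENT code, the intended order of operations is roughly the sketch below. event_myevent and struct ftrace_raw_myevent are hypothetical stand-ins for a generated event; the helpers are the ones introduced in this series:

	struct ftrace_event_call *call = &event_myevent;
	struct ring_buffer_event *event;
	struct ftrace_raw_myevent *entry;
	unsigned long irq_flags;
	int pc;

	local_save_flags(irq_flags);
	pc = preempt_count();

	event = trace_current_buffer_lock_reserve(call->id, sizeof(*entry),
						   irq_flags, pc);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->pid = current->pid;		/* example field assignment */

	/*
	 * Discard *before* commit: the record is turned into padding,
	 * so a reader can never race with a half-dropped event.
	 */
	if (call->preds && !filter_match_preds(call, entry))
		ring_buffer_event_discard(event);

	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
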
Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237763919-21505-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_3.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9a3bd49b52e5..9d2fa78cecca 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,10 +222,11 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ + \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + \ } \ \ static int ftrace_raw_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From 75c8b417526529d0a7072e4d93ec99dbd483a6f4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:28 -0500 Subject: tracing/filters: use list_for_each_entry_safe Impact: cleanup Use list_for_each_entry_safe instead of list_for_each_entry in find_event_field(). Reported-by: Frederic Weisbecker Signed-off-by: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <1237796788.7527.35.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1ab20cee0e4c..c4a413b70f78 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,11 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field; - struct list_head *entry, *tmp; + struct ftrace_event_field *field, *next; - list_for_each_safe(entry, tmp, &call->fields) { - field = list_entry(entry, struct ftrace_event_field, link); + list_for_each_entry_safe(field, next, &call->fields, link) { if (!strcmp(field->name, name)) return field; } -- cgit v1.2.3-59-g8ed1b From ee6cdabc820a29bd607f38d9cb335c3ceddc673b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:42 -0500 Subject: tracing/filters: fix bug in copy_pred() Impact: fix potential crash on subsystem filter expression freeing When making a copy of the predicate, pred->field_name needs to be duplicated in the copy as well, otherwise bad things can happen due to later multiple frees of the same string. This affects only per-subsystem event filtering. 
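To make the failure mode concrete: a shallow copy shares the field_name allocation between both predicates, so freeing them both frees the same string twice (sketch with hypothetical variables orig and copy):

	memcpy(copy, orig, sizeof(*orig));	/* copy->field_name == orig->field_name */

	filter_free_pred(orig);			/* kfree()s the shared field_name string */
	filter_free_pred(copy);			/* frees the same pointer again */

Duplicating field_name (and str_val) with kstrdup() in copy_pred() gives each predicate its own allocations to free.
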
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796802.7527.39.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c4a413b70f78..fd01d8022ad1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -276,11 +276,19 @@ static struct filter_pred *copy_pred(struct filter_pred *pred) return NULL; memcpy(new_pred, pred, sizeof(*pred)); + + if (pred->field_name) { + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->field_name) { + kfree(new_pred); + return NULL; + } + } + if (pred->str_val) { new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!new_pred->str_val) { - kfree(new_pred); + filter_free_pred(new_pred); return NULL; } } -- cgit v1.2.3-59-g8ed1b From c4cff064be678f1e8344d907499f2a81282edc19 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:48 -0500 Subject: tracing/filters: clean up filter_add_subsystem_pred() Impact: cleanup, memory leak fix This patch cleans up filter_add_subsystem_pred(): - searches for the field before creating a copy of the pred - fixes memory leak in the case a predicate isn't applied - if -ENOMEM, makes sure there's no longer a reference to the pred so the caller can free the half-finished filter - changes the confusing i == MAX_FILTER_PRED - 1 comparison previously remarked upon This affects only per-subsystem event filtering. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796808.7527.40.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 19f61dd23219..fdab30d6c835 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -585,6 +585,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (filter_add_subsystem_pred(system, pred)) { + filter_free_subsystem_preds(system); filter_free_pred(pred); return -EINVAL; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index fd01d8022ad1..4117c2ebc245 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -318,22 +318,39 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[i] = pred; break; } - if (i == MAX_FILTER_PRED - 1) - return -EINVAL; } + if (i == MAX_FILTER_PRED) + return -EINVAL; + events_for_each(call) { + int err; + if (!call->name || !call->regfunc) continue; - if (!strcmp(call->system, system->name)) { - event_pred = copy_pred(pred); - if (event_pred) - filter_add_pred(call, event_pred); - } + if (strcmp(call->system, system->name)) + continue; + + if (!find_event_field(call, pred->field_name)) + continue; + + event_pred = copy_pred(pred); + if (!event_pred) + goto oom; + + err = filter_add_pred(call, event_pred); + if (err) + filter_free_pred(event_pred); + if (err == -ENOMEM) + goto oom; } return 0; + +oom: + system->preds[i] = NULL; + return -ENOMEM; } int filter_parse(char **pbuf, struct filter_pred *pred) -- cgit 
v1.2.3-59-g8ed1b From 3e1f60b80cafcb5d7e8d3665b35962fbb8fb9efa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:45 +0100 Subject: tracing/ftrace: check if debugfs is registered before creating files Impact: fix a crash with ftrace={nop,boot} parameter If the nop or initcall tracers are launched as boot tracers, they will attempt to create their option directory and files. But these tracers are registered very early and then assigned as "boot tracers" very early if asked to. Since they do this before debugfs has been registered (core initcall), a crash is triggered. Another early tracers could also come later. So we fix it by checking if debugfs is initialized before creating the root tracing directory. Signed-off-by: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ace685c70186..f0e1337b1ebd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3513,6 +3513,9 @@ struct dentry *tracing_init_dentry(void) if (d_tracer) return d_tracer; + if (!debugfs_initialized()) + return NULL; + d_tracer = debugfs_create_dir("tracing", NULL); if (!d_tracer && !once) { -- cgit v1.2.3-59-g8ed1b From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 24 Mar 2009 01:07:24 +0300 Subject: tracing: Fix TRACING_SUPPORT dependency for PPC32 commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"), despite the "clean up" in its purpose, introduced a behavioural change for Kconfig symbols: we no longer able to select tracing support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented). The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core has a special case for platforms w/o irqflags (which, by the way, has become useless as of the commit above). Though according to Ingo Molnar, there was periodic build failures on weird, unmaintained architectures that had no irqflags-tracing support and hence didn't know the raw_irqs_save/restore primitives. Thus we'd better not enable irqflags-less tracing for all architectures. This patch restores the old behaviour for PPC32, and thus brings the tracing back. Other architectures can either add themselves to the exception list or (better) implement TRACE_IRQFLAGS_SUPPORT. Signed-off-by: Anton Vorontsov Acked-b: Steven Rostedt Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b0a46f889659..8a4d72931042 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -63,7 +63,11 @@ config TRACING # config TRACING_SUPPORT bool - depends on TRACE_IRQFLAGS_SUPPORT + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. 
+ depends on TRACE_IRQFLAGS_SUPPORT || PPC32 depends on STACKTRACE_SUPPORT default y -- cgit v1.2.3-59-g8ed1b From 1618536961d31f9b3f55767b22d4a897f4204c26 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 22:17:01 +0100 Subject: tracing/function-graph-tracer: fix functions call traces imbalance Impact: fix traces output Sometimes one can observe an imbalance in the traces between function calls and function return traces: func1() { } } The curly brace inside func1() is the return of another function nested inside func1. The return trace have been inserted in the buffer but not the entry. We are storing a return address on the function traces stack while we haven't inserted its entry on the buffer, hence the imbalance on the traces. This is because the tracers doesn't check all failures that can happen on buffer insertion. This patch reports the tracing recursion failures and the ring buffer failures. In such cases, we now restore the original return address for the function, giving up its return trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237843021-11695-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6bad12819eb6..89f0c2544ad0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -924,7 +924,7 @@ trace_function(struct trace_array *tr, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_graph_entry(struct trace_array *tr, +static int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -933,15 +933,17 @@ static void __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; + return 0; event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); + + return 1; } static void __trace_graph_return(struct trace_array *tr, @@ -1162,6 +1164,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array_cpu *data; unsigned long flags; long disabled; + int ret; int cpu; int pc; @@ -1177,15 +1180,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) set_tsk_trace_graph(current); + atomic_dec(&data->disabled); local_irq_restore(flags); - return 1; + return ret; } void trace_graph_return(struct ftrace_graph_ret *trace) -- cgit v1.2.3-59-g8ed1b From 1fc2d5c11918082536acf261ce6abb1f5511053f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:01 -0500 Subject: tracing/filters: use list_for_each_entry Impact: cleanup No need to use the safe version here, so use list_for_each_entry instead of list_for_each_entry_safe in find_event_field(). 
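For reference, the _safe variant is only needed when the loop body removes the entry it is currently visiting; a pure lookup leaves the list untouched. A sketch (field and next are struct ftrace_event_field pointers; the teardown loop is hypothetical, shown only for contrast):

	/* Lookup: nothing is removed, plain iteration is sufficient. */
	list_for_each_entry(field, &call->fields, link)
		if (!strcmp(field->name, name))
			return field;

	/* Teardown (hypothetical): entries are freed while walking, so the
	 * next pointer must be saved before the current entry is freed. */
	list_for_each_entry_safe(field, next, &call->fields, link) {
		list_del(&field->link);
		kfree(field->name);
		kfree(field->type);
		kfree(field);
	}
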
Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878841.8339.57.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4117c2ebc245..3f0b79f8a4bc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,9 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; - list_for_each_entry_safe(field, next, &call->fields, link) { + list_for_each_entry(field, &call->fields, link) { if (!strcmp(field->name, name)) return field; } -- cgit v1.2.3-59-g8ed1b From 09f1f245c79585383de63e3ca54d0f91824bff3a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:11 -0500 Subject: tracing/filters: free pred when clearing filters Impact: fix (small) per trace filter modification memory leak Free the current pred when clearing the filters via the filter files. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878851.8339.58.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fdab30d6c835..a9381384aa9e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -516,6 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_preds(call); + filter_free_pred(pred); return cnt; } @@ -581,6 +582,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_subsystem_preds(system); + filter_free_pred(pred); return cnt; } -- cgit v1.2.3-59-g8ed1b From 4bda2d517bfa3ce3d7044e06988cdddae7adffe2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:31 -0500 Subject: tracing/filters: use trace_seq_printf() to print filters Impact: cleanup Instead of just using the trace_seq buffer to print the filters, use trace_seq_printf() as it was intended to be used. 
Reported-by: Steven Rostedt Signed-off-by: Tom Zanussi Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878871.8339.59.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events.c | 8 ++++---- kernel/trace/trace_events_filter.c | 25 +++++++++---------------- 3 files changed, 15 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54fd9bcd0a65..90a848debcba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -840,7 +840,8 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); extern void filter_free_pred(struct filter_pred *pred); -extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern void filter_print_preds(struct filter_pred **preds, + struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a9381384aa9e..d132997ab756 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,8 +481,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(call->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(call->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -547,8 +547,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(system->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(system->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3f0b79f8a4bc..9fca8bb1c06b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -24,6 +24,7 @@ #include #include "trace.h" +#include "trace_output.h" static int filter_pred_64(struct filter_pred *pred, void *event) { @@ -108,16 +109,15 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -int filter_print_preds(struct filter_pred **preds, char *buf) +void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) { - ssize_t this_len = 0; char *field_name; struct filter_pred *pred; int i; if (!preds) { - this_len += sprintf(buf + this_len, "none\n"); - return this_len; + trace_seq_printf(s, "none\n"); + return; } for (i = 0; i < MAX_FILTER_PRED; i++) { @@ -125,23 +125,16 @@ int filter_print_preds(struct filter_pred **preds, char *buf) pred = preds[i]; field_name = pred->field_name; if (i) - this_len += sprintf(buf + this_len, - pred->or ? "|| " : "&& "); - this_len += sprintf(buf + this_len, - "%s ", field_name); - this_len += sprintf(buf + this_len, - pred->not ? "!= " : "== "); + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? 
"!= " : "== "); if (pred->str_val) - this_len += sprintf(buf + this_len, - "%s\n", pred->str_val); + trace_seq_printf(s, "%s\n", pred->str_val); else - this_len += sprintf(buf + this_len, - "%llu\n", pred->val); + trace_seq_printf(s, "%llu\n", pred->val); } else break; } - - return this_len; } static struct ftrace_event_field * -- cgit v1.2.3-59-g8ed1b From 9f58a159d022c8f2533a27708aa267adf4f0e3ce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:42 -0500 Subject: tracing/filters: disallow integer values for string filters and vice versa Impact: fix filter use boundary condition / crash Make sure filters for string fields don't use integer values and vice versa. Getting it wrong can crash the system or produce bogus results. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9fca8bb1c06b..026be412f356 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -237,9 +237,14 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) pred->offset = field->offset; if (is_string_field(field->type)) { + if (!pred->str_val) + return -EINVAL; pred->fn = filter_pred_string; pred->str_len = field->size; return __filter_add_pred(call, pred); + } else { + if (pred->str_val) + return -EINVAL; } switch (field->size) { -- cgit v1.2.3-59-g8ed1b From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Mar 2009 18:28:15 +0100 Subject: genirq: add threaded interrupt handler support Add support for threaded interrupt handlers: A device driver can request that its main interrupt handler runs in a thread. To achive this the device driver requests the interrupt with request_threaded_irq() and provides additionally to the handler a thread function. The handler function is called in hard interrupt context and needs to check whether the interrupt originated from the device. If the interrupt originated from the device then the handler can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is returned when no further action is required. IRQ_WAKE_THREAD causes the genirq code to invoke the threaded (main) handler. When IRQ_WAKE_THREAD is returned handler must have disabled the interrupt on the device level. This is mandatory for shared interrupt handlers, but we need to do it as well for obscure x86 hardware where disabling an interrupt on the IO_APIC level redirects the interrupt to the legacy PIC interrupt lines. 
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- include/linux/interrupt.h | 37 ++++++++- include/linux/irq.h | 5 ++ include/linux/irqreturn.h | 2 + include/linux/sched.h | 5 ++ kernel/exit.c | 2 + kernel/irq/handle.c | 31 +++++++- kernel/irq/manage.c | 192 ++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 259 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..2dfaadbdb2ac 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -105,7 +105,7 @@ # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) extern void synchronize_irq(unsigned int irq); #else # define synchronize_irq(irq) barrier() diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0c9cb63e6895..6fc2b720c231 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -59,6 +59,16 @@ #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, +}; + typedef irqreturn_t (*irq_handler_t)(int, void *); /** @@ -71,6 +81,9 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @dir: pointer to the proc/irq/NN/name entry + * @thread_fn: interupt handler function for threaded interrupts + * @thread: thread pointer for threaded interrupts + * @thread_flags: flags related to @thread */ struct irqaction { irq_handler_t handler; @@ -81,11 +94,31 @@ struct irqaction { struct irqaction *next; int irq; struct proc_dir_entry *dir; + irq_handler_t thread_fn; + struct task_struct *thread; + unsigned long thread_flags; }; extern irqreturn_t no_action(int cpl, void *dev_id); -extern int __must_check request_irq(unsigned int, irq_handler_t handler, - unsigned long, const char *, void *); + +extern int __must_check +request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev); + +static inline int __must_check +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *dev) +{ + return request_threaded_irq(irq, handler, NULL, flags, name, dev); +} + +#ifdef CONFIG_GENERIC_HARDIRQS +extern void exit_irq_thread(void); +#else +static inline void exit_irq_thread(void) { } +#endif + extern void free_irq(unsigned int, void *); struct device; diff --git a/include/linux/irq.h b/include/linux/irq.h index 873e4ac11b81..8b1cf0630210 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -155,6 +156,8 @@ struct irq_2_iommu; * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing * @pending_mask: pending rebalanced interrupts + * @threads_active: number of irqaction threads currently running + * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -186,6 +189,8 @@ struct irq_desc { cpumask_var_t pending_mask; #endif #endif + atomic_t threads_active; + wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif diff --git a/include/linux/irqreturn.h 
b/include/linux/irqreturn.h index c5584ca5b8c9..819acaaac3f5 100644 --- a/include/linux/irqreturn.h +++ b/include/linux/irqreturn.h @@ -5,10 +5,12 @@ * enum irqreturn * @IRQ_NONE interrupt was not from this device * @IRQ_HANDLED interrupt was handled by this device + * @IRQ_WAKE_THREAD handler requests to wake the handler thread */ enum irqreturn { IRQ_NONE, IRQ_HANDLED, + IRQ_WAKE_THREAD, }; typedef enum irqreturn irqreturn_t; diff --git a/include/linux/sched.h b/include/linux/sched.h index 46d680643f89..38b77b0f56e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,11 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; +#ifdef CONFIG_GENERIC_HARDIRQS + /* IRQ handler threads */ + struct irqaction *irqaction; +#endif + /* Protection of the PI data structures: */ spinlock_t pi_lock; diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ca0b3488c4a9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code) schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9ebf77968871..fe8f45374e86 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { ret = action->handler(irq, action->dev_id); - if (ret == IRQ_HANDLED) + + switch (ret) { + case IRQ_WAKE_THREAD: + /* + * Wake up the handler thread for this + * action. In case the thread crashed and was + * killed we just pretend that we handled the + * interrupt. The hardirq handler above has + * disabled the device interrupt, so no irq + * storm is lurking. + */ + if (likely(!test_bit(IRQTF_DIED, + &action->thread_flags))) { + set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + wake_up_process(action->thread); + } + + /* + * Set it to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + /* Fall through to add to randomness */ + case IRQ_HANDLED: status |= action->flags; + break; + + default: + break; + } + retval |= ret; action = action->next; } while (action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99984c0..a4c1ab86cd25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,16 +8,15 @@ */ #include +#include #include #include #include #include +#include #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -cpumask_var_t irq_default_affinity; - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. 
+ */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } EXPORT_SYMBOL(synchronize_irq); +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq) return 1; } +static void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_cpus_allowed_ptr(action->thread, cpumask); + action = action->next; + } +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity @@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif + irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); + if (!ret) + irq_set_thread_affinity(desc, desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +static inline int irq_thread_should_run(struct irqaction *action) +{ + return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); +} + +static int irq_wait_for_interrupt(struct irqaction *action) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (irq_thread_should_run(action)) { + __set_current_state(TASK_RUNNING); + return 0; + } else + schedule(); + } + return -1; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + int wake; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + atomic_inc(&desc->threads_active); + + spin_lock_irq(&desc->lock); + if (unlikely(desc->status & IRQ_DISABLED)) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQ_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->status |= IRQ_PENDING; + spin_unlock_irq(&desc->lock); + } else { + spin_unlock_irq(&desc->lock); + + action->thread_fn(action->irq, action->dev_id); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana. + */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. 
+ */ + set_bit(IRQTF_DIED, &tsk->irqaction->flags); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -419,6 +529,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* + * Threaded handler ? + */ + if (new->thread_fn) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + wake_up_process(t); + } + /* * The following block of code has to be executed atomically */ @@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); + init_waitqueue_head(&desc->wait_for_threads); + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, irq, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } + if (ret) + goto out_thread; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -532,8 +662,19 @@ mismatch: dump_stack(); } #endif + ret = -EBUSY; + +out_thread: spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; } /** @@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; + struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + + irqthread = action->thread; + action->thread = NULL; + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); + if (irqthread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(irqthread); + put_task_struct(irqthread); + } + #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id) EXPORT_SYMBOL(free_irq); /** - * request_irq - allocate an interrupt line + * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq); * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. 
* + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler ist + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. If yes it + * needs to disable the interrupt on the device and return + * IRQ_THREAD_WAKE which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. @@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq); * IRQF_TRIGGER_* Specify active edge(s) or level * */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; + action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; @@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler, #endif return retval; } -EXPORT_SYMBOL(request_irq); +EXPORT_SYMBOL(request_threaded_irq); -- cgit v1.2.3-59-g8ed1b From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 23 Mar 2009 18:28:16 +0100 Subject: genirq: add support for threaded interrupts to devres Some devices use devres_request_irq() for to install their interrupt handler. Add support for threaded interrupts to devres as well. 
[tglx - simplified and adapted to latest threadirq version] Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 17 ++++++++++++++--- kernel/irq/devres.c | 16 ++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6fc2b720c231..dbf6a6fd116c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,9 +123,20 @@ extern void free_irq(unsigned int, void *); struct device; -extern int __must_check devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id); +extern int __must_check +devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id); + +static inline int __must_check +devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +{ + return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, + devname, dev_id); +} + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 38a25b8d8bff..d06df9c41cba 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data) } /** - * devm_request_irq - allocate an interrupt line for a managed device + * devm_request_threaded_irq - allocate an interrupt line for a managed device * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * If an IRQ allocated with this function needs to be freed * separately, dev_free_irq() must be used. */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) { struct irq_devres *dr; int rc; @@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; - rc = request_irq(irq, handler, irqflags, devname, dev_id); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); if (rc) { devres_free(dr); return rc; @@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_irq); +EXPORT_SYMBOL(devm_request_threaded_irq); /** * devm_free_irq - free an interrupt -- cgit v1.2.3-59-g8ed1b From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 11:46:22 +0100 Subject: genirq: threaded irq handlers review fixups Delta patch to address the review comments. 
- Implement warning when IRQ_WAKE_THREAD is requested and no thread handler installed - coding style fixes Pointed-out-by: Christoph Hellwig Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 2 ++ kernel/irq/handle.c | 29 ++++++++++++++++++++++++----- kernel/irq/manage.c | 17 +++++++---------- 3 files changed, 33 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index dbf6a6fd116c..266a5f5f57cc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -63,10 +63,12 @@ * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, + IRQTF_WARNED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fe8f45374e86..38b49a9e508a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.", irq, action->name); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -360,6 +369,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) switch (ret) { case IRQ_WAKE_THREAD: + /* + * Set result to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + /* * Wake up the handler thread for this * action. In case the thread crashed and was @@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) wake_up_process(action->thread); } - /* - * Set it to handled so the spurious check - * does not trigger. - */ - ret = IRQ_HANDLED; /* Fall through to add to randomness */ case IRQ_HANDLED: status |= action->flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4c1ab86cd25..a3eb7baf1e46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } -static inline int irq_thread_should_run(struct irqaction *action) -{ - return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); -} - static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (irq_thread_should_run(action)) { + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; - } else - schedule(); + } + schedule(); } return -1; } @@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. 
* Primary handler for threaded interrupts - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function -- cgit v1.2.3-59-g8ed1b From e4955c9986a27bb47ddeb6cd55803053f547e2e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:04:37 +0800 Subject: blktrace: mark ddir_act[] const Impact: cleanup ddir_act and what2act always stay immutable. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89415.5080503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 108f4f7715a5..1ffcbd4ac45b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,8 +147,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ @@ -1116,10 +1116,10 @@ static void blk_tracer_reset(struct trace_array *tr) blk_tracer_stop(tr); } -static struct { +static const struct { const char *act[2]; int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { +} what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, -- cgit v1.2.3-59-g8ed1b From 65796348e09880e12b97267d39b8857c758440a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:06 +0800 Subject: blktrace: fix wrong calculation of RWBS Impact: fix the output of IO type category characters Trace categories are the upper 16 bits, not the lower 16 bits. 
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89432.8010805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1ffcbd4ac45b..9af41430ee54 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -922,23 +922,24 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; + int tc = t->action >> BLK_TC_SHIFT; - if (t->action & BLK_TC_DISCARD) + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) + if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) + if (tc & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) + if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) + if (tc & BLK_TC_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit v1.2.3-59-g8ed1b From e0dc81bec0927fa0c8aabc521793161909eef7a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:51 +0800 Subject: blktrace: fix t_error() Impact: fix error flag output t_error() should return t->error but not t->sector. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8945F.5020802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9af41430ee54..f69f8bd8bef9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -968,7 +968,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent) static inline __u16 t_error(const struct trace_entry *ent) { - return te_blk_io_trace(ent)->sector; + return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent) -- cgit v1.2.3-59-g8ed1b From 093419971e03362a00f499960557c119982ea09f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 17:43:30 +0800 Subject: blktrace: print human-readable act_mask Impact: new feature, allow symbolic values in /debug/tracing/act_mask Print stringified act_mask instead of hex value: # cat act_mask read,write,barrier,sync,queue,requeue,issue,complete,fs,pc,ahead,meta, discard,drv_data # echo "meta,write" > act_mask # cat act_mask write,meta Also: - make act_mask accept "ahead", "meta", "discard" and "drv_data" - use strsep() instead of strchr() to parse user input - return -EINVAL if a token is not found in the mask map - fix a bug that 'value' is unsigned, so it can < 0 - propagate error value of blk_trace_mask2str() to userspace, but not always return -ENXIO. 
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8AB42.1000802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 103 ++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f69f8bd8bef9..6fb274f5f34e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1316,53 +1316,77 @@ struct attribute_group blk_trace_attr_group = { .attrs = blk_trace_attrs, }; -static int blk_str2act_mask(const char *str) +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, +}; + +static int blk_trace_str2mask(const char *str) { + int i; int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; + char *s, *token; - if (copy == NULL) + s = kstrdup(str, GFP_KERNEL); + if (s == NULL) return -ENOMEM; - - s = strstrip(copy); + s = strstrip(s); while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) + token = strsep(&s, ","); + if (token == NULL) break; - s = sep + 1; + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } } - kfree(copy); + kfree(s); return mask; } +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? 
"" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + static struct request_queue *blk_trace_get_queue(struct block_device *bdev) { if (bdev->bd_disk == NULL) @@ -1399,7 +1423,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", q->blk_trace->pid); else if (attr == &dev_attr_start_lba) @@ -1424,7 +1448,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct request_queue *q; struct hd_struct *p; u64 value; - ssize_t ret = -ENXIO; + ssize_t ret = -EINVAL; if (count == 0) goto out; @@ -1432,13 +1456,16 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_act_mask) { if (sscanf(buf, "%llx", &value) != 1) { /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) + ret = blk_trace_str2mask(buf); + if (ret < 0) goto out; + value = ret; } } else if (sscanf(buf, "%llu", &value) != 1) goto out; + ret = -ENXIO; + lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); -- cgit v1.2.3-59-g8ed1b From 098335215a4921a8a54193829eaed602dca24df5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 21 Mar 2009 02:44:50 -0400 Subject: tracing: fix memory leak in trace_stat If the function profiler does not have any items recorded and one were to cat the function stat file, the kernel would take a BUG with a NULL pointer dereference. Looking further into this, I found that returning NULL from stat_start did not stop the stat logic, and would later call stat_next. This breaks from the way seq_file works, so I looked into fixing the stat code. This is where I noticed that the last next_entry is never freed. It is allocated, and if the stat_next returns NULL, the code breaks out of the loop, unlocks the mutex and exits. We never link the next_entry nor do we free it. Thus it is a real memory leak. This patch rearranges the code a bit to not only fix the memory leak, but also to act more like seq_file where nothing is printed if there is nothing to print. That is, stat_start returns NULL. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 39310e3434ee..f71b85b22cfe 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; - void *prev_stat; + void *stat; int ret = 0; int i; @@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; + stat = ts->stat_start(); + if (!stat) + goto exit; + /* * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. @@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session) list_add(&new_entry->list, &session->stat_list); - new_entry->stat = ts->stat_start(); - prev_stat = new_entry->stat; + new_entry->stat = stat; /* * Iterate over the tracer stat entries and store them in a sorted * list. 
*/ for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; @@ -114,11 +123,7 @@ static int stat_seq_init(struct tracer_stat_session *session) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = ts->stat_next(prev_stat, i); - - /* End of insertion */ - if (!new_entry->stat) - break; + new_entry->stat = stat; list_for_each_entry(iter_entry, &session->stat_list, list) { @@ -137,8 +142,6 @@ static int stat_seq_init(struct tracer_stat_session *session) break; } } - - prev_stat = new_entry->stat; } exit: mutex_unlock(&session->stat_mutex); -- cgit v1.2.3-59-g8ed1b From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = 
calltime; *depth = index; return 0; -- cgit v1.2.3-59-g8ed1b From 05ce5818adee8f8efd0a5ca0d900a6789012516b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 00:18:31 -0400 Subject: function-graph: prevent more than one tracer registering Impact: prevent crash due to multiple function graph tracers The function graph tracer can currently only handle a single tracer being registered. If another tracer registers with the function graph tracer it can crash the system. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7847806eefef..c81a759fbf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2643,6 +2643,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); + /* we currently allow only one tracer registered at a time */ + if (atomic_read(&ftrace_graph_active)) { + ret = -EBUSY; + goto out; + } + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); -- cgit v1.2.3-59-g8ed1b From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. 
+ */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3-59-g8ed1b From be6f164a02f394675e2ac2077dd354cebef5b4c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 11:06:24 -0400 Subject: function-graph: add option for include sleep times Impact: give user a choice to show times spent while sleeping The user may want to see the time a function spent sleeping. This patch adds the trace option "sleep-time" to allow that. The "sleep-time" option is default on. echo sleep-time > /debug/tracing/trace_options produces: ------------------------------------------ 2) avahi-d-3428 => -0 ------------------------------------------ 2) | finish_task_switch() { 2) 0.621 us | _spin_unlock_irq(); 2) 2.202 us | } 2) ! 1002.197 us | } 2) ! 1003.521 us | } where as, echo nosleep-time > /debug/tracing/trace_options produces: 0) -0 => yum-upd-3416 ------------------------------------------ 0) | finish_task_switch() { 0) 0.643 us | _spin_unlock_irq(); 0) 2.342 us | } 0) + 41.302 us | } 0) + 42.453 us | } Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ kernel/trace/trace.c | 3 ++- kernel/trace/trace.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0b90364d1a2c..02d2de9d08ba 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2599,6 +2599,13 @@ ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, unsigned long long timestamp; int index; + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. 
+ */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + timestamp = trace_clock_local(); prev->ftrace_timestamp = timestamp; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f0e1337b1ebd..67c6a21dd427 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -316,6 +316,7 @@ static const char *trace_options[] = { "context-info", "latency-format", "global-clock", + "sleep-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..d7410bbb9a80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -683,6 +683,7 @@ enum trace_iterator_flags { TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, + TRACE_ITER_SLEEP_TIME = 0x100000, }; /* -- cgit v1.2.3-59-g8ed1b From cc59c9e8d0165c632fd056c4a23e36f917507fb4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 11:03:01 +0800 Subject: ftrace: show virtual PID Impact: fix PID output under namespaces When current namespace is not the global namespace, pid read from set_ftrace_pid is no correct. # ~/newpid_namespace_run bash # echo $$ 1 # echo 1 > set_ftrace_pid # cat set_ftrace_pid 3756 Since we write virtual PID to set_ftrace_pid, we need get virtual PID when we read it. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C84D65.9050606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02d2de9d08ba..bb377112b1bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2264,7 +2264,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf, if (ftrace_pid_trace == ftrace_swapper_pid) r = sprintf(buf, "swapper tasks\n"); else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); -- cgit v1.2.3-59-g8ed1b From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 13:38:06 +0800 Subject: tracing: use union for multi-usages field Impact: cleanup struct dyn_ftrace::ip has different usages in his lifecycle, we use union for it. And also for struct dyn_ftrace::flags. 
Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C871BE.3080405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 +++++++++--- kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1141248c84ee..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,9 +145,15 @@ enum { }; struct dyn_ftrace { - unsigned long ip; /* address of mcount call-site */ - unsigned long flags; - struct dyn_arch_ftrace arch; + union { + unsigned long ip; /* address of mcount call-site */ + struct dyn_ftrace *freelist; + }; + union { + unsigned long flags; + struct dyn_ftrace *newlist; + }; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb377112b1bb..7b8722baf153 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->flags = (unsigned long)ftrace_new_addrs; + rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; return rec; @@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod) return -1; p = ftrace_new_addrs; - ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + ftrace_new_addrs = p->newlist; p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v1.2.3-59-g8ed1b From e6f489013b985b58d096a3091ece0ed579367232 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:27:17 +0800 Subject: trace_stat: don't call seq_printf() in seq_operation->start() Impact: Fix incorrect way using seq_file's API Use SEQ_START_TOKEN instead of calling ->stat_headers() int seq_operation->start(). 
Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Cc: Alexey Dobriyan LKML-Reference: <49C9EAE5.5070202@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cfe..8c129dd480ad 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -163,7 +163,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) /* If we are in the beginning of the file, print the headers */ if (!*pos && session->ts->stat_headers) - session->ts->stat_headers(s); + return SEQ_START_TOKEN; return seq_list_start(&session->stat_list, *pos); } @@ -172,6 +172,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct tracer_stat_session *session = s->private; + if (p == SEQ_START_TOKEN) + return seq_list_start(&session->stat_list, *pos); + return seq_list_next(p, &session->stat_list, pos); } @@ -186,6 +189,9 @@ static int stat_seq_show(struct seq_file *s, void *v) struct tracer_stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + return session->ts->stat_show(s, l->stat); } -- cgit v1.2.3-59-g8ed1b From 220ba351dfa57eca4bec5ce0098a276446a47958 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:58:39 +0800 Subject: trace_stat: keep original order Impact: make trace_stat files show items with the original order trace_stat tracer reverse the items, it makes the output looks a little ugly. Example, when we read trace_stat/workqueues, we get cpu#7's stat. at first, and then cpu#6... cpu#0. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F23F.5040307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8c129dd480ad..acdebd771a93 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session) INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry(iter_entry, &session->stat_list, list) { + list_for_each_entry_reverse(iter_entry, &session->stat_list, + list) { /* Insertion with a descendent sorting */ - if (ts->stat_cmp(new_entry->stat, - iter_entry->stat) > 0) { + if (ts->stat_cmp(iter_entry->stat, + new_entry->stat) >= 0) { - list_add_tail(&new_entry->list, - &iter_entry->list); - break; - - /* The current smaller value */ - } else if (list_is_last(&iter_entry->list, - &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } } + + /* The current larger value */ + if (list_empty(&new_entry->list)) + list_add(&new_entry->list, &session->stat_list); } exit: mutex_unlock(&session->stat_mutex); -- cgit v1.2.3-59-g8ed1b From 2f63b840bc8a816ac879ee773b035cf3e433fae4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:59:18 +0800 Subject: trace_workqueues: fix empty line's output Empty lines separate cpus stat. After previous fix(trace_stat: keep original order) applied, the empty lines are displayed at incorrect position. 
Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F266.2060706@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf1..797201e4a137 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct pid *pid; struct task_struct *tsk; + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); @@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) put_pid(pid); } - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return 0; } static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n\n"); + seq_printf(s, "# | | | |\n"); return 0; } -- cgit v1.2.3-59-g8ed1b From 2a4efa42450762cbfa5c5712aa4cc9f06924c9fd Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 25 Mar 2009 12:06:05 +0800 Subject: ftrace: Using FTRACE_WARN_ON() to check "freed record" in ftrace_release() "Because when we call ftrace_free_rec we change the rec->ip to point to the next record in the chain. Something is very wrong if rec->ip >= s && rec->ip < e and the record is already free." "Note, use FTRACE_WARN_ON() macro. This way it shuts down ftrace if it is hit and helps to avoid further damage later." -- Steven Rostedt Signed-off-by: Zhao Lei Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf153..1752a63f37c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,14 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); ftrace_free_rec(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-59-g8ed1b From 9a8118baaeb0eaa148913bed77bf9c6335f6ca63 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Mar 2009 01:24:34 -0500 Subject: tracing: filter fix for TRACE_EVENT_FORMAT events Impact: fix crash (hang) when using TRACE_EVENT_FORMAT filter files filters are only hooked up to the tracepoint events defined using TRACE_EVENT but not the tracers that use TRACE_EVENT_FORMAT, such as ftrace. Do not display the filter files at all for TRACE_EVENT_FORMAT events for the time being. 
Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d132997ab756..64ec4d278ffb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,7 +680,6 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; - struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -709,12 +708,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0644, system->entry, system, - &ftrace_subsystem_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); - return system->entry; } @@ -770,14 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } + entry = debugfs_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; -- cgit v1.2.3-59-g8ed1b From 2a93a1f21480f3aa134fe0c48954b64b8a97ef25 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Mon, 12 Jan 2009 23:35:50 +0100 Subject: trivial: fix typo "resgister" -> "register" Signed-off-by: Uwe Kleine-Koenig Signed-off-by: Jiri Kosina --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e8..53e8c8bc0c98 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling. * @ops - ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. -- cgit v1.2.3-59-g8ed1b From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001 From: Nick Andrew Date: Mon, 26 Jan 2009 11:06:57 +0100 Subject: trivial: Fix misspelling of firmware Fix misspelling of firmware. 
Signed-off-by: Nick Andrew Signed-off-by: Jiri Kosina --- Documentation/ia64/kvm.txt | 2 +- Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt | 2 +- arch/mips/sgi-ip27/ip27-smp.c | 2 +- arch/sparc/kernel/head_64.S | 2 +- drivers/net/sb1250-mac.c | 2 +- drivers/net/tg3.c | 2 +- drivers/net/wireless/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/iwlwifi/iwl-agn.c | 2 +- drivers/net/wireless/iwlwifi/iwl3945-base.c | 2 +- drivers/net/wireless/libertas/cmd.c | 2 +- drivers/pci/pci.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/staging/otus/hal/hpmain.c | 2 +- drivers/usb/atm/ueagle-atm.c | 2 +- drivers/usb/serial/ChangeLog.history | 2 +- include/linux/libata.h | 2 +- kernel/power/disk.c | 4 ++-- sound/oss/pss.c | 2 +- sound/sh/aica.c | 2 +- 20 files changed, 21 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt index 84f7cb3d5bec..ffb5c80bec3e 100644 --- a/Documentation/ia64/kvm.txt +++ b/Documentation/ia64/kvm.txt @@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries. - (3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu + (3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu 4. Boot up Linux or Windows guests: 4.1 Create or install a image for guest boot. If you have xen experience, it should be easy. diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt index 6c238f59b2a9..249db3a15d15 100644 --- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt +++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt @@ -1,6 +1,6 @@ * Uploaded QE firmware - If a new firwmare has been uploaded to the QE (usually by the + If a new firmware has been uploaded to the QE (usually by the boot loader), then a 'firmware' child node should be added to the QE node. This node provides information on the uploaded firmware that device drivers may need. diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c index 5b47d6b65275..cbcd7eb83bd1 100644 --- a/arch/mips/sgi-ip27/ip27-smp.c +++ b/arch/mips/sgi-ip27/ip27-smp.c @@ -221,7 +221,7 @@ static void __init ip27_smp_setup(void) * Assumption to be fixed: we're always booted on logical / physical * processor 0. While we're always running on logical processor 0 * this still means this is physical processor zero; it might for - * example be disabled in the firwware. + * example be disabled in the firmware. */ alloc_cpupda(0, 0); } diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index a46c3a21e26d..3a1b7bf03cff 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -686,7 +686,7 @@ tlb_fixup_done: * point. * * There used to be enormous complexity wrt. transferring - * over from the firwmare's trap table to the Linux kernel's. + * over from the firmware's trap table to the Linux kernel's. * For example, there was a chicken & egg problem wrt. 
building * the OBP page tables, yet needing to be on the Linux kernel * trap table (to translate PAGE_OFFSET addresses) in order to diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c index 88dd2e09832f..ce7551e17ba7 100644 --- a/drivers/net/sb1250-mac.c +++ b/drivers/net/sb1250-mac.c @@ -2299,7 +2299,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) eaddr = sc->sbm_hwaddr; /* - * Read the ethernet address. The firwmare left this programmed + * Read the ethernet address. The firmware left this programmed * for us in the ethernet address register for each mac. */ diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index f7efcecc4108..ed60b18addac 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -11225,7 +11225,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) return tg3_phy_init(tp); /* Reading the PHY ID register can conflict with ASF - * firwmare access to the PHY hardware. + * firmware access to the PHY hardware. */ err = 0; if ((tp->tg3_flags & TG3_FLAG_ENABLE_ASF) || diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c index 115b70487502..f4e963ba768b 100644 --- a/drivers/net/wireless/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/ipw2x00/ipw2100.c @@ -2362,7 +2362,7 @@ static void ipw2100_corruption_detected(struct ipw2100_priv *priv, int i) i * sizeof(struct ipw2100_status)); #ifdef IPW2100_DEBUG_C3 - /* Halt the fimrware so we can get a good image */ + /* Halt the firmware so we can get a good image */ write_register(priv->net_dev, IPW_REG_RESET_REG, IPW_AUX_HOST_RESET_REG_STOP_MASTER); j = 5; diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index b3449948a25a..f6174fdc12bf 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -8844,7 +8844,7 @@ static int ipw_wx_set_mode(struct net_device *dev, #endif /* CONFIG_IPW2200_MONITOR */ /* Free the existing firmware and reset the fw_loaded - * flag so ipw_load() will bring in the new firmawre */ + * flag so ipw_load() will bring in the new firmware */ free_firmware(); priv->ieee->iw_mode = wrqu->mode; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 663dc83be501..3889158b359c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -1337,7 +1337,7 @@ static int iwl_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. " diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index a71b08ca7c71..9d5f97dd7c73 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -2562,7 +2562,7 @@ static int iwl3945_read_ucode(struct iwl_priv *priv) /* api_ver should match the api version forming part of the * firmware filename ... but we don't check for that and only rely - * on the API version read from firware header from here on forward */ + * on the API version read from firmware header from here on forward */ if (api_ver < api_min || api_ver > api_max) { IWL_ERR(priv, "Driver unable to support your firmware API. 
" diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 639dd02d3d31..8c3605cdc64c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -1649,7 +1649,7 @@ static struct cmd_ctrl_node *lbs_get_cmd_ctrl_node(struct lbs_private *priv) /** * @brief This function executes next command in command - * pending queue. It will put fimware back to PS mode + * pending queue. It will put firmware back to PS mode * if applicable. * * @param priv A pointer to struct lbs_private structure diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 6d6120007af4..dab33a21d49a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -550,7 +550,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state) * @dev: PCI device to handle. * @state: PCI power state (D0, D1, D2, D3hot) to put the device into. * - * Transition a device to a new power state, using the platform formware and/or + * Transition a device to a new power state, using the platform firmware and/or * the device's PCI PM registers. * * RETURN VALUE: diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index d2433204a40c..814cb6520673 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5811,7 +5811,7 @@ static struct ibm_struct volume_driver_data = { * ThinkPads from this same time period (and earlier) probably lack the * tachometer as well. * - * Unfortunately a lot of ThinkPads with new-style ECs but whose firwmare + * Unfortunately a lot of ThinkPads with new-style ECs but whose firmware * was never fixed by IBM to report the EC firmware version string * probably support the tachometer (like the early X models), so * detecting it is quite hard. We need more data to know for sure. diff --git a/drivers/staging/otus/hal/hpmain.c b/drivers/staging/otus/hal/hpmain.c index 2e65c466aae8..dab278326931 100644 --- a/drivers/staging/otus/hal/hpmain.c +++ b/drivers/staging/otus/hal/hpmain.c @@ -152,7 +152,7 @@ u16_t zfHpInit(zdev_t* dev, u32_t frequency) else { #ifndef ZM_OTUS_LINUX_PHASE_2 - /* donwload the normal frimware */ + /* download the normal firmware */ if ((ret = zfFirmwareDownload(dev, (u32_t*)zcFwImage, (u32_t)zcFwImageSize, ZM_FIRMWARE_WLAN_ADDR)) != ZM_SUCCESS) { diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index b6483dd98acc..9cf9ff69e3e3 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -626,7 +626,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry, void *conte goto err_fw_corrupted; /* - * Start to upload formware : send reset + * Start to upload firmware : send reset */ value = 1; ret = uea_send_modem_cmd(usb, F8051_USBCS, sizeof(value), &value); diff --git a/drivers/usb/serial/ChangeLog.history b/drivers/usb/serial/ChangeLog.history index c1b279939bbf..f13fd488ebec 100644 --- a/drivers/usb/serial/ChangeLog.history +++ b/drivers/usb/serial/ChangeLog.history @@ -715,7 +715,7 @@ io_edgeport.c Change Log comments: 0.2 (01/30/2000) greg kroah-hartman Milestone 1 release. - Device is found by USB subsystem, enumerated, fimware is downloaded + Device is found by USB subsystem, enumerated, firmware is downloaded and the descriptors are printed to the debug log, config is set, and green light starts to blink. Open port works, and data can be sent and received at the default settings of the UART. 
Loopback connector diff --git a/include/linux/libata.h b/include/linux/libata.h index 76262d83656b..b450a2628855 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -379,7 +379,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ - ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ /* DMA mask for user DMA control: User visible values; DO NOT diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 4a4a206b1979..9d1c1a0de350 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -265,7 +265,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. + * prepare the platform firmware for the power transition. * * Must be called with pm_mutex held */ @@ -378,7 +378,7 @@ static int resume_target_kernel(void) * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. + * prepare the platform firmware for the transition. * * Must be called with pm_mutex held */ diff --git a/sound/oss/pss.c b/sound/oss/pss.c index 16517a5a1301..83f5ee236b12 100644 --- a/sound/oss/pss.c +++ b/sound/oss/pss.c @@ -46,7 +46,7 @@ * load the driver as it did in previous versions. * 04-07-1999: Anthony Barbachan * Added module parameter pss_firmware to allow the user to tell - * the driver where the fireware file is located. The default + * the driver where the firmware file is located. The default * setting is the previous hardcoded setting "/etc/sound/pss_synth". 
* 00-03-03: Christoph Hellwig * Adapted to module_init/module_exit diff --git a/sound/sh/aica.c b/sound/sh/aica.c index f551233c5a08..583a3693df75 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -565,7 +565,7 @@ static int load_aica_firmware(void) err = request_firmware(&fw_entry, "aica_firmware.bin", &pd->dev); if (unlikely(err)) return err; - /* write firware into memory */ + /* write firmware into memory */ spu_disable(); spu_memload(0, fw_entry->data, fw_entry->size); spu_enable(); -- cgit v1.2.3-59-g8ed1b From 692105b8ac5bcd75dc65f6a8f10bdbd0f0f34dcf Mon Sep 17 00:00:00 2001 From: Matt LaPlante Date: Mon, 26 Jan 2009 11:12:25 +0100 Subject: trivial: fix typos/grammar errors in Kconfig texts Signed-off-by: Matt LaPlante Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- arch/arm/mach-omap1/Kconfig | 2 +- arch/avr32/Kconfig | 6 +++--- arch/blackfin/Kconfig | 6 +++--- arch/cris/Kconfig | 6 +++--- arch/cris/arch-v32/Kconfig | 2 +- arch/cris/arch-v32/drivers/Kconfig | 2 +- arch/cris/arch-v32/mach-fs/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/platforms/Kconfig | 2 +- arch/powerpc/sysdev/bestcomm/Kconfig | 4 ++-- arch/sh/Kconfig | 4 ++-- arch/x86/Kconfig | 4 ++-- drivers/ata/Kconfig | 2 +- drivers/gpio/Kconfig | 6 +++--- drivers/hid/Kconfig | 2 +- drivers/input/Kconfig | 2 +- drivers/isdn/mISDN/Kconfig | 10 ++++++---- drivers/leds/Kconfig | 6 +++--- drivers/media/common/tuners/Kconfig | 2 +- drivers/media/dvb/frontends/Kconfig | 2 +- drivers/mfd/Kconfig | 2 +- drivers/misc/Kconfig | 6 +++--- drivers/mmc/host/Kconfig | 2 +- drivers/scsi/Kconfig | 4 ++-- drivers/serial/Kconfig | 2 +- drivers/staging/Kconfig | 4 ++-- drivers/staging/comedi/Kconfig | 4 ++-- drivers/staging/go7007/Kconfig | 4 ++-- drivers/staging/panel/Kconfig | 2 +- drivers/usb/gadget/Kconfig | 2 +- drivers/usb/serial/Kconfig | 4 ++-- drivers/uwb/Kconfig | 4 ++-- drivers/xen/Kconfig | 2 +- fs/ext4/Kconfig | 2 +- fs/ubifs/Kconfig | 4 ++-- init/Kconfig | 6 +++--- kernel/trace/Kconfig | 9 ++++----- net/Kconfig | 2 +- net/ipv6/Kconfig | 18 +++++++++--------- net/mac80211/Kconfig | 2 +- net/netfilter/Kconfig | 2 +- net/phonet/Kconfig | 2 +- net/sunrpc/Kconfig | 2 +- net/wimax/Kconfig | 2 +- sound/soc/blackfin/Kconfig | 2 +- 46 files changed, 86 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig index 3f325d3718a9..cd8de89c5fad 100644 --- a/arch/arm/mach-omap1/Kconfig +++ b/arch/arm/mach-omap1/Kconfig @@ -109,7 +109,7 @@ config MACH_OMAP_PALMZ71 help Support for the Palm Zire71 PDA. To boot the kernel, you'll need a PalmOS compatible bootloader; check out - http://hackndev.com/palm/z71 for more informations. + http://hackndev.com/palm/z71 for more information. Say Y here if you have such a PDA, say N otherwise. config MACH_OMAP_PALMTT diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 05fe3053dcae..414a8ad97f52 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -127,13 +127,13 @@ config BOARD_HAMMERHEAD select CPU_AT32AP7000 select USB_ARCH_HAS_HCD help - The Hammerhead platform is built around a AVR32 32-bit microcontroller from Atmel. + The Hammerhead platform is built around an AVR32 32-bit microcontroller from Atmel. It offers versatile peripherals, such as ethernet, usb device, usb host etc. - The board also incooperates a power supply and is a Power over Ethernet (PoE) Powered + The board also incorporates a power supply and is a Power over Ethernet (PoE) Powered Device (PD). 
- Additonally, a Cyclone III FPGA from Altera is integrated on the board. The FPGA is + Additionally, a Cyclone III FPGA from Altera is integrated on the board. The FPGA is mapped into the 32-bit AVR memory bus. The FPGA offers two DDR2 SDRAM interfaces, which will cover even the most exceptional need of memory bandwidth. Together with the onboard video decoder the board is ready for video processing. diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 0c1f86e3e44a..3640cdc38aac 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -777,7 +777,7 @@ config CACHELINE_ALIGNED_L1 default n if BF54x depends on !BF531 help - If enabled, cacheline_anligned data is linked + If enabled, cacheline_aligned data is linked into L1 data memory. (less latency) config SYSCALL_TAB_L1 @@ -957,7 +957,7 @@ config MPU memory they do not own. This comes at a performance penalty and is recommended only for debugging. -comment "Asynchonous Memory Configuration" +comment "Asynchronous Memory Configuration" menu "EBIU_AMGCTL Global Control" config C_AMCKEN @@ -989,7 +989,7 @@ config C_B3PEN default n choice - prompt"Enable Asynchonous Memory Banks" + prompt "Enable Asynchronous Memory Banks" default C_AMBEN_ALL config C_AMBEN diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 3462245fe9fb..7adac388a771 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -438,7 +438,7 @@ config ETRAX_SERIAL_PORT0_DMA1_IN help Enables the DMA1 input channel for ser0 (ttyS0). If you do not enable DMA, an interrupt for each character will be - used when receiveing data. + used when receiving data. Normally you want to use DMA, unless you use the DMA channel for something else. @@ -565,7 +565,7 @@ config ETRAX_SERIAL_PORT2_DMA7_IN help Enables the DMA7 input channel for ser2 (ttyS2). If you do not enable DMA, an interrupt for each character will be - used when receiveing data. + used when receiving data. Normally you want to use DMA, unless you use the DMA channel for something else. @@ -604,7 +604,7 @@ config ETRAX_SERIAL_PORT3_DMA3_IN help Enables the DMA3 input channel for ser3 (ttyS3). If you do not enable DMA, an interrupt for each character will be - used when receiveing data. + used when receiving data. Normally you want to use DMA, unless you use the DMA channel for something else. diff --git a/arch/cris/arch-v32/Kconfig b/arch/cris/arch-v32/Kconfig index 005ed2b3f7f4..21bbd93be34f 100644 --- a/arch/cris/arch-v32/Kconfig +++ b/arch/cris/arch-v32/Kconfig @@ -28,7 +28,7 @@ config ETRAX_NBR_LED_GRP_ONE help Select this if you want one Ethernet LED group. This LED group can be used for one or more Ethernet interfaces. However, it is - recomended that each Ethernet interface use a dedicated LED group. + recommended that each Ethernet interface use a dedicated LED group. config ETRAX_NBR_LED_GRP_TWO bool "Use two LED groups" diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig index 7a64fcef9d07..b9e328e688be 100644 --- a/arch/cris/arch-v32/drivers/Kconfig +++ b/arch/cris/arch-v32/drivers/Kconfig @@ -342,7 +342,7 @@ config ETRAX_SERIAL_PORT4_DMA9_IN help Enables the DMA9 input channel for ser4 (ttyS4). If you do not enable DMA, an interrupt for each character will be - used when receiveing data. + used when receiving data. Normally you want to use DMA, unless you use the DMA channel for something else. 
diff --git a/arch/cris/arch-v32/mach-fs/Kconfig b/arch/cris/arch-v32/mach-fs/Kconfig index f6d74475f1c6..774de82abef6 100644 --- a/arch/cris/arch-v32/mach-fs/Kconfig +++ b/arch/cris/arch-v32/mach-fs/Kconfig @@ -59,7 +59,7 @@ config ETRAX_SDRAM_GRP1_CONFIG depends on ETRAX_ARCH_V32 default "0" help - SDRAM configuration for group 1. The defult value is 0 + SDRAM configuration for group 1. The default value is 0 because group 1 is not used in the default configuration, described in the help for SDRAM_GRP0_CONFIG. diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 206cb7953b0c..92508c521673 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -136,7 +136,7 @@ config MACH_JAZZ help This a family of machines based on the MIPS R4030 chipset which was used by several vendors to build RISC/os and Windows NT workstations. - Members include the Acer PICA, MIPS Magnum 4000, MIPS Millenium and + Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and Olivetti M700-10 workstations. config LASAT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 74cc312c347c..2e2160c3de7c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -342,7 +342,7 @@ config PHYP_DUMP help Hypervisor-assisted dump is meant to be a kdump replacement offering robustness and speed not possible without system - hypervisor assistence. + hypervisor assistance. If unsure, say "N" diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 200b9cb900ea..9a8bb53d315d 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -298,7 +298,7 @@ config CPM config OF_RTC bool help - Uses information from the OF or flattened device tree to instatiate + Uses information from the OF or flattened device tree to instantiate platform devices for direct mapped RTC chips like the DS1742 or DS1743. source "arch/powerpc/sysdev/bestcomm/Kconfig" diff --git a/arch/powerpc/sysdev/bestcomm/Kconfig b/arch/powerpc/sysdev/bestcomm/Kconfig index 0b192a1c429d..29e427085efb 100644 --- a/arch/powerpc/sysdev/bestcomm/Kconfig +++ b/arch/powerpc/sysdev/bestcomm/Kconfig @@ -9,8 +9,8 @@ config PPC_BESTCOMM select PPC_LIB_RHEAP help BestComm is the name of the communication coprocessor found - on the Freescale MPC5200 family of processor. It's usage is - optionnal for some drivers (like ATA), but required for + on the Freescale MPC5200 family of processor. Its usage is + optional for some drivers (like ATA), but required for others (like FEC). If you want to use drivers that require DMA operations, diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8d50d527c595..2d52b515c241 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -640,10 +640,10 @@ config GUSA_RB depends on GUSA && CPU_SH3 || (CPU_SH4 && !CPU_SH4A) help Enabling this option will allow the kernel to implement some - atomic operations using a software implemention of load-locked/ + atomic operations using a software implementation of load-locked/ store-conditional (LLSC). On machines which do not have hardware LLSC, this should be more efficient than the other alternative of - disabling insterrupts around the atomic sequence. + disabling interrupts around the atomic sequence. endmenu diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 06c02c00d7d9..d7b5621382f6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1129,7 +1129,7 @@ config NODES_SHIFT depends on NEED_MULTIPLE_NODES ---help--- Specify the maximum number of NUMA Nodes available on the target - system. 
Increases memory reserved to accomodate various tables. + system. Increases memory reserved to accommodate various tables. config HAVE_ARCH_BOOTMEM def_bool y @@ -1307,7 +1307,7 @@ config MTRR_SANITIZER add writeback entries. Can be disabled with disable_mtrr_cleanup on the kernel command line. - The largest mtrr entry size for a continous block can be set with + The largest mtrr entry size for a continuous block can be set with mtrr_chunk_size. If unsure, say Y. diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index 0bcf26464670..9120717c0701 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -86,7 +86,7 @@ config ATA_SFF For users with exclusively modern controllers like AHCI, Silicon Image 3124, or Marvell 6440, you may choose to - disable this uneeded SFF support. + disable this unneeded SFF support. If unsure, say Y. diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3d2565441b36..edb02530e461 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -42,9 +42,9 @@ config DEBUG_GPIO depends on DEBUG_KERNEL help Say Y here to add some extra checks and diagnostics to GPIO calls. - The checks help ensure that GPIOs have been properly initialized - before they are used and that sleeping calls aren not made from - nonsleeping contexts. They can make bitbanged serial protocols + These checks help ensure that GPIOs have been properly initialized + before they are used, and that sleeping calls are not made from + non-sleeping contexts. They can make bitbanged serial protocols slower. The diagnostics help catch the type of setup errors that are most common when setting up new platforms or boards. diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index e85c8fe9ffcf..504cfaa6160f 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -243,7 +243,7 @@ config GREENASIA_FF select INPUT_FF_MEMLESS ---help--- Say Y here if you have a GreenAsia (Product ID 0x12) based game controller - (like MANTA Warior MM816 and SpeedLink Strike2 SL-6635) or adapter + (like MANTA Warrior MM816 and SpeedLink Strike2 SL-6635) or adapter and want to enable force feedback support for it. config HID_TOPSEED diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig index 5f9d860925a1..cd50c00ab20f 100644 --- a/drivers/input/Kconfig +++ b/drivers/input/Kconfig @@ -143,7 +143,7 @@ config INPUT_APMPOWER ---help--- Say Y here if you want suspend key events to trigger a user requested suspend through APM. This is useful on embedded - systems where such behviour is desired without userspace + systems where such behaviour is desired without userspace interaction. If unsure, say N. To compile this driver as a module, choose M here: the diff --git a/drivers/isdn/mISDN/Kconfig b/drivers/isdn/mISDN/Kconfig index 4938355c4072..1747a02a019a 100644 --- a/drivers/isdn/mISDN/Kconfig +++ b/drivers/isdn/mISDN/Kconfig @@ -14,13 +14,15 @@ config MISDN_DSP depends on MISDN help Enable support for digital audio processing capability. + This module may be used for special applications that require - cross connecting of bchannels, conferencing, dtmf decoding + cross connecting of bchannels, conferencing, dtmf decoding, echo cancelation, tone generation, and Blowfish encryption and - decryption. - It may use hardware features if available. + decryption. It may use hardware features if available. + E.g. it is required for PBX4Linux. Go to http://isdn.eversberg.eu - and get more informations about this module and it's usage. + and get more information about this module and its usage. 
+ If unsure, say 'N'. config MISDN_L1OIP diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 556aeca0d860..d9db17624f12 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -100,7 +100,7 @@ config LEDS_HP6XX tristate "LED Support for the HP Jornada 6xx" depends on LEDS_CLASS && SH_HP6XX help - This option enables led support for the handheld + This option enables LED support for the handheld HP Jornada 620/660/680/690. config LEDS_PCA9532 @@ -108,7 +108,7 @@ config LEDS_PCA9532 depends on LEDS_CLASS && I2C && INPUT && EXPERIMENTAL help This option enables support for NXP pca9532 - led controller. It is generally only usefull + LED controller. It is generally only useful as a platform driver config LEDS_GPIO @@ -144,7 +144,7 @@ config LEDS_CLEVO_MAIL Positivo Mobile (Clevo M5x0V) If your model is not listed here you can try the "nodetect" - module paramter. + module parameter. To compile this driver as a module, choose M here: the module will be called leds-clevo-mail. diff --git a/drivers/media/common/tuners/Kconfig b/drivers/media/common/tuners/Kconfig index 6f92beaa5ac8..da058c174049 100644 --- a/drivers/media/common/tuners/Kconfig +++ b/drivers/media/common/tuners/Kconfig @@ -147,7 +147,7 @@ config MEDIA_TUNER_XC5000 default m if DVB_FE_CUSTOMISE help A driver for the silicon tuner XC5000 from Xceive. - This device is only used inside a SiP called togther with a + This device is only used inside a SiP called together with a demodulator for now. config MEDIA_TUNER_MXL5005S diff --git a/drivers/media/dvb/frontends/Kconfig b/drivers/media/dvb/frontends/Kconfig index 00269560793a..baf808e9d6e1 100644 --- a/drivers/media/dvb/frontends/Kconfig +++ b/drivers/media/dvb/frontends/Kconfig @@ -439,7 +439,7 @@ config DVB_TUNER_DIB0070 default m if DVB_FE_CUSTOMISE help A driver for the silicon baseband tuner DiB0070 from DiBcom. - This device is only used inside a SiP called togther with a + This device is only used inside a SiP called together with a demodulator for now. comment "SEC control devices for DVB-S" diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 06a2b0f7737c..75f35dbb11dc 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -88,7 +88,7 @@ config MENELAUS help If you say yes here you get support for the Texas Instruments TWL92330/Menelaus Power Management chip. This include voltage - regulators, Dual slot memory card tranceivers, real-time clock + regulators, Dual slot memory card transceivers, real-time clock and other features that are often used in portable devices like cell phones and PDAs. diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 1c484084ed4f..7a1e948840e6 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -18,8 +18,8 @@ config ATMEL_PWM depends on AVR32 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91CAP9 help This option enables device driver support for the PWM channels - on certain Atmel prcoessors. Pulse Width Modulation is used for - purposes including software controlled power-efficent backlights + on certain Atmel processors. Pulse Width Modulation is used for + purposes including software controlled power-efficient backlights on LCD displays, motor control, and waveform generation. 
config ATMEL_TCLIB @@ -142,7 +142,7 @@ config ATMEL_SSC tristate "Device driver for Atmel SSC peripheral" depends on AVR32 || ARCH_AT91 ---help--- - This option enables device driver support for Atmel Syncronized + This option enables device driver support for Atmel Synchronized Serial Communication peripheral (SSC). The SSC peripheral supports a wide variety of serial frame based diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 99d4b28d52ed..6fbb246c40bb 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -177,7 +177,7 @@ config MMC_SPI select CRC7 select CRC_ITU_T help - Some systems accss MMC/SD/SDIO cards using a SPI controller + Some systems access MMC/SD/SDIO cards using a SPI controller instead of using a "native" MMC/SD/SDIO controller. This has a disadvantage of being relatively high overhead, but a compensating advantage of working on many systems without dedicated MMC/SD/SDIO diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index e2f44e6c0bcb..20297c521e50 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1380,7 +1380,7 @@ config SCSI_LPFC_DEBUG_FS bool "Emulex LightPulse Fibre Channel debugfs Support" depends on SCSI_LPFC && DEBUG_FS help - This makes debugging infomation from the lpfc driver + This makes debugging information from the lpfc driver available via the debugfs filesystem. config SCSI_SIM710 @@ -1388,7 +1388,7 @@ config SCSI_SIM710 depends on (EISA || MCA) && SCSI select SCSI_SPI_ATTRS ---help--- - This driver for NCR53c710 based SCSI host adapters. + This driver is for NCR53c710 based SCSI host adapters. It currently supports Compaq EISA cards and NCR MCA cards diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig index 9be11b0963f2..aa9d3a4c2d50 100644 --- a/drivers/serial/Kconfig +++ b/drivers/serial/Kconfig @@ -1374,7 +1374,7 @@ config SERIAL_BFIN_SPORT depends on BLACKFIN && EXPERIMENTAL select SERIAL_CORE help - Enble support SPORT emulate UART on Blackfin series. + Enable SPORT emulate UART on Blackfin series. To compile this driver as a module, choose M here: the module will be called bfin_sport_uart. diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 211af86a6c55..92981c2383ee 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -4,7 +4,7 @@ menuconfig STAGING ---help--- This option allows you to select a number of drivers that are not of the "normal" Linux kernel quality level. These drivers - are placed here in order to get a wider audience for use of + are placed here in order to get a wider audience to make use of them. Please note that these drivers are under heavy development, may or may not work, and may contain userspace interfaces that most likely will be changed in the near @@ -12,7 +12,7 @@ menuconfig STAGING Using any of these drivers will taint your kernel which might affect support options from both the community, and various - commercial support orginizations. + commercial support organizations. 
If you wish to work on these drivers, to help improve them, or to report problems you have with them, please see the diff --git a/drivers/staging/comedi/Kconfig b/drivers/staging/comedi/Kconfig index b47ca1e7e383..83a93a5c6392 100644 --- a/drivers/staging/comedi/Kconfig +++ b/drivers/staging/comedi/Kconfig @@ -1,9 +1,9 @@ config COMEDI - tristate "Data Acquision support (comedi)" + tristate "Data acquisition support (comedi)" default N depends on m ---help--- - Enable support a wide range of data acquision devices + Enable support a wide range of data acquisition devices for Linux. config COMEDI_RT diff --git a/drivers/staging/go7007/Kconfig b/drivers/staging/go7007/Kconfig index f2cf7f66ae05..ca6ade6c4b47 100644 --- a/drivers/staging/go7007/Kconfig +++ b/drivers/staging/go7007/Kconfig @@ -10,7 +10,7 @@ config VIDEO_GO7007 select CRC32 default N ---help--- - This is a video4linux driver for some wierd device... + This is a video4linux driver for some weird device... To compile this driver as a module, choose M here: the module will be called go7007 @@ -20,7 +20,7 @@ config VIDEO_GO7007_USB depends on VIDEO_GO7007 && USB default N ---help--- - This is a video4linux driver for some wierd device... + This is a video4linux driver for some weird device... To compile this driver as a module, choose M here: the module will be called go7007-usb diff --git a/drivers/staging/panel/Kconfig b/drivers/staging/panel/Kconfig index c4b30f2a549b..3abe7c9d558d 100644 --- a/drivers/staging/panel/Kconfig +++ b/drivers/staging/panel/Kconfig @@ -110,7 +110,7 @@ config PANEL_LCD_BWIDTH ---help--- Most LCDs use a standard controller which supports hardware lines of 40 characters, although sometimes only 16, 20 or 24 of them are really wired - to the terminal. This results in some non-visible but adressable characters, + to the terminal. This results in some non-visible but addressable characters, and is the case for most parallel LCDs. Other LCDs, and some serial ones, however, use the same line width internally as what is visible. The KS0074 for example, uses 16 characters per line for 16 visible characters per line. diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 770b3eaa9184..080bb1e4b847 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -392,7 +392,7 @@ config USB_GADGET_FSL_QE controllers having QE or CPM2, given minor tweaks. Set CONFIG_USB_GADGET to "m" to build this driver as a - dynmically linked module called "fsl_qe_udc". + dynamically linked module called "fsl_qe_udc". config USB_FSL_QE tristate diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig index a65f9196b0a0..c480ea4c19f2 100644 --- a/drivers/usb/serial/Kconfig +++ b/drivers/usb/serial/Kconfig @@ -518,8 +518,8 @@ config USB_SERIAL_SIERRAWIRELESS help Say M here if you want to use Sierra Wireless devices. - Many deviecs have a feature known as TRU-Install, for those devices - to work properly the USB Storage Sierra feature must be enabled. + Many devices have a feature known as TRU-Install. For those devices + to work properly, the USB Storage Sierra feature must be enabled. To compile this driver as a module, choose M here: the module will be called sierra. diff --git a/drivers/uwb/Kconfig b/drivers/uwb/Kconfig index ca783127af36..bac8e7a6f17b 100644 --- a/drivers/uwb/Kconfig +++ b/drivers/uwb/Kconfig @@ -48,10 +48,10 @@ config UWB_WHCI help This driver enables the radio controller for WHCI cards. 
- WHCI is an specification developed by Intel + WHCI is a specification developed by Intel (http://www.intel.com/technology/comms/wusb/whci.htm) much in the spirit of USB's EHCI, but for UWB and Wireless USB - radio/host controllers connected via memmory mapping (eg: + radio/host controllers connected via memory mapping (eg: PCI). Most of these cards come also with a Wireless USB host controller. diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 526187c8a12d..8ac9cddac575 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -37,7 +37,7 @@ config XEN_COMPAT_XENFS The old xenstore userspace tools expect to find "xenbus" under /proc/xen, but "xenbus" is now found at the root of the xenfs filesystem. Selecting this causes the kernel to create - the compatibilty mount point /proc/xen if it is running on + the compatibility mount point /proc/xen if it is running on a xen platform. If in doubt, say yes. diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 7505482a08fa..418b6f3b0ae8 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -18,7 +18,7 @@ config EXT4_FS filesystem; while there will be some performance gains from the delayed allocation and inode table readahead, the best performance gains will require enabling ext4 features in the - filesystem, or formating a new filesystem as an ext4 + filesystem, or formatting a new filesystem as an ext4 filesystem initially. To compile this file system support as a module, choose M here. The diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index e35b54d5059d..830e3f76f442 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, - are enabled in UBIFS. Removing compressors means inbility to read + are enabled in UBIFS. Removing compressors means inability to read existing file systems. If unsure, say 'N'. @@ -32,7 +32,7 @@ config UBIFS_FS_LZO depends on UBIFS_FS default y help - LZO compressor is generally faster then zlib but compresses worse. + LZO compressor is generally faster than zlib but compresses worse. Say 'Y' if unsure. config UBIFS_FS_ZLIB diff --git a/init/Kconfig b/init/Kconfig index 14c483d2b7c9..bcffc0e47647 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -687,7 +687,7 @@ config PID_NS depends on NAMESPACES && EXPERIMENTAL help Support process id namespaces. This allows having multiple - process with the same pid as long as they are in different + processes with the same pid as long as they are in different pid namespaces. This is a building block of containers. Unless you want to work with an experimental feature @@ -952,7 +952,7 @@ config COMPAT_BRK Randomizing heap placement makes heap exploits harder, but it also breaks ancient binaries (including anything libc5 based). This option changes the bootup default to heap randomization - disabled, and can be overriden runtime by setting + disabled, and can be overridden at runtime by setting /proc/sys/kernel/randomize_va_space to 2. On non-ancient distros (post-2000 ones) N is usually a safe choice. @@ -1110,7 +1110,7 @@ config INIT_ALL_POSSIBLE cpu_possible_map, some of them chose to initialize cpu_possible_map with all 1s, and others with all 0s. When they were centralised, it was better to provide this option than to break all the archs - and have several arch maintainers persuing me down dark alleys. + and have several arch maintainers pursuing me down dark alleys. 
config STOP_MACHINE bool diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 34e707e5ab87..504086ab4443 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER help Enable the kernel to trace a function at both its return and its entry. - It's first purpose is to trace the duration of functions and - draw a call graph for each thread with some informations like - the return value. - This is done by setting the current return address on the current - task structure into a stack of calls. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/net/Kconfig b/net/Kconfig index ec93e7e38b38..ce77db4fcec8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -140,7 +140,7 @@ config NETFILTER_ADVANCED default y help If you say Y here you can select between all the netfilter modules. - If you say N the more ununsual ones will not be shown and the + If you say N the more unusual ones will not be shown and the basic ones needed by most people will default to 'M'. If unsure, say Y. diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ec992159b5f8..ca8cb326d1d2 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -22,17 +22,17 @@ menuconfig IPV6 if IPV6 config IPV6_PRIVACY - bool "IPv6: Privacy Extensions support" + bool "IPv6: Privacy Extensions (RFC 3041) support" ---help--- Privacy Extensions for Stateless Address Autoconfiguration in IPv6 - support. With this option, additional periodically-alter - pseudo-random global-scope unicast address(es) will assigned to + support. With this option, additional periodically-altered + pseudo-random global-scope unicast address(es) will be assigned to your interface(s). - We use our standard pseudo random algorithm to generate randomized - interface identifier, instead of one described in RFC 3041. + We use our standard pseudo-random algorithm to generate the + randomized interface identifier, instead of one described in RFC 3041. - By default, kernel do not generate temporary addresses. + By default the kernel does not generate temporary addresses. To use temporary addresses, do echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr @@ -43,9 +43,9 @@ config IPV6_ROUTER_PREF bool "IPv6: Router Preference (RFC 4191) support" ---help--- Router Preference is an optional extension to the Router - Advertisement message to improve the ability of hosts - to pick more appropriate router, especially when the hosts - is placed in a multi-homed network. + Advertisement message which improves the ability of hosts + to pick an appropriate router, especially when the hosts + are placed in a multi-homed network. If unsure, say N. diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 60c16162474c..f3d9ae350fb6 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -33,7 +33,7 @@ choice ---help--- This option selects the default rate control algorithm mac80211 will use. Note that this default can still be - overriden through the ieee80211_default_rc_algo module + overridden through the ieee80211_default_rc_algo module parameter if different algorithms are available. 
config MAC80211_RC_DEFAULT_PID diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2c967e4f706c..bb279bf59a1b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -52,7 +52,7 @@ config NF_CT_ACCT Please note that currently this option only sets a default state. You may change it at boot time with nf_conntrack.acct=0/1 kernel - paramater or by loading the nf_conntrack module with acct=0/1. + parameter or by loading the nf_conntrack module with acct=0/1. You may also disable/enable it on a running system with: sysctl net.netfilter.nf_conntrack_acct=0/1 diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig index 51a5669573f2..6ec7d55b1769 100644 --- a/net/phonet/Kconfig +++ b/net/phonet/Kconfig @@ -6,7 +6,7 @@ config PHONET tristate "Phonet protocols family" help The Phone Network protocol (PhoNet) is a packet-oriented - communication protocol developped by Nokia for use with its modems. + communication protocol developed by Nokia for use with its modems. This is required for Maemo to use cellular data connectivity (if supported). It can also be used to control Nokia phones diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 5592883e1e4a..3f7faa9688ae 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -69,7 +69,7 @@ config RPCSEC_GSS_SPKM3 select CRYPTO_CBC help Choose Y here to enable Secure RPC using the SPKM3 public key - GSS-API mechansim (RFC 2025). + GSS-API mechanism (RFC 2025). Secure RPC calls with SPKM3 require an auxiliary userspace daemon which may be found in the Linux nfs-utils package diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig index 18495cdcd10d..1b46747a5f5a 100644 --- a/net/wimax/Kconfig +++ b/net/wimax/Kconfig @@ -8,7 +8,7 @@ # # As well, enablement of the RFKILL code means we need the INPUT layer # support to inject events coming from hw rfkill switches. That -# dependency could be killed if input.h provided appropiate means to +# dependency could be killed if input.h provided appropriate means to # work when input is disabled. comment "WiMAX Wireless Broadband support requires CONFIG_INPUT enabled" diff --git a/sound/soc/blackfin/Kconfig b/sound/soc/blackfin/Kconfig index 0a2f8f9eff53..811596f4c092 100644 --- a/sound/soc/blackfin/Kconfig +++ b/sound/soc/blackfin/Kconfig @@ -42,7 +42,7 @@ config SND_BF5XX_AC97 You will also need to select the audio interfaces to support below. Note: - AC97 codecs which do not implment the slot-16 mode will not function + AC97 codecs which do not implement the slot-16 mode will not function properly with this driver. This driver is known to work with the Analog Devices line of AC97 codecs. -- cgit v1.2.3-59-g8ed1b From f69b17d7e745d8edd7c0d90390cbaa77e63c5ea3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 20 Mar 2009 17:40:06 +0800 Subject: rcu: rcu_barrier VS cpu_hotplug: Ensure callbacks in dead cpu are migrated to online cpu cpu hotplug may happen asynchronously, some rcu callbacks are maybe still on dead cpu, rcu_barrier() also needs to wait for these rcu callbacks to complete, so we must ensure callbacks in dead cpu are migrated to online cpu. Paul E. McKenney's review: Good stuff, Lai!!! Simpler than any of the approaches that I was considering, and, better yet, independent of the underlying RCU implementation!!! I was initially worried that wake_up() might wake only one of two possible wait_event()s, namely rcu_barrier() and the CPU_POST_DEAD code, but the fact that wait_event() clears WQ_FLAG_EXCLUSIVE avoids that issue. 
I was also worried about the fact that different RCU implementations have different mappings of call_rcu(), call_rcu_bh(), and call_rcu_sched(), but this is OK as well because we just get an extra (harmless) callback in the case that they map together (for example, Classic RCU has call_rcu_sched() mapping to call_rcu()). Overlap of CPU-hotplug operations is prevented by cpu_add_remove_lock, and any stray callbacks that arrive (for example, from irq handlers running on the dying CPU) either are ahead of the CPU_DYING callbacks on the one hand (and thus accounted for), or happened after the rcu_barrier() started on the other (and thus don't need to be accounted for). Signed-off-by: Lai Jiangshan Reviewed-by: Paul E. McKenney Cc: Peter Zijlstra LKML-Reference: <49C36476.1010400@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cae8a059cf47..2c7b8457d0d2 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type) } } +static inline void wait_migrated_callbacks(void); + /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. @@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); + wait_migrated_callbacks(); } /** @@ -176,9 +179,50 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + +static void rcu_migrate_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_migrate_type_count)) + wake_up(&rcu_migrate_wq); +} + +static inline void wait_migrated_callbacks(void) +{ + wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); +} + +static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DYING) { + /* + * preempt_disable() in on_each_cpu() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(), + * and the dead cpu(if it exist) queues rcu_migrate_callback()s. + * + * These callbacks ensure _rcu_barrier() waits for all + * RCU callbacks of the specified type to complete. + */ + atomic_set(&rcu_migrate_type_count, 3); + call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); + call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); + call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + } else if (action == CPU_POST_DEAD) { + /* rcu_migrate_head is protected by cpu_add_remove_lock */ + wait_migrated_callbacks(); + } + + return NOTIFY_OK; +} + void __init rcu_init(void) { __rcu_init(); + hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); } void rcu_scheduler_starting(void) -- cgit v1.2.3-59-g8ed1b From e180a6b7759a99a28cbcce3547c4c80822cb6c2a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:29 -0600 Subject: param: fix charp parameters set via sysfs Impact: fix crash on reading from /sys/module/.../ieee80211_default_rc_algo The module_param type "charp" simply sets a char * pointer in the module to the parameter in the commandline string: this is why we keep the (mangled) module command line around. 
But when set via sysfs (as about 11 charp parameters can be) this memory is freed on the way out of the write(). Future reads hit random mem. So we kstrdup instead: we have to check we're not in early commandline parsing, and we have to note when we've used it so we can reliably kfree the parameter when it's next overwritten, and also on module unload. (Thanks to Randy Dunlap for CONFIG_SYSFS=n fixes) Reported-by: Sitsofe Wheeler Diagnosed-by: Frederic Weisbecker Tested-by: Frederic Weisbecker Tested-by: Christof Schmitt Signed-off-by: Rusty Russell --- include/linux/module.h | 4 ++++ include/linux/moduleparam.h | 10 ++++++++++ kernel/module.c | 14 ++++++++------ kernel/params.c | 26 +++++++++++++++++++++++++- 4 files changed, 47 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..08e5e75d6122 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -248,6 +248,10 @@ struct module const unsigned long *crcs; unsigned int num_syms; + /* Kernel parameters. */ + struct kernel_param *kp; + unsigned int num_kp; + /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index e4af3399ef48..a4f0b931846c 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -138,6 +138,16 @@ extern int parse_args(const char *name, unsigned num, int (*unknown)(char *param, char *val)); +/* Called by module remove. */ +#ifdef CONFIG_SYSFS +extern void destroy_params(const struct kernel_param *params, unsigned num); +#else +static inline void destroy_params(const struct kernel_param *params, + unsigned num) +{ +} +#endif /* !CONFIG_SYSFS */ + /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub Jelinek, who IIRC came up with this idea for the 2.4 module init code. */ diff --git a/kernel/module.c b/kernel/module.c index f77ac320d0b5..b862fdb6a372 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1491,6 +1491,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* Free any allocated parameters. */ + destroy_params(mod->kp, mod->num_kp); + /* release any pointers to mcount in this module */ ftrace_release(mod->module_core, mod->core_size); @@ -1898,8 +1901,7 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_kp, num_mcount; - struct kernel_param *kp; + unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -2144,8 +2146,8 @@ static noinline struct module *load_module(void __user *umod, /* Now we've got everything in the final locations, we can * find optional sections. 
*/ - kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), - &num_kp); + mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", + sizeof(*mod->kp), &mod->num_kp); mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", sizeof(*mod->syms), &mod->num_syms); mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); @@ -2291,11 +2293,11 @@ static noinline struct module *load_module(void __user *umod, */ list_add_rcu(&mod->list, &modules); - err = parse_args(mod->name, mod->args, kp, num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, kp, num_kp); + err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); diff --git a/kernel/params.c b/kernel/params.c index a1e3025b19a9..de273ec85bd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,6 +24,9 @@ #include #include +/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */ +#define KPARAM_KMALLOCED 0x80000000 + #if 0 #define DEBUGP printk #else @@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - *(char **)kp->arg = (char *)val; + if (kp->perm & KPARAM_KMALLOCED) + kfree(*(char **)kp->arg); + + /* This is a hack. We can't need to strdup in early boot, and we + * don't need to; this mangled commandline is preserved. */ + if (slab_is_available()) { + kp->perm |= KPARAM_KMALLOCED; + *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + if (!kp->arg) + return -ENOMEM; + } else + *(const char **)kp->arg = val; + return 0; } @@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod) } #endif +void destroy_params(const struct kernel_param *params, unsigned num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].perm & KPARAM_KMALLOCED) + kfree(*(char **)params[i].arg); +} + static void __init kernel_add_sysfs_param(const char *name, struct kernel_param *kparam, unsigned int name_skip) -- cgit v1.2.3-59-g8ed1b From b10153fe31dde3805f8320b61ef147cebe379aee Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Wed, 25 Mar 2009 00:07:19 +0800 Subject: kernel/module.c: fix an unused goto label Impact: cleanup Label 'free_init' is only used when defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP), so move it inside to shut up gcc. Signed-off-by: WANG Cong Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b862fdb6a372..7af72bbe4cc0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2319,8 +2319,8 @@ static noinline struct module *load_module(void __user *umod, ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); - free_init: #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) + free_init: percpu_modfree(mod->refptr); #endif module_free(mod, mod->module_init); -- cgit v1.2.3-59-g8ed1b From 414fd31b2553aaf160ca9b9afe45aa0372b01b92 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Fri, 5 Dec 2008 19:03:56 -0500 Subject: module: Make find_symbol return a struct kernel_symbol Impact: Cleanup, internal API change Ksplice needs access to the kernel_symbol structure in order to support modifications to the exported symbol table. 
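(For reference only, not part of this patch: the exported-symbol entry that find_symbol() now hands back is the small descriptor recorded by EXPORT_SYMBOL(); in kernels of this era it looks roughly like the sketch below, so returning the whole entry gives a caller such as Ksplice both the name and the value rather than the value alone.)

/*
 * Illustrative sketch of the exported-symbol descriptor, roughly as
 * declared in include/linux/module.h around this time; shown here only
 * for context, it is not introduced by this patch.
 */
struct kernel_symbol {
	unsigned long value;	/* address of the exported object */
	const char *name;	/* name passed to EXPORT_SYMBOL() */
};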
Cc: Anders Kaseorg Cc: Jeff Arnold Signed-off-by: Tim Abbott Signed-off-by: Rusty Russell (bugfix and style) --- kernel/module.c | 75 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 7af72bbe4cc0..2f0fddf3c114 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -283,7 +283,7 @@ struct find_symbol_arg { /* Output */ struct module *owner; const unsigned long *crc; - unsigned long value; + const struct kernel_symbol *sym; }; static bool find_symbol_in_section(const struct symsearch *syms, @@ -324,17 +324,17 @@ static bool find_symbol_in_section(const struct symsearch *syms, fsa->owner = owner; fsa->crc = symversion(syms->crcs, symnum); - fsa->value = syms->start[symnum].value; + fsa->sym = &syms->start[symnum]; return true; } -/* Find a symbol, return value, (optional) crc and (optional) module - * which owns it */ -static unsigned long find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +/* Find a symbol and return it, along with, (optional) crc and + * (optional) module which owns it */ +static const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) { struct find_symbol_arg fsa; @@ -347,11 +347,11 @@ static unsigned long find_symbol(const char *name, *owner = fsa.owner; if (crc) *crc = fsa.crc; - return fsa.value; + return fsa.sym; } DEBUGP("Failed to find symbol %s\n", name); - return -ENOENT; + return NULL; } /* Search for module by name: must hold module_mutex. */ @@ -894,7 +894,7 @@ void __symbol_put(const char *symbol) struct module *owner; preempt_disable(); - if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) + if (!find_symbol(symbol, &owner, NULL, true, false)) BUG(); module_put(owner); preempt_enable(); @@ -1057,7 +1057,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) + if (!find_symbol("struct_module", NULL, &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "struct_module", mod, crc); } @@ -1098,25 +1098,25 @@ static inline int same_magic(const char *amagic, const char *bmagic, /* Resolve a symbol for this module. I.e. if we find one, record usage. Must be holding module_mutex. 
*/ -static unsigned long resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) +static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) { struct module *owner; - unsigned long ret; + const struct kernel_symbol *sym; const unsigned long *crc; - ret = find_symbol(name, &owner, &crc, + sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - if (!IS_ERR_VALUE(ret)) { - /* use_module can fail due to OOM, - or module initialization or unloading */ + /* use_module can fail due to OOM, + or module initialization or unloading */ + if (sym) { if (!check_version(sechdrs, versindex, name, mod, crc) || !use_module(mod, owner)) - ret = -EINVAL; + sym = NULL; } - return ret; + return sym; } /* @@ -1516,17 +1516,15 @@ static void free_module(struct module *mod) void *__symbol_get(const char *symbol) { struct module *owner; - unsigned long value; + const struct kernel_symbol *sym; preempt_disable(); - value = find_symbol(symbol, &owner, NULL, true, true); - if (IS_ERR_VALUE(value)) - value = 0; - else if (strong_try_module_get(owner)) - value = 0; + sym = find_symbol(symbol, &owner, NULL, true, true); + if (sym && strong_try_module_get(owner)) + sym = NULL; preempt_enable(); - return (void *)value; + return sym ? (void *)sym->value : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -1554,8 +1552,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (!IS_ERR_VALUE(find_symbol(s->name, &owner, - NULL, true, false))) { + if (find_symbol(s->name, &owner, NULL, true, false)) { printk(KERN_ERR "%s: exports duplicate symbol %s" " (owned by %s)\n", @@ -1579,6 +1576,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, unsigned long secbase; unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); int ret = 0; + const struct kernel_symbol *ksym; for (i = 1; i < n; i++) { switch (sym[i].st_shndx) { @@ -1598,13 +1596,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - sym[i].st_value - = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); - + ksym = resolve_symbol(sechdrs, versindex, + strtab + sym[i].st_name, mod); /* Ok if resolved. */ - if (!IS_ERR_VALUE(sym[i].st_value)) + if (ksym) { + sym[i].st_value = ksym->value; break; + } + /* Ok if weak. */ if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; -- cgit v1.2.3-59-g8ed1b From e610499e2656e61975affd0af56b26eb73964c84 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: __module_address Impact: New API, cleanup ksplice wants to know the bounds of a module, not just the module text. It makes sense to have __module_address. We then implement is_module_address and __module_text_address in terms of this (and change is_module_text_address() to bool while we're at it). Also, add proper kerneldoc for them all. 
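As a rough usage sketch (illustrative only, not from the patch; report_module_owner() is a made-up name): is_module_address() and is_module_text_address() handle preemption themselves, while the __-prefixed variants return the module and so need the caller to keep preemption off (or hold module_mutex) for as long as the pointer is used:

	static void report_module_owner(unsigned long addr)
	{
		struct module *mod;

		if (!is_module_address(addr))
			return;			/* not inside any module */

		preempt_disable();
		mod = __module_address(addr);	/* module core or init region */
		if (mod && __module_text_address(addr))
			printk(KERN_INFO "%#lx is code in %s\n", addr, mod->name);
		preempt_enable();
	}
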
Cc: Anders Kaseorg Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 20 +++++++++---- kernel/module.c | 76 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 08e5e75d6122..fd1241e1416f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -365,7 +365,9 @@ static inline int module_is_live(struct module *mod) /* Is this address in a module? (second is with no locks, for oops) */ struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); -int is_module_address(unsigned long addr); +struct module *__module_address(unsigned long addr); +bool is_module_address(unsigned long addr); +bool is_module_text_address(unsigned long addr); static inline int within_module_core(unsigned long addr, struct module *mod) { @@ -494,21 +496,29 @@ search_module_extables(unsigned long addr) return NULL; } -/* Is this address in a module? */ static inline struct module *module_text_address(unsigned long addr) { return NULL; } -/* Is this address in a module? (don't take a lock, we're oopsing) */ +static inline struct module *__module_address(unsigned long addr) +{ + return NULL; +} + static inline struct module *__module_text_address(unsigned long addr) { return NULL; } -static inline int is_module_address(unsigned long addr) +static inline bool is_module_address(unsigned long addr) { - return 0; + return false; +} + +static inline bool is_module_text_address(unsigned long addr) +{ + return false; } /* Get/put a kernel symbol (calls should be symmetric) */ diff --git a/kernel/module.c b/kernel/module.c index 2f0fddf3c114..bd15a94f91c1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -76,7 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_text_address */ +/* Bounds of module allocation, for speeding __module_address */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -2745,29 +2745,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) } /* - * Is this a valid module address? + * is_module_address - is this address inside a module? + * @addr: the address to check. + * + * See is_module_text_address() if you simply want to see if the address + * is code (not data). */ -int is_module_address(unsigned long addr) +bool is_module_address(unsigned long addr) { - struct module *mod; + bool ret; preempt_disable(); - - list_for_each_entry_rcu(mod, &modules, list) { - if (within_module_core(addr, mod)) { - preempt_enable(); - return 1; - } - } - + ret = __module_address(addr) != NULL; preempt_enable(); - return 0; + return ret; } - -/* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +/* + * __module_address - get the module which contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. 
+ */ +__notrace_funcgraph struct module *__module_address(unsigned long addr) { struct module *mod; @@ -2775,12 +2777,50 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr) return NULL; list_for_each_entry_rcu(mod, &modules, list) - if (within(addr, mod->module_init, mod->init_text_size) - || within(addr, mod->module_core, mod->core_text_size)) + if (within_module_core(addr, mod) + || within_module_init(addr, mod)) return mod; return NULL; } +/* + * is_module_text_address - is this address inside module code? + * @addr: the address to check. + * + * See is_module_address() if you simply want to see if the address is + * anywhere in a module. See kernel_text_address() for testing if an + * address corresponds to kernel or module code. + */ +bool is_module_text_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_text_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/* + * __module_text_address - get the module whose code contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod = __module_address(addr); + if (mod) { + /* Make sure it's within the text section. */ + if (!within(addr, mod->module_init, mod->init_text_size) + && !within(addr, mod->module_core, mod->core_text_size)) + mod = NULL; + } + return mod; +} + struct module *module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3-59-g8ed1b From a6e6abd575fcbe6572ebc7a70ad616406d206fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:31 -0600 Subject: module: remove module_text_address() Impact: Replace and remove risky (non-EXPORTed) API module_text_address() returns a pointer to the module, which given locking improvements in module.c, is useless except to test for NULL: 1) If the module can't go away, use __module_text_address. 2) Otherwise, just use is_module_text_address(). Cc: linux-mtd@lists.infradead.org Signed-off-by: Rusty Russell --- drivers/mtd/nand/nand_base.c | 4 ++-- include/linux/module.h | 7 ------- kernel/extable.c | 6 +++--- kernel/module.c | 17 ++++------------- 4 files changed, 9 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0c3afccde8a2..5f71371eb1b0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2720,14 +2720,14 @@ int nand_scan_tail(struct mtd_info *mtd) return chip->scan_bbt(mtd); } -/* module_text_address() isn't exported, and it's mostly a pointless +/* is_module_text_address() isn't exported, and it's mostly a pointless test if this is a module _anyway_ -- they'd have to try _really_ hard to call us from in-kernel code if the core NAND support is modular. */ #ifdef MODULE #define caller_is_module() (1) #else #define caller_is_module() \ - module_text_address((unsigned long)__builtin_return_address(0)) + is_module_text_address((unsigned long)__builtin_return_address(0)) #endif /** diff --git a/include/linux/module.h b/include/linux/module.h index fd1241e1416f..69761ce0dbf0 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -362,8 +362,6 @@ static inline int module_is_live(struct module *mod) return mod->state != MODULE_STATE_GOING; } -/* Is this address in a module? 
(second is with no locks, for oops) */ -struct module *module_text_address(unsigned long addr); struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); @@ -496,11 +494,6 @@ search_module_extables(unsigned long addr) return NULL; } -static inline struct module *module_text_address(unsigned long addr) -{ - return NULL; -} - static inline struct module *__module_address(unsigned long addr) { return NULL; diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..384f0da8a03e 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -58,14 +58,14 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + return is_module_text_address(addr); } int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } /* @@ -81,5 +81,5 @@ int func_ptr_is_kernel_text(void *ptr) addr = (unsigned long) dereference_function_descriptor(ptr); if (core_kernel_text(addr)) return 1; - return module_text_address(addr) != NULL; + return is_module_text_address(addr); } diff --git a/kernel/module.c b/kernel/module.c index bd15a94f91c1..8ddca629e079 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -908,8 +908,10 @@ void symbol_put_addr(void *addr) if (core_kernel_text((unsigned long)addr)) return; - if (!(modaddr = module_text_address((unsigned long)addr))) - BUG(); + /* module_text_address is safe here: we're supposed to have reference + * to module from symbol_get, so it can't go away. */ + modaddr = __module_text_address((unsigned long)addr); + BUG_ON(!modaddr); module_put(modaddr); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -2821,17 +2823,6 @@ struct module *__module_text_address(unsigned long addr) return mod; } -struct module *module_text_address(unsigned long addr) -{ - struct module *mod; - - preempt_disable(); - mod = __module_text_address(addr); - preempt_enable(); - - return mod; -} - /* Don't grab lock, we're oopsing. */ void print_modules(void) { -- cgit v1.2.3-59-g8ed1b From 75a66614db21007bcc8c37f9c5d5b922981387b9 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Fri, 5 Dec 2008 19:03:58 -0500 Subject: Ksplice: Add functions for walking kallsyms symbols Impact: New API kallsyms_lookup_name only returns the first match that it finds. Ksplice needs information about all symbols with a given name in order to correctly resolve local symbols. kallsyms_on_each_symbol provides a generic mechanism for iterating over the kallsyms table. Cc: Jeff Arnold Cc: Tim Abbott Signed-off-by: Anders Kaseorg Signed-off-by: Rusty Russell --- include/linux/kallsyms.h | 15 +++++++++++++++ include/linux/module.h | 12 ++++++++++++ kernel/kallsyms.c | 19 +++++++++++++++++++ kernel/module.c | 19 +++++++++++++++++++ 4 files changed, 65 insertions(+) (limited to 'kernel') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index f3fe34391d8e..792274269f2b 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -13,10 +13,17 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct module; + #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. 
*/ unsigned long kallsyms_lookup_name(const char *name); +/* Call a function on each kallsyms symbol in the core kernel */ +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data); + extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); @@ -43,6 +50,14 @@ static inline unsigned long kallsyms_lookup_name(const char *name) return 0; } +static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/include/linux/module.h b/include/linux/module.h index 69761ce0dbf0..c3d3fc4ffb18 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -387,6 +387,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data); + extern void __module_put_and_exit(struct module *mod, long code) __attribute__((noreturn)); #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code); @@ -566,6 +570,14 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name) return 0; } +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, + unsigned long), + void *data) +{ + return 0; +} + static inline int register_module_notifier(struct notifier_block * nb) { /* no events will happen anyway, so this can always succeed */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7b8b0f21a5b1..374faf9bfdc7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name) return module_kallsyms_lookup_name(name); } +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data) +{ + char namebuf[KSYM_NAME_LEN]; + unsigned long i; + unsigned int off; + int ret; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + if (ret != 0) + return ret; + } + return module_kallsyms_on_each_symbol(fn, data); +} +EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); + static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/kernel/module.c b/kernel/module.c index 8ddca629e079..dd4389be9152 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2612,6 +2612,25 @@ unsigned long module_kallsyms_lookup_name(const char *name) preempt_enable(); return ret; } + +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret; + + list_for_each_entry(mod, &modules, list) { + for (i = 0; i < mod->num_symtab; i++) { + ret = fn(data, mod->strtab + mod->symtab[i].st_name, + mod, mod->symtab[i].st_value); + if (ret != 0) + return ret; + } + } + return 0; +} #endif /* CONFIG_KALLSYMS */ static char *module_flags(struct module *mod, char *buf) -- cgit v1.2.3-59-g8ed1b From c6b37801911d7f4663c99cad8aa230bc934cea82 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Fri, 5 Dec 2008 19:03:59 -0500 Subject: module: Export symbols needed for Ksplice 
Impact: Expose some module.c symbols Ksplice uses several functions from module.c in order to resolve symbols and implement dependency handling. Calling these functions requires holding module_mutex, so it is exported. (This is just the module part of a bigger add-exports patch from Tim). Cc: Anders Kaseorg Cc: Jeff Arnold Signed-off-by: Tim Abbott Signed-off-by: Rusty Russell --- include/linux/module.h | 28 ++++++++++++++++++++++++++++ kernel/module.c | 43 +++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index c3d3fc4ffb18..d246da0b0f8c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -354,6 +354,8 @@ struct module #define MODULE_ARCH_INIT {} #endif +extern struct mutex module_mutex; + /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ @@ -379,6 +381,31 @@ static inline int within_module_init(unsigned long addr, struct module *mod) addr < (unsigned long)mod->module_init + mod->init_size; } +/* Search for module by name: must hold module_mutex. */ +struct module *find_module(const char *name); + +struct symsearch { + const struct kernel_symbol *start, *stop; + const unsigned long *crcs; + enum { + NOT_GPL_ONLY, + GPL_ONLY, + WILL_BE_GPL_ONLY, + } licence; + bool unused; +}; + +/* Search for an exported symbol by name. */ +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn); + +/* Walk the exported symbol table */ +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data); + /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, @@ -452,6 +479,7 @@ static inline void __module_get(struct module *module) #define symbol_put_addr(p) do { } while(0) #endif /* CONFIG_MODULE_UNLOAD */ +int use_module(struct module *a, struct module *b); /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ diff --git a/kernel/module.c b/kernel/module.c index dd4389be9152..5fd00766a4dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -68,7 +68,8 @@ /* List of modules, protected by module_mutex or preempt_disable * (delete uses stop_machine/add uses RCU list operations). */ -static DEFINE_MUTEX(module_mutex); +DEFINE_MUTEX(module_mutex); +EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); /* Waiting for a module to finish initializing? */ @@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[]; #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) #endif -struct symsearch { - const struct kernel_symbol *start, *stop; - const unsigned long *crcs; - enum { - NOT_GPL_ONLY, - GPL_ONLY, - WILL_BE_GPL_ONLY, - } licence; - bool unused; -}; - static bool each_symbol_in_section(const struct symsearch *arr, unsigned int arrsize, struct module *owner, @@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr, } /* Returns true as soon as fn returns true, otherwise false. 
*/ -static bool each_symbol(bool (*fn)(const struct symsearch *arr, - struct module *owner, - unsigned int symnum, void *data), - void *data) +bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, + unsigned int symnum, void *data), void *data) { struct module *mod; const struct symsearch arr[] = { @@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr, } return false; } +EXPORT_SYMBOL_GPL(each_symbol); struct find_symbol_arg { /* Input */ @@ -330,11 +319,11 @@ static bool find_symbol_in_section(const struct symsearch *syms, /* Find a symbol and return it, along with, (optional) crc and * (optional) module which owns it */ -static const struct kernel_symbol *find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - bool gplok, - bool warn) +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) { struct find_symbol_arg fsa; @@ -353,9 +342,10 @@ static const struct kernel_symbol *find_symbol(const char *name, DEBUGP("Failed to find symbol %s\n", name); return NULL; } +EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -static struct module *find_module(const char *name) +struct module *find_module(const char *name) { struct module *mod; @@ -365,6 +355,7 @@ static struct module *find_module(const char *name) } return NULL; } +EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b) } /* Module a uses b */ -static int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { struct module_use *use; int no_warn, err; @@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b) no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); return 1; } +EXPORT_SYMBOL_GPL(use_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) @@ -951,10 +943,11 @@ static inline void module_unload_free(struct module *mod) { } -static inline int use_module(struct module *a, struct module *b) +int use_module(struct module *a, struct module *b) { return strong_try_module_get(b) == 0; } +EXPORT_SYMBOL_GPL(use_module); static inline void module_unload_init(struct module *mod) { @@ -2803,6 +2796,7 @@ __notrace_funcgraph struct module *__module_address(unsigned long addr) return mod; return NULL; } +EXPORT_SYMBOL_GPL(__module_address); /* * is_module_text_address - is this address inside module code? @@ -2841,6 +2835,7 @@ struct module *__module_text_address(unsigned long addr) } return mod; } +EXPORT_SYMBOL_GPL(__module_text_address); /* Don't grab lock, we're oopsing. */ void print_modules(void) -- cgit v1.2.3-59-g8ed1b From c6e665c8f0c18ab3686117905765b5139efd6ebd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:33 -0600 Subject: module: clarify the force-loading taint message. Impact: Message cleanup Two of three callers of try_to_force_load() are not because of a missing version, so change the messages: Old: : no version for "magic" found: kernel tainted. New: : bad vermagic: kernel tainted. Old: : no version for "nocrc" found: kernel tainted. New: : no versions for exported symbols: kernel tainted. Old: : no version for "" found: kernel tainted. New: : : kernel tainted. 
Signed-off-by: Rusty Russell --- kernel/module.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5fd00766a4dc..599fc85bd2cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -990,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = { static const char vermagic[] = VERMAGIC_STRING; -static int try_to_force_load(struct module *mod, const char *symname) +static int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) - printk("%s: no version for \"%s\" found: kernel tainted.\n", - mod->name, symname); + printk(KERN_WARNING "%s: %s: kernel tainted.\n", + mod->name, reason); add_taint_module(mod, TAINT_FORCED_MODULE); return 0; #else @@ -2002,7 +2002,7 @@ static noinline struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - err = try_to_force_load(mod, "magic"); + err = try_to_force_load(mod, "bad vermagic"); if (err) goto free_hdr; } else if (!same_magic(modmagic, vermagic, versindex)) { @@ -2191,8 +2191,8 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); - err = try_to_force_load(mod, "nocrc"); + err = try_to_force_load(mod, + "no versions for exported symbols"); if (err) goto cleanup; } -- cgit v1.2.3-59-g8ed1b From 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:33 -0600 Subject: module: remove the SHF_ALLOC flag on the __versions section. Impact: reduce kernel memory usage This patch just takes off the SHF_ALLOC flag on __versions so we don't keep them around after module load. This saves about 7% of module memory if CONFIG_MODVERSIONS=y. Cc: Shawn Bohrer Signed-off-by: Rusty Russell --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 599fc85bd2cc..784bf6d8f8c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1961,6 +1961,9 @@ static noinline struct module *load_module(void __user *umod, if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; #endif + /* Don't keep __versions around; it's just for loading. */ + if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0) + sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; } modindex = find_sec(hdr, sechdrs, secstrings, -- cgit v1.2.3-59-g8ed1b From 8c8ef42aee8fcfb4128bb94c50d55c9f80ade525 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:34 -0600 Subject: module: include other structures in module version check With CONFIG_MODVERSIONS, we version 'struct module' using a dummy export, but other things matter too: 1) 'struct modversion_info' determines the layout of the __versions section, 2) 'struct kernel_param' determines the layout of the __params section, 3) 'struct kernel_symbol' determines __ksymtab*. 4) 'struct marker' determines __markers. 5) 'struct tracepoint' determines __tracepoints. So we rename 'struct_module' to 'module_layout' and include these in the signature. Now it's general we can add others later on without confusion. 
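Illustrative sketch of how a further structure could be folded into the check under this scheme (struct new_structure is hypothetical, not something this patch adds): growing the dummy function's signature changes the modversions CRC of module_layout, so modules built against the old layout refuse to load.

	void module_layout(struct module *mod,
			   struct modversion_info *ver,
			   struct kernel_param *kp,
			   struct kernel_symbol *ks,
			   struct marker *marker,
			   struct tracepoint *tp,
			   struct new_structure *ns)	/* hypothetical later addition */
	{
	}
	EXPORT_SYMBOL(module_layout);
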
Signed-off-by: Rusty Russell --- kernel/module.c | 18 +++++++++++++----- scripts/mod/modpost.c | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 784bf6d8f8c4..e8cf636e614b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1052,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; - if (!find_symbol("struct_module", NULL, &crc, true, false)) + if (!find_symbol("module_layout", NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "struct_module", mod, crc); + return check_version(sechdrs, versindex, "module_layout", mod, crc); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -2858,9 +2858,17 @@ void print_modules(void) } #ifdef CONFIG_MODVERSIONS -/* Generate the signature for struct module here, too, for modversions. */ -void struct_module(struct module *mod) { return; } -EXPORT_SYMBOL(struct_module); +/* Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct marker *marker, + struct tracepoint *tp) +{ +} +EXPORT_SYMBOL(module_layout); #endif #ifdef CONFIG_MARKERS diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 7e62303133dc..8cc70612984c 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1607,12 +1607,12 @@ static void read_symbols(char *modname) parse_elf_finish(&info); - /* Our trick to get versioning for struct_module - it's + /* Our trick to get versioning for module struct etc. - it's * never passed as an argument to an exported function, so * the automatic versioning doesn't pick it up, but it's really * important anyhow */ if (modversions) - mod->unres = alloc_symbol("struct_module", 0, mod->unres); + mod->unres = alloc_symbol("module_layout", 0, mod->unres); } #define SZ 500 -- cgit v1.2.3-59-g8ed1b From acae05156551fd7528fbb616271e672789388e3c Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sun, 8 Feb 2009 10:42:01 -0800 Subject: module: create a request_module_nowait() There seems to be a common pattern in the kernel where drivers want to call request_module() from inside a module_init() function. Currently this would deadlock. As a result, several drivers go through hoops like scheduling things via kevent, or creating custom work queues (because kevent can deadlock on them). This patch changes this to use a request_module_nowait() function macro instead, which just fires the modprobe off but doesn't wait for it, and thus avoids the original deadlock entirely. On my laptop this already results in one less kernel thread running.. (Includes Jiri's patch to use enum umh_wait) Signed-off-by: Arjan van de Ven Signed-off-by: Rusty Russell (bool-ified) Cc: Jiri Slaby --- include/linux/kmod.h | 11 ++++++++--- kernel/kmod.c | 10 ++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 92213a9194e1..d5fa565086d1 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -29,10 +29,15 @@ #ifdef CONFIG_MODULES /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ -extern int request_module(const char * name, ...) __attribute__ ((format (printf, 1, 2))); -#define try_then_request_module(x, mod...) 
((x) ?: (request_module(mod), (x))) +extern int __request_module(bool wait, const char *name, ...) \ + __attribute__((format(printf, 2, 3))); +#define request_module(mod...) __request_module(true, mod) +#define request_module_nowait(mod...) __request_module(false, mod) +#define try_then_request_module(x, mod...) \ + ((x) ?: (__request_module(false, mod), (x))) #else -static inline int request_module(const char * name, ...) { return -ENOSYS; } +static inline int request_module(const char *name, ...) { return -ENOSYS; } +static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; } #define try_then_request_module(x, mod...) (x) #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index f0c8f545180d..b750675251e5 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq; char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; /** - * request_module - try to load a kernel module + * __request_module - try to load a kernel module + * @wait: wait (or not) for the operation to complete * @fmt: printf style format string for the name of the module * @...: arguments as specified in the format string * @@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; * If module auto-loading support is disabled then this function * becomes a no-operation. */ -int request_module(const char *fmt, ...) +int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; @@ -108,11 +109,12 @@ int request_module(const char *fmt, ...) return -ENOMEM; } - ret = call_usermodehelper(modprobe_path, argv, envp, 1); + ret = call_usermodehelper(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; } -EXPORT_SYMBOL(request_module); +EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ struct subprocess_info { -- cgit v1.2.3-59-g8ed1b From e91defa26c527ceeaff6266c55cdc7e17c9081a2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:35 -0600 Subject: module: don't use stop_machine on module load Kay Sievers discovered that boot times are slowed by about half a second because all the stop_machine_create() calls, and he only probes about 40 modules (I have 125 loaded on this laptop). We only do stop_machine_create() so we can unlink the module if something goes wrong, but it's overkill (and buggy anyway: if stop_machine_create() fails we still call stop_machine_destroy()). Since we are only protecting against kallsyms (esp. oops) walking the list, synchronize_sched() is sufficient (synchronize_rcu() is probably sufficient, but we're not in a hurry). Kay says of this patch: ... no module takes more than 40 millisecs to link now, most of them are between 3 and 8 millisecs. That looks very different to the numbers without this patch and the otherwise same setup, where we get heavy noise in the traces and many delays of up to 200 millisecs until linking, most of them taking 30+ millisecs. 
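The synchronisation this relies on, sketched for clarity (illustrative, mirroring code already in module.c rather than adding anything new; do_something_with() is a placeholder): list readers only need preemption disabled, so the failed-load path can unlink with list_del_rcu() and wait with synchronize_sched() before freeing, instead of scheduling a stop_machine run:

	/* Reader side (e.g. kallsyms / oops path): */
	preempt_disable();
	list_for_each_entry_rcu(mod, &modules, list)
		do_something_with(mod);		/* placeholder */
	preempt_enable();

	/* Error path in load_module() after this patch: */
	list_del_rcu(&mod->list);
	synchronize_sched();	/* wait for every preempt-off reader to finish */
	module_arch_cleanup(mod);
	/* ...now the module's memory can be freed safely. */
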
Tested-by: Kay Sievers Signed-off-by: Rusty Russell --- kernel/module.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e8cf636e614b..1a9a3986b136 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1912,12 +1912,6 @@ static noinline struct module *load_module(void __user *umod, if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return ERR_PTR(-ENOMEM); - /* Create stop_machine threads since the error path relies on - * a non-failing stop_machine call. */ - err = stop_machine_create(); - if (err) - goto free_hdr; - if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -2303,12 +2297,13 @@ static noinline struct module *load_module(void __user *umod, /* Get rid of temporary copy */ vfree(hdr); - stop_machine_destroy(); /* Done! */ return mod; unlink: - stop_machine(__unlink_module, mod, NULL); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + synchronize_sched(); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); @@ -2331,7 +2326,6 @@ static noinline struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); - stop_machine_destroy(); return ERR_PTR(err); truncated: -- cgit v1.2.3-59-g8ed1b From 49502677e11079c2e3e01867c922a894ce06a8be Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 31 Mar 2009 13:05:36 -0600 Subject: module: use strstarts() Impact: minor cleanup. I'm not going to neaten anyone else's code, but I'm happy to clean up my own. Signed-off-by: Rusty Russell --- kernel/module.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1a9a3986b136..f6e08b7cff7c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1673,8 +1673,7 @@ static void layout_sections(struct module *mod, if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strncmp(secstrings + s->sh_name, - ".init", 5) == 0) + || strstarts(secstrings + s->sh_name, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", secstrings + s->sh_name); @@ -1691,8 +1690,7 @@ static void layout_sections(struct module *mod, if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strncmp(secstrings + s->sh_name, - ".init", 5) != 0) + || !strstarts(secstrings + s->sh_name, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); @@ -1825,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym, else return 'b'; } - if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, - ".debug", strlen(".debug")) == 0) + if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) return 'n'; return '?'; } @@ -1952,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod, } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) + if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; #endif /* Don't keep __versions around; it's just for loading. 
*/ -- cgit v1.2.3-59-g8ed1b From 7f1e2ca9f04b02794597f60e7b1d43f0a1317939 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:27 +0100 Subject: hrtimer: fix rq->lock inversion (again) It appears I inadvertly introduced rq->lock recursion to the hrtimer_start() path when I delegated running already expired timers to softirq context. This patch fixes it by introducing a __hrtimer_start_range_ns() method that will not use raise_softirq_irqoff() but __raise_softirq_irqoff() which avoids the wakeup. It then also changes schedule() to check for pending softirqs and do the wakeup then, I'm not quite sure I like this last bit, nor am I convinced its really needed. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: paulus@samba.org LKML-Reference: <20090313112301.096138802@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 +++++ include/linux/interrupt.h | 1 + kernel/hrtimer.c | 55 +++++++++++++++++++++++++++++------------------ kernel/sched.c | 14 +++++++++--- kernel/softirq.c | 2 +- 5 files changed, 52 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index bd37078c2d7d..0d2f7c8a33d6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -336,6 +336,11 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode); extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); +extern int +__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, + const enum hrtimer_mode mode, int wakeup); + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c68bffd182bb..4528bf70866a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -294,6 +294,7 @@ extern void softirq_init(void); #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void wakeup_softirqd(void); /* This is the worklist that queues up per-cpu softirq work. * diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f394d2a42ca3..cb8a15c19583 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) * and expiry check is done in the hrtimer_interrupt or in the softirq. 
*/ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + if (wakeup) { + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); + } else + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 1; } + return 0; } @@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { return 0; } @@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, - const enum hrtimer_mode mode) +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n * XXX send_remote_softirq() ? 
*/ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base); + hrtimer_enqueue_reprogram(timer, new_base, wakeup); unlock_hrtimer_base(timer, &flags); return ret; } + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start); diff --git a/kernel/sched.c b/kernel/sched.c index 196d48babbef..63256e3ede2a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) spin_lock(&rt_b->rt_runtime_lock); for (;;) { + unsigned long delta; + ktime_t soft, hard; + if (hrtimer_active(&rt_b->rt_period_timer)) break; now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); + + soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); + hard = hrtimer_get_expires(&rt_b->rt_period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, + HRTIMER_MODE_ABS, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1146,7 +1153,8 @@ static __init void init_hrtick(void) */ static void hrtick_start(struct rq *rq, u64 delay) { - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL, 0); } static inline void init_hrtick(void) diff --git a/kernel/softirq.c b/kernel/softirq.c index 487751604300..accc85197c49 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd); -- cgit v1.2.3-59-g8ed1b From eedeeabdeeadb016b8c783e3620d06b98d0cb4e1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Mar 2009 12:38:47 +0100 Subject: lockdep: add stack dumps to asserts Have a better idea about exactly which loc causes a lockdep limit overflow. Often it's a bug or inefficiency in that subsystem. 
Signed-off-by: Peter Zijlstra LKML-Reference: <1237376327.5069.253.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 981cd4854281..a288ae107b50 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -792,6 +792,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return NULL; } class = lock_classes + nr_lock_classes++; @@ -855,6 +856,7 @@ static struct lock_list *alloc_list_entry(void) printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return NULL; } return list_entries + nr_list_entries++; @@ -1681,6 +1683,7 @@ cache_hit: printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } chain = lock_chains + nr_lock_chains++; @@ -2540,6 +2543,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_locks_off(); printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } @@ -2636,6 +2640,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_locks_off(); printk("BUG: MAX_LOCK_DEPTH too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } -- cgit v1.2.3-59-g8ed1b From 6c051ce0307526adec32a847f0daa1af2124f0a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:18:56 +0800 Subject: blktrace: fix timestamp in binary output I found the timestamp is wrong: # echo bin > trace_option # echo blk > current_tracer # cat trace_pipe | blkparse -i - 8,0 0 0 0.000000000 504 A W ... ... 8,7 1 0 0.008534097 0 C R ... (should be 8.534097xxx) user-space blkparse expects the timestamp to be nanosecond. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6fb274f5f34e..ee7a8bb8b1e8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1168,7 +1168,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) const int offset = offsetof(struct blk_io_trace, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), + .time = iter->ts, }; if (!trace_seq_putmem(s, &old, offset)) -- cgit v1.2.3-59-g8ed1b From b5230b56ee6caeb27cedb7753c0c319646383bb4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:19:33 +0800 Subject: blktrace: fix a race when creating blk_tree_root in debugfs t1 t2 ------ ------ do_blk_trace_setup() do_blk_trace_setup() if (!blk_tree_root) { if (!blk_tree_root) blk_tree_root = create_dir() blk_tree_root = create_dir(); (now blk_tree_root == NULL) ... dir = create_dir(name, blk_tree_root); Due to this race, t1 will create 'dir' in /debugfs but not /debugfs/block. 
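The shape of the fix, as a small sketch (illustrative; blk_root_get() is a made-up helper name): the check and the create are both done under blk_tree_mutex, so a racing setup can no longer overwrite blk_tree_root with NULL and leave another device's directory created at the top of debugfs:

	static struct dentry *blk_root_get(void)
	{
		struct dentry *root;

		mutex_lock(&blk_tree_mutex);
		if (!blk_tree_root)
			blk_tree_root = debugfs_create_dir("block", NULL);
		root = blk_tree_root;
		mutex_unlock(&blk_tree_mutex);

		return root;	/* NULL if debugfs creation failed */
	}
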
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ee7a8bb8b1e8..95f89faca73e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -426,11 +426,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; + mutex_lock(&blk_tree_mutex); if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) + if (!blk_tree_root) { + mutex_unlock(&blk_tree_mutex); goto err; + } } + mutex_unlock(&blk_tree_mutex); dir = debugfs_create_dir(buts->name, blk_tree_root); -- cgit v1.2.3-59-g8ed1b From 5554720482a631702146a959db22fe417740e0a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 25 Mar 2009 17:21:26 +0800 Subject: blktrace: fix the original blktrace Currently the original blktrace, which is using relay and is used via ioctl, is broken. You can use ftrace to see the output of blktrace, but user-space blktrace is unusable. It's broken by "blktrace: add ftrace plugin" (c71a896154119f4ca9e89d6078f5f63ad60ef199) - if (unlikely(bt->trace_state != Blktrace_running)) + if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) return; With this patch, both ioctl and ftrace can be used, but of course you can't use both of them at the same time. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 95f89faca73e..a7f7ff5db38b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -110,7 +110,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 
unsigned long flags; char *buf; - if (blk_tr) { + if (blk_tracer_enabled) { va_start(args, fmt); ftrace_vprintk(fmt, args); va_end(args); @@ -169,7 +169,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running || + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer_enabled)) return; @@ -185,7 +185,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - if (blk_tr) { + if (blk_tracer_enabled) { tracing_record_cmdline(current); pc = preempt_count(); @@ -235,7 +235,7 @@ record_it: if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - if (blk_tr) { + if (blk_tracer_enabled) { trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } @@ -267,8 +267,7 @@ int blk_trace_remove(struct request_queue *q) if (!bt) return -EINVAL; - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) + if (bt->trace_state != Blktrace_running) blk_trace_cleanup(bt); return 0; @@ -1273,7 +1272,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) bt->dev = dev; bt->act_mask = (u16)-1; bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { -- cgit v1.2.3-59-g8ed1b From eb08f8eb0673d9c1e62b69ad1b41593e73c40467 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:27 +0800 Subject: blktrace: fix off-by-one bug 'what' is used as the index of array what2act, so it can't >= the array size. Signed-off-by: Li Zefan Acked-by: Jens Axboe Acked-by: Arnaldo Carvalho de Melo Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a7f7ff5db38b..d43cdac7c754 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1152,7 +1152,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, if (!trace_print_context(iter)) return TRACE_TYPE_PARTIAL_LINE; - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); @@ -1199,7 +1199,7 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) t = (const struct blk_io_trace *)iter->ent; what = t->action & ((1 << BLK_TC_SHIFT) - 1); - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); else { const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); -- cgit v1.2.3-59-g8ed1b From 35ac51bfe4c293b67ce9f85082ba0b9bc6123c40 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:19:46 +0800 Subject: blktrace: make classic output more classic Impact: fix ftrace plugin timestamp output In the classic user-space blktrace, the output timestamp is sec.nsec not sec.usec. 
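A worked example of the conversion (the timestamp value is arbitrary, chosen only for illustration):

	unsigned long long ts = 8534097123ULL;		/* example value, in ns */
	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
	/* ts == 8, nsec_rem == 534097123				*/
	/* new format "%5d.%09lu"  ->  "    8.534097123"  (sec.nsec)	*/
	/* old code converted with ns2usecs() and printed "%5d.%06lu"
	 * ->  "    8.534097"  (sec.usec), which classic blkparse users
	 * do not expect.						*/
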
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d43cdac7c754..5b28f0f119c5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -994,8 +994,8 @@ static void get_pdu_remap(const struct trace_entry *ent, static int blk_log_action_iter(struct trace_iterator *iter, const char *act) { char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; const struct trace_entry *ent = iter->ent; const struct blk_io_trace *t = (const struct blk_io_trace *)ent; @@ -1003,9 +1003,9 @@ static int blk_log_action_iter(struct trace_iterator *iter, const char *act) fill_rwbs(rwbs, t); return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); + secs, nsec_rem, ent->pid, act, rwbs); } static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, -- cgit v1.2.3-59-g8ed1b From 17ba97e347bec9bbc47a0877c7a098708982129d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:20:09 +0800 Subject: blktrace: fix blk_probes_ref chaos Impact: fix mixed ioctl and ftrace-plugin blktrace use refcount bugs ioctl-based blktrace allocates bt and registers tracepoints when ioctl(BLKTRACESETUP), and do all cleanups when ioctl(BLKTRACETEARDOWN). while ftrace-based blktrace allocates/frees bt when: # echo 1/0 > /sys/block/sda/sda1/trace/enable and registers/unregisters tracepoints when: # echo blk/nop > /debugfs/tracing/current_tracer or # echo 1/0 > /debugfs/tracing/tracing_enable The separatation of allocation and registeration causes 2 problems: 1. current user-space blktrace still calls ioctl(TEARDOWN) when ioctl(SETUP) failed: # echo 1 > /sys/block/sda/sda1/trace/enable # blktrace /dev/sda BLKTRACESETUP: Device or resource busy ^C and now blk_probes_ref == -1 2. Another way to make blk_probes_ref == -1: # plugin sdb && mount sdb1 # echo 1 > /sys/block/sdb/sdb1/trace/enable # remove sdb This patch does the allocation and registeration when writing sdaX/trace/enable. 
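After this change both interfaces follow the same pairing, sketched here for clarity (this only restates the rule the patch enforces): the path that creates a blk_trace takes the reference and registers tracepoints on the 0->1 transition, and only the path that frees that blk_trace drops it, so the count can no longer go negative when setup fails or a device goes away with tracing enabled.

	/* setup path (ioctl BLKTRACESETUP, or writing sdaX/trace/enable): */
	if (atomic_inc_return(&blk_probes_ref) == 1)
		blk_register_tracepoints();

	/* teardown path, run by whoever owns the blk_trace it created: */
	if (atomic_dec_and_test(&blk_probes_ref))
		blk_unregister_tracepoints();
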
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5b28f0f119c5..8d6bd12aab10 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -478,7 +478,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } - if (atomic_add_return(1, &blk_probes_ref) == 1) + if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); return 0; @@ -1091,8 +1091,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - if (atomic_add_return(1, &blk_probes_ref) == 1) - blk_register_tracepoints(); trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1107,15 +1105,10 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { trace_flags |= TRACE_ITER_CONTEXT_INFO; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); } static void blk_tracer_reset(struct trace_array *tr) { - if (!atomic_read(&blk_probes_ref)) - return; - blk_tracer_enabled = false; blk_tracer_stop(tr); } @@ -1254,6 +1247,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + kfree(bt); return 0; } @@ -1280,6 +1276,9 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) return -EBUSY; } + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; } -- cgit v1.2.3-59-g8ed1b From ad5dd5493a55e462796e42e50a49e76df76fdb05 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:20:24 +0800 Subject: blktrace: fix memory leak when freeing struct blk_io_trace Impact: fix mixed ioctl and ftrace-plugin blktrace use memory leak When mixing the use of ioctl-based blktrace and ftrace-based blktrace, we can leak memory in this way: # btrace /dev/sda > /dev/null & # echo 0 > /sys/block/sda/sda1/trace/enable now we leak bt->dropped_file, bt->msg_file, bt->rchan... 
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8d6bd12aab10..2f21d7761600 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -247,7 +247,7 @@ record_it: static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); @@ -255,6 +255,11 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); +} + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + blk_trace_free(bt); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); } @@ -410,11 +415,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (buts->name[i] == '/') buts->name[i] = '_'; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; + ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); if (!bt->sequence) goto err; @@ -483,17 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return 0; err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } + blk_trace_free(bt); return ret; } @@ -1091,6 +1086,7 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { + blk_tracer_enabled = true; trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1098,18 +1094,17 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - blk_tracer_enabled = true; return 0; } static void blk_tracer_stop(struct trace_array *tr) { + blk_tracer_enabled = false; trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) { - blk_tracer_enabled = false; blk_tracer_stop(tr); } @@ -1250,7 +1245,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - kfree(bt); + blk_trace_free(bt); return 0; } -- cgit v1.2.3-59-g8ed1b From b6a4b0c3ad4c09c1d37b1040ac8e3ebd1016e10b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:23 +0800 Subject: blktrace: extract duplicate code Impact: cleanup blk_trace_event_print() and blk_tracer_print_line() share most of the code. text data bss dec hex filename 8605 393 12 9010 2332 kernel/trace/blktrace.o.orig text data bss dec hex filename 8555 393 12 8960 2300 kernel/trace/blktrace.o This patch also prepares for the next patch, which prints out BLK_TN_MESSAGE.
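The refactoring below routes the one divergent step, how the per-line prefix is printed, through a function pointer so the rest of the logic exists only once. A small stand-alone sketch of that shape (illustrative names and output, not the blktrace symbols):

#include <stdio.h>
#include <stdbool.h>

typedef int (log_action_t)(const char *act);

static int log_action_classic(const char *act)
{
        /* verbose, blk_classic-style prefix */
        return printf("8,7    1    0.000000001    %s ", act);
}

static int log_action_short(const char *act)
{
        /* compact prefix */
        return printf("8,7 %s ", act);
}

static int print_one_line(bool classic, const char *act, const char *payload)
{
        log_action_t *action = classic ? log_action_classic : log_action_short;

        if (action(act) < 0)
                return -1;
        return printf("%s\n", payload);
}

int main(void)
{
        print_one_line(true, "C", "write complete");
        print_one_line(false, "C", "write complete");
        return 0;
}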
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 62 ++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2f21d7761600..c103b0c20022 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -986,29 +986,31 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector = be64_to_cpu(sector); } -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); + +static int blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[6]; unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); return trace_seq_printf(&iter->seq, "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, ent->pid, act, rwbs); + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) +static int blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[6]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); } @@ -1129,22 +1131,25 @@ static const struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) { struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + const struct blk_io_trace *t; + u16 what; int ret; + bool long_act; + blk_log_action_t *log_action; - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; + t = te_blk_io_trace(iter->ent); + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + ret = log_action(iter, what2act[what].act[long_act]); if (ret) ret = what2act[what].print(s, iter->ent); } @@ -1152,6 +1157,15 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + return print_one_line(iter, false); +} + static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1177,26 +1191,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags) static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { - const struct blk_io_trace *t; - u16 what; - int ret; - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) return TRACE_TYPE_UNHANDLED; - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + return print_one_line(iter, true); } static struct tracer blk_tracer __read_mostly = { -- cgit v1.2.3-59-g8ed1b From 18cea4591a98817697017bcb056a848bae1205df Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:54 +0800 Subject: blktrace: print out BLK_TN_MESSAGE properly Impact: improve ftrace plugin output Before this patch: # cat trace make-5383 [001] 741.240059: 8,7 P N [make] __trace_note_message: cfq1074 # echo 1 > options/blk_classic # cat trace 8,7 1 0.692221252 0 C W 130411392 + 1024 [0] Bad pc action 6361 Bad pc action 283d # echo 0 > options/blk_classic # echo bin > trace_options # cat trace_pipe | blkparse -i - (can't parse messages generated by blk_add_trace_msg()) After this patch: # cat trace -0 [001] 187.600933: 8,7 C W 145220224 + 8 [0] -0 [001] 187.600946: 8,7 m N cfq1076 complete # echo 1 > options/blk_classic # cat trace 8,7 1 0.256378996 238 I W 113190728 + 8 [pdflush] 8,7 1 0.256378998 238 m N cfq1076 insert_request # echo 0 > options/blk_classic # echo bin > trace_options # cat trace_pipe | blkparse -i - 8,7 1 0 22.973250293 0 C W 102770576 + 8 [0] 8,7 1 0 22.973259213 0 m N cfq1076 complete Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 80 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c103b0c20022..947c5b3f90c4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -59,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, const void *data, size_t len) { struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + int pc = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + + if (blk_tracer) { + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } if (!bt->rchan) return; t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { - const int cpu = smp_processor_id(); - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); +record_it: t->device = bt->dev; t->action = action; t->pid = pid; t->cpu = cpu; 
t->pdu_len = len; memcpy((void *) t + sizeof(*t), data, len); + + if (blk_tracer) + trace_buffer_unlock_commit(blk_tr, event, 0, pc); } } @@ -110,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; - if (blk_tracer_enabled) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) + if (unlikely(bt->trace_state != Blktrace_running && + !blk_tracer_enabled)) return; local_irq_save(flags); @@ -168,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, unsigned long *sequence; pid_t pid; int cpu, pc = 0; + bool blk_tracer = blk_tracer_enabled; - if (unlikely(bt->trace_state != Blktrace_running && - !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; what |= ddir_act[rw & WRITE]; @@ -185,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - if (blk_tracer_enabled) { + if (blk_tracer) { tracing_record_cmdline(current); pc = preempt_count(); @@ -235,7 +246,7 @@ record_it: if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - if (blk_tracer_enabled) { + if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } @@ -922,6 +933,11 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; + if (t->action == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; else if (tc & BLK_TC_WRITE) @@ -939,7 +955,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) rwbs[i++] = 'S'; if (tc & BLK_TC_META) rwbs[i++] = 'M'; - +out: rwbs[i] = '\0'; } @@ -1074,6 +1090,17 @@ static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) get_pdu_int(ent), cmd); } +static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +{ + int ret; + const struct blk_io_trace *t = te_blk_io_trace(ent); + + ret = trace_seq_putmem(s, t + 1, t->pdu_len); + if (ret) + return trace_seq_putc(s, '\n'); + return ret; +} + /* * struct tracer operations */ @@ -1146,6 +1173,13 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, long_act = !!(trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + if (t->action == BLK_TN_MESSAGE) { + ret = log_action(iter, long_act ? "message" : "m"); + if (ret) + ret = blk_log_msg(s, iter->ent); + goto out; + } + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { @@ -1153,7 +1187,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, if (ret) ret = what2act[what].print(s, iter->ent); } - +out: return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -1253,11 +1287,16 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; + int ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) return -ENOMEM; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; + bt->dev = dev; bt->act_mask = (u16)-1; bt->end_lba = -1ULL; @@ -1265,14 +1304,17 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); - kfree(bt); - return -EBUSY; + ret = -EBUSY; + goto free_bt; } if (atomic_inc_return(&blk_probes_ref) == 1) blk_register_tracepoints(); - return 0; + +free_bt: + blk_trace_free(bt); + return ret; } /* -- cgit v1.2.3-59-g8ed1b From bdd6df6af98ce7e70702edfb5fd5dbbd8d1b0453 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:22 +0200 Subject: tracing: provide trace_seq_reserve() trace_seq_reserve() allows a caller to reserve space in a trace_seq and write directly into it. This makes it easier to export binary data to userspace via the tracing interface, by simply filling in a struct. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 13 +++++++++++++ kernel/trace/trace_output.h | 1 + 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd2455..6595074cbac5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -167,6 +167,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) return trace_seq_putmem(s, hex, j); } +void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + void *ret; + + if (len > ((PAGE_SIZE - 1) - s->len)) + return NULL; + + ret = s->buffer + s->len; + s->len += len; + + return ret; +} + int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a9..0ae20b83eec8 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -33,6 +33,7 @@ int trace_seq_puts(struct trace_seq *s, const char *str); int trace_seq_putc(struct trace_seq *s, unsigned char c); int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +void *trace_seq_reserve(struct trace_seq *s, size_t len); int trace_seq_path(struct trace_seq *s, struct path *path); int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); -- cgit v1.2.3-59-g8ed1b From f285901bb21355bb47106658ef14eeb6b8ed538f Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:23 +0200 Subject: tracing: add missing 'extern' keywords to trace_output.h Impact: cleanup Many declarations within trace_output.h are missing the 'extern' keyword in an inconsistent manner. This adds 'extern' where it should be. 
Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 0ae20b83eec8..46fb9612d884 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -29,25 +29,26 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); -int trace_seq_puts(struct trace_seq *s, const char *str); -int trace_seq_putc(struct trace_seq *s, unsigned char c); -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); -void *trace_seq_reserve(struct trace_seq *s, size_t len); -int trace_seq_path(struct trace_seq *s, struct path *path); -int seq_print_userip_objs(const struct userstack_entry *entry, - struct trace_seq *s, unsigned long sym_flags); -int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); +extern int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); -int trace_print_context(struct trace_iterator *iter); -int trace_print_lat_context(struct trace_iterator *iter); +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); -struct trace_event *ftrace_find_event(int type); -int register_ftrace_event(struct trace_event *event); -int unregister_ftrace_event(struct trace_event *event); +extern struct trace_event *ftrace_find_event(int type); +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3-59-g8ed1b From b14b70a6a4e394c9630bcde17e07d3bcdcbca27e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 10:21:00 +0800 Subject: trace: make argument 'mem' of trace_seq_putmem() const Impact: fix build warning I passed a const value to trace_seq_putmem(), and I got compile warning. 
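The warning class being fixed here is easy to reproduce in isolation: passing a pointer-to-const where a prototype takes a plain void * discards the qualifier, and gcc warns about it. A minimal sketch with made-up names, not the trace_seq code itself:

#include <stddef.h>
#include <string.h>

static char seq_buf[64];
static size_t seq_len;

static int putmem(void *mem, size_t len)        /* non-const parameter */
{
        memcpy(seq_buf + seq_len, mem, len);
        seq_len += len;
        return (int)len;
}

int emit(const void *payload, size_t len)
{
        /*
         * gcc warns here that the 'const' qualifier is discarded from the
         * pointer target type; the fix, as in the patch below, is to make
         * putmem() take 'const void *' rather than casting the const away.
         */
        return putmem(payload, len);
}

int main(void)
{
        return emit("abc", 3) == 3 ? 0 : 1;
}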
Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 6 +++--- kernel/trace/trace_output.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6595074cbac5..d72b9a63b247 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -137,7 +137,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) return 1; } -int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { if (len > ((PAGE_SIZE - 1) - s->len)) return 0; @@ -148,10 +148,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) { unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; + const unsigned char *data = mem; int i, j; #ifdef __BIG_ENDIAN diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 46fb9612d884..e0bde39c2dd9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -31,8 +31,9 @@ extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern int trace_seq_puts(struct trace_seq *s, const char *str); extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); extern void *trace_seq_reserve(struct trace_seq *s, size_t len); extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, -- cgit v1.2.3-59-g8ed1b From 11d06b2a1e5658f448a308aa3beb97bacd64a940 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 05:45:36 -0400 Subject: Kill unsharing fs_struct in __set_personality() That's a rudiment of altroot support. I.e. it should've been buried a long time ago. Signed-off-by: Al Viro --- kernel/exec_domain.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 667c841c2952..cb8e9626c215 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -145,28 +145,6 @@ __set_personality(u_long personality) return 0; } - if (atomic_read(¤t->fs->count) != 1) { - struct fs_struct *fsp, *ofsp; - - fsp = copy_fs_struct(current->fs); - if (fsp == NULL) { - module_put(ep->module); - return -ENOMEM; - } - - task_lock(current); - ofsp = current->fs; - current->fs = fsp; - task_unlock(current); - - put_fs_struct(ofsp); - } - - /* - * At that point we are guaranteed to be the sole owner of - * current->fs. 
- */ - current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; -- cgit v1.2.3-59-g8ed1b From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:00:13 -0400 Subject: Take fs_struct handling to new file (fs/fs_struct.c) Pure code move; two new helper functions for nfsd and daemonize (unshare_fs_struct() and daemonize_fs_struct() resp.; for now - the same code as used to be in callers). unshare_fs_struct() exported (for nfsd, as copy_fs_struct()/exit_fs() used to be), copy_fs_struct() and exit_fs() don't need exports anymore. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_struct.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ fs/internal.h | 6 ++ fs/namei.c | 7 --- fs/namespace.c | 68 ---------------------- fs/nfsd/nfssvc.c | 7 +-- include/linux/fs_struct.h | 2 + kernel/exit.c | 31 +--------- kernel/fork.c | 29 +--------- 9 files changed, 155 insertions(+), 138 deletions(-) create mode 100644 fs/fs_struct.c (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd4..b5cd8e18dd9f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 000000000000..36e0a123bbf3 --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. 
+ */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fsp = copy_fs_struct(current->fs); + if (!fsp) + return -ENOMEM; + exit_fs(current); + current->fs = fsp; + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs; + + exit_fs(current); /* current->fs->count--; */ + fs = &init_fs; + current->fs = fs; + atomic_inc(&fs->count); +} diff --git a/fs/internal.h b/fs/internal.h index 53af885f1732..477a105f8df3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/namei.c b/fs/namei.c index d040ce11785d..4c65a6460138 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2897,10 +2897,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index f7ec283ccfbb..1e56303c718e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2092,74 +2092,6 @@ out1: return retval; } -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. 
- */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - int count = 0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - write_lock(&fs->lock); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) { - path_get(new_root); - fs->root = *new_root; - count++; - } - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) { - path_get(new_root); - fs->pwd = *new_root; - count++; - } - write_unlock(&fs->lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - while (count--) - path_put(old_root); -} - /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..144d69918614 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -404,7 +404,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -413,13 +412,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 18b467dbe278..298cef1c0793 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void put_fs_struct(struct fs_struct *); +extern void daemonize_fs_struct(void); +extern int unshare_fs_struct(void); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ad8375758a79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal); void daemonize(const char *name, ...) { va_list args; - struct fs_struct *fs; sigset_t blocked; va_start(args, name); @@ -462,11 +461,7 @@ void daemonize(const char *name, ...) 
/* Become as one with the init task */ - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - + daemonize_fs_struct(); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk) } } -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - #ifdef CONFIG_MM_OWNER /* * Task p is exiting and it owned mm, lets find a new owner for it diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..05c02dc586b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -681,38 +681,13 @@ fail_nomem: return retval; } -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { - atomic_set(&fs->count, 1); - rwlock_init(&fs->lock); - fs->umask = old->umask; - read_lock(&old->lock); - fs->root = old->root; - path_get(&old->root); - fs->pwd = old->pwd; - path_get(&old->pwd); - read_unlock(&old->lock); - } - return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ - return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); - static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { if (clone_flags & CLONE_FS) { atomic_inc(¤t->fs->count); return 0; } - tsk->fs = __copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(current->fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) if ((unshare_flags & CLONE_FS) && (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = __copy_fs_struct(current->fs); + *new_fsp = copy_fs_struct(current->fs); if (!*new_fsp) return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 07:20:30 -0400 Subject: New locking/refcounting for fs_struct * all changes of current->fs are done under task_lock and write_lock of old fs->lock * refcount is not atomic anymore (same protection) * its decrements are done when removing reference from current; at the same time we decide whether to free it. * put_fs_struct() is gone * new field - ->in_exec. Set by check_unsafe_exec() if we are trying to do execve() and only subthreads share fs_struct. Cleared when finishing exec (success and failure alike). Makes CLONE_FS fail with -EAGAIN if set. * check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread is in progress. 
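The scheme above (a plain integer count touched only under the structure's own lock, plus an in_exec flag that makes sharing and exec mutually exclusive) can be sketched in user space roughly as follows; pthread rwlocks stand in for the kernel rwlock and all names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct shared_fs {
        pthread_rwlock_t lock;
        int users;              /* plain int: only modified under lock */
        bool in_exec;
};

/* clone(CLONE_FS)-style sharing: refused while an exec is in flight. */
static int get_shared(struct shared_fs *fs)
{
        int ret = 0;

        pthread_rwlock_wrlock(&fs->lock);
        if (fs->in_exec)
                ret = -1;       /* the kernel code returns -EAGAIN here */
        else
                fs->users++;
        pthread_rwlock_unlock(&fs->lock);
        return ret;
}

/* Drop a reference; whether to free is decided under the same lock. */
static void put_shared(struct shared_fs *fs)
{
        bool kill;

        pthread_rwlock_wrlock(&fs->lock);
        kill = (--fs->users == 0);
        pthread_rwlock_unlock(&fs->lock);
        if (kill) {
                pthread_rwlock_destroy(&fs->lock);
                free(fs);
        }
}

int main(void)
{
        struct shared_fs *fs = calloc(1, sizeof(*fs));

        if (!fs)
                return 1;
        pthread_rwlock_init(&fs->lock, NULL);
        fs->users = 1;
        if (get_shared(fs) == 0)        /* a second user joins */
                put_shared(fs);
        put_shared(fs);                 /* last put frees the structure */
        return 0;
}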
Signed-off-by: Al Viro --- fs/compat.c | 16 +++++++++-- fs/exec.c | 31 +++++++++++++++++---- fs/fs_struct.c | 69 +++++++++++++++++++++++++++++++++-------------- fs/internal.h | 2 +- fs/proc/task_nommu.c | 2 +- include/linux/fs_struct.h | 8 +++--- kernel/fork.c | 37 ++++++++++++++++++------- 7 files changed, 121 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5a..baabf203b847 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1441,12 +1442,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1492,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1513,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/exec.c b/fs/exec.c index c5128fbc9165..07a059664b73 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1056,16 +1056,18 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; unsigned n_fs, n_sighand; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) @@ -1073,11 +1075,19 @@ void check_unsafe_exec(struct linux_binprm *bprm) n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs || + atomic_read(&p->sighand->count) > n_sighand) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1306,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1357,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1378,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 36e0a123bbf3..41cff72b377b 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -72,25 +72,27 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) 
path_put(old_root); } -void put_fs_struct(struct fs_struct *fs) +void free_fs_struct(struct fs_struct *fs) { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { - struct fs_struct * fs = tsk->fs; + struct fs_struct *fs = tsk->fs; if (fs) { + int kill; task_lock(tsk); + write_lock(&fs->lock); tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); task_unlock(tsk); - put_fs_struct(fs); + if (kill) + free_fs_struct(fs); } } @@ -99,7 +101,8 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { - atomic_set(&fs->count, 1); + fs->users = 1; + fs->in_exec = 0; rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); @@ -114,28 +117,54 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) int unshare_fs_struct(void) { - struct fs_struct *fsp = copy_fs_struct(current->fs); - if (!fsp) + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) return -ENOMEM; - exit_fs(current); - current->fs = fsp; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), + .users = 1, .lock = __RW_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; void daemonize_fs_struct(void) { - struct fs_struct *fs; + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); - exit_fs(current); /* current->fs->count--; */ - fs = &init_fs; - current->fs = fs; - atomic_inc(&fs->count); + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } } diff --git a/fs/internal.h b/fs/internal.h index 477a105f8df3..b4dac4fb6b61 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..6ca01052c5bc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -49,7 +49,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(¤t->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 298cef1c0793..78a05bfcd8eb 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,12 +4,10 @@ #include struct fs_struct { - atomic_t count; /* This usage count is used by check_unsafe_exec() for - * security checking purposes - therefore it may not be - * incremented, except by clone(CLONE_FS). 
- */ + int users; rwlock_t lock; int umask; + int in_exec; struct path root, pwd; }; @@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); diff --git a/kernel/fork.c b/kernel/fork.c index 05c02dc586b1..51f138a131de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -683,11 +683,19 @@ fail_nomem: static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { + struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { - atomic_inc(¤t->fs->count); + /* tsk->fs is already what we want */ + write_lock(&fs->lock); + if (fs->in_exec) { + write_unlock(&fs->lock); + return -EAGAIN; + } + fs->users++; + write_unlock(&fs->lock); return 0; } - tsk->fs = copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; - if ((unshare_flags & CLONE_FS) && - (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = copy_fs_struct(current->fs); - if (!*new_fsp) - return -ENOMEM; - } + if (!(unshare_flags & CLONE_FS) || !fs) + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ + if (fs->users == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); + if (!*new_fsp) + return -ENOMEM; return 0; } @@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; + write_lock(&fs->lock); current->fs = new_fs; - new_fs = fs; + if (--fs->users) + new_fs = NULL; + else + new_fs = fs; + write_unlock(&fs->lock); } if (new_mm) { @@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_fs: if (new_fs) - put_fs_struct(new_fs); + free_fs_struct(new_fs); bad_unshare_cleanup_thread: bad_unshare_out: -- cgit v1.2.3-59-g8ed1b From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. 
sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- arch/cris/kernel/process.c | 1 - fs/dcache.c | 1 + fs/exec.c | 1 + fs/fs_struct.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/task_nommu.c | 1 + include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- init/do_mounts.c | 1 + kernel/auditsc.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sys.c | 1 + security/tomoyo/realpath.c | 1 + 19 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 60816e876455..4df0b320d524 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b116..0dc4de21f088 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/exec.c b/fs/exec.c index 614991bf0c87..052a961e41aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6ac219338670..eee059052db5 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. diff --git a/fs/namei.c b/fs/namei.c index 964c0249444b..b8433ebfae05 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) diff --git a/fs/namespace.c b/fs/namespace.c index 1e56303c718e..c6f54e4c4290 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "pnode.h" diff --git a/fs/open.c b/fs/open.c index 75b61677daaf..377eb25b6abf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b688..f71559784bfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 6ca01052c5bc..253afc04484c 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d6..3a059298cc19 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b36..7b370c7cfeff 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2de..b4e065ea0de1 100644 --- a/include/linux/sched.h 
+++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, diff --git a/init/do_mounts.c b/init/do_mounts.c index 8d4ff5afc1d8..dd7ee5f203f3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c283..2bfc64786765 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index cb8e9626c215..c35452cadded 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@ #include #include #include +#include static void default_handler(int, struct pt_regs *); diff --git a/kernel/exit.c b/kernel/exit.c index ad8375758a79..b5d656845c90 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 51f138a131de..e82a14577a98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..ce182aaed204 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d47f16b844b2..3bbe01a7a4b5 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "common.h" #include "realpath.h" -- cgit v1.2.3-59-g8ed1b From 2aad1b76e6b0cc5a2e5d9b95a9f356ddddbfa8a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 11:11:28 -0400 Subject: function-graph: allow unregistering twice Impact: fix to permanent disabling of function graph tracer There should be nothing to prevent a tracer from unregistering a function graph callback more than once. This can simplify error paths. But currently, the counter does not account for mulitple unregistering of the function graph callback. If it happens, the function graph tracer will be permanently disabled. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1752a63f37c0..f1ed080406c3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2719,6 +2719,9 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); + if (!unlikely(atomic_read(&ftrace_graph_active))) + goto out; + atomic_dec(&ftrace_graph_active); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; @@ -2726,6 +2729,7 @@ void unregister_ftrace_graph(void) ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); + out: mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-59-g8ed1b From 2e572895bf3203e881356a4039ab0fa428ed2639 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Mar 2009 14:03:19 -0400 Subject: ring-buffer: do not remove reader page from list on ring buffer free Impact: prevent possible memory leak The reader page of the ring buffer is special. 
Although it points into the ring buffer, it is not part of the actual buffer. It is a page used by the reader to swap with a page in the ring buffer. Once the swap is made, the new reader page is again outside the buffer. Even though the reader page points into the buffer, it is really pointing to residual data. Note, this data is used by the reader. reader page | v (prev) +---+ (next) +----------| |----------+ | +---+ | v v +---+ +---+ +---+ -->| |------->| |------->| |---> <--| |<-------| |<-------| |<--- +---+ +---+ +---+ ^ ^ ^ \ | / ------- Buffer--------- If we perform a list_del_init() on the reader page we will actually remove the last page the reader swapped with and not the reader page itself. This will cause that page to not be freed, and thus is a memory leak. Luckily, the only user of the ring buffer so far is ftrace. And ftrace will not free its ring buffer after it allocates it. There is no current possible memory leak. But once there are other users, or if ftrace dynamically creates and frees its ring buffer, then this would be a memory leak. This patch fixes the leak for future cases. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edce2ff38944..960cbf44c844 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -563,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = &cpu_buffer->pages; struct buffer_page *bpage, *tmp; - list_del_init(&cpu_buffer->reader_page->list); free_buffer_page(cpu_buffer->reader_page); list_for_each_entry_safe(bpage, tmp, head, list) { -- cgit v1.2.3-59-g8ed1b From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. 
Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- fs/proc/task_nommu.c | 4 ++-- include/linux/mm.h | 2 +- kernel/fork.c | 1 + mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 6 files changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384a..74ea974f5ca6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc8..370be0a2c909 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4f..bff1f0d475c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a381..51d1aa21483b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1488,6 +1488,7 @@ void __init proc_caches_init(void) mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a686..4a3841186c11 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..72eda4aee2cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - BUG(); - 
if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 2 +- arch/cris/arch-v32/kernel/process.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68knommu/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched.h | 3 ++- kernel/fork.c | 2 +- 27 files changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8d0097f10208..3a2fb7a02db4 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs) */ int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2de14e2afdc5..c3265a2e7cd4 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(int nr, unsigned long clone_flags, unsigned long stack_start, +copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs) { struct thread_info *thread = task_thread_info(p); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 43ae555ecb33..1bbe1da54869 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7f..f49427293ca1 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index bd9b3ff63f6c..c4c69cf721e5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) */ asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned 
long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index ced5b725d9bd..120e7f796fea 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) extern asmlinkage void ret_from_fork(void); int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 9583a338e9d6..0de50df74970 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * set up the kernel stack and exception frames for a new process */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index a8ef654a5a0b..e2f33d0f9969 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs) } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c57162705147..5d7c0e5b9e76 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 7103d91e1a2f..3e876f0baebc 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) return 0; /* Task didn't use the fpu at all. 
*/ } -int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, +int copy_thread(unsigned long clone_flags, unsigned long spu, unsigned long unused, struct task_struct *tsk, struct pt_regs *regs) { struct pt_regs *childregs = task_pt_regs(tsk); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 632ce016014d..ec37fb56c127 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) parent_tidptr, child_tidptr); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d7745f31e..1e96c6eb6312 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ca2e4026ad20..1eaaa450e20c 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -99,7 +99,7 @@ void flush_thread(void) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct thread_info *ti = task_thread_info(p); diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index b28c9a60445b..234cf344cdce 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk) * set up the kernel stack for a new thread and copy arch-specific thread * control information */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, struct task_struct *p, struct pt_regs *kregs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b80e02a4d81d..8aa591ed9127 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, /* in ia64 this is "user_stack_size" */ struct task_struct * p, struct pt_regs * pregs) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eac064948780..7b44a33f03c2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * Copy a thread.. 
*/ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b48e961a38f6..a3acd8e60aff 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ddafbbbab2ab..694bc15f84fd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index c90c7e5e5fee..96be839040f8 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f4bee35a1b46..2830b415e214 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, */ extern void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a73954b87f0a..4041f94e7724 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a1c6d07cac3e..4a28a1568d85 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -179,7 +179,7 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long stack_top, struct task_struct * p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766cad..76f8f84043a2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, 
unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c6..b751a41392b1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9185597eb6a0..031f36685710 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk) * childregs. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b42..9186f8c5d5f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 51d1aa21483b..d7eb727eb535 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3-59-g8ed1b From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001 From: Grzegorz Nosek Date: Thu, 2 Apr 2009 16:57:23 -0700 Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups The ns_proxy cgroup allows moving processes to child cgroups only one level deep at a time. This commit relaxes this restriction and makes it possible to attach tasks directly to grandchild cgroups, e.g.: ($pid is in the root cgroup) echo $pid > /cgroup/CG1/CG2/tasks Previously this operation would fail with -EPERM and would have to be performed as two steps: echo $pid > /cgroup/CG1/tasks echo $pid > /cgroup/CG1/CG2/tasks Also, the target cgroup no longer needs to be empty to move a task there. 
Signed-off-by: Grzegorz Nosek Acked-by: Serge Hallyn Reviewed-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 11 ++++++----- kernel/ns_cgroup.c | 14 ++++---------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bb8feb9feccd..788c4964c142 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if the cgroup is a descendant of the current cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp); +/* Return true if cgrp is a descendant of the task's cgroup */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* Control Group subsystem type. See Documentation/cgroups.txt for details */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c500ca7239b2..27792bcb0758 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3084,18 +3084,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, } /** - * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question + * @task: the task in question * - * See if @cgrp is a descendant of the current task's cgroup in - * the appropriate hierarchy. + * See if @cgrp is a descendant of @task's cgroup in the appropriate + * hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. * * Called only by the ns (nsproxy) cgroup. */ -int cgroup_is_descendant(const struct cgroup *cgrp) +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; @@ -3105,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp) return 1; get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(current, subsys_id); + target = task_cgroup(task, subsys_id); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 78bc3fdac0d2..5aa854f9e5ae 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) /* * Rules: - * 1. you can only enter a cgroup which is a child of your current + * 1. you can only enter a cgroup which is a descendant of your current * cgroup * 2. you can only place another process into a cgroup if * a. 
you have CAP_SYS_ADMIN @@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, struct task_struct *task) { - struct cgroup *orig; - if (current != task) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!cgroup_is_descendant(new_cgroup)) + if (!cgroup_is_descendant(new_cgroup, current)) return -EPERM; } - if (atomic_read(&new_cgroup->count) != 0) - return -EPERM; - - orig = task_cgroup(task, ns_subsys_id); - if (orig && orig != new_cgroup->parent) + if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; return 0; @@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup)) + if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:25 -0700 Subject: cgroup: CSS ID support Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code. This patch attaches unique ID to each css and provides following. - css_lookup(subsys, id) returns pointer to struct cgroup_subysys_state of id. - css_get_next(subsys, id, rootid, depth, foundid) returns the next css under "root" by scanning When cgroup_subsys->use_id is set, an id for css is maintained. The cgroup framework only parepares - css_id of root css for subsys - id is automatically attached at creation of css. - id is *not* freed automatically. Because the cgroup framework don't know lifetime of cgroup_subsys_state. free_css_id() function is provided. This must be called by subsys. There are several reasons to develop this. - Saving space .... For example, memcg's swap_cgroup is array of pointers to cgroup. But it is not necessary to be very fast. By replacing pointers(8bytes per ent) to ID (2byes per ent), we can reduce much amount of memory usage. - Scanning without lock. CSS_ID provides "scan id under this ROOT" function. By this, scanning css under root can be written without locks. ex) do { rcu_read_lock(); next = cgroup_get_next(subsys, id, root, &found); /* check sanity of next here */ css_tryget(); rcu_read_unlock(); id = found + 1 } while(...) Characteristics: - Each css has unique ID under subsys. - Lifetime of ID is controlled by subsys. - css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy - Allowed ID is 1-65535, ID 0 is UNUSED ID. Design Choices: - scan-by-ID v.s. scan-by-tree-walk. As /proc's pid scan does, scan-by-ID is robust when scanning is done by following kind of routine. scan -> rest a while(release a lock) -> conitunue from interrupted memcg's hierarchical reclaim does this. - When subsys->use_id is set, # of css in the system is limited to 65535. 
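For clarity, a rough sketch of the scan loop described above, written against the css_get_next()/css_tryget()/css_put() interfaces this patch introduces; the subsystem pointer "ss" and the process_css() callback are illustrative placeholders, not part of the patch:

	/*
	 * Sketch only: visit every css under @root.  The lookup needs
	 * rcu_read_lock() on the caller side, but no lock is held across
	 * the per-css work.  css_tryget() lets us skip a css that is
	 * already being destroyed.
	 */
	static void scan_css_tree(struct cgroup_subsys *ss,
				  struct cgroup_subsys_state *root)
	{
		struct cgroup_subsys_state *css;
		int id, found;

		for (id = 1; ; id = found + 1) {	/* IDs 1-65535; 0 is unused */
			rcu_read_lock();
			css = css_get_next(ss, id, root, &found);
			if (!css) {
				rcu_read_unlock();
				break;
			}
			if (!css_tryget(css)) {
				/* being destroyed; skip it and keep scanning */
				rcu_read_unlock();
				continue;
			}
			rcu_read_unlock();

			process_css(css);	/* placeholder for the real work */
			css_put(css);
		}
	}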
[bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()] Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Bharata B Rao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 50 +++++++++ include/linux/idr.h | 1 + kernel/cgroup.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++++- lib/idr.c | 46 ++++++++ 4 files changed, 382 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 788c4964c142..9a23bb098205 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -15,6 +15,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -22,6 +23,7 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; +struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -63,6 +65,8 @@ struct cgroup_subsys_state { atomic_t refcnt; unsigned long flags; + /* ID for this css, if possible */ + struct css_id *id; }; /* bits in struct cgroup_subsys_state flags field */ @@ -373,6 +377,11 @@ struct cgroup_subsys { int active; int disabled; int early_init; + /* + * True if this subsys uses ID. ID is not available before cgroup_init() + * (not available in early_init time.) + */ + bool use_id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -395,6 +404,9 @@ struct cgroup_subsys { */ struct cgroupfs_root *root; struct list_head sibling; + /* used when use_id == true */ + struct idr idr; + spinlock_t id_lock; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; @@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +/* + * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works + * if cgroup_subsys.use_id == true. It can be used for looking up and scanning. + * CSS ID is assigned at cgroup allocation (create) automatically + * and removed when subsys calls free_css_id() function. This is because + * the lifetime of cgroup_subsys_state is subsys's matter. + * + * Looking up and scanning function should be called under rcu_read_lock(). + * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls. + * But the css returned by this routine can be "not populated yet" or "being + * destroyed". The caller should check css and cgroup's status. + */ + +/* + * Typically Called at ->destroy(), or somewhere the subsys frees + * cgroup_subsys_state. + */ +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); + +/* Find a cgroup_subsys_state which has given ID */ + +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); + +/* + * Get a cgroup whose id is greater than or equal to id under tree of root. + * Returning a cgroup_subsys_state or NULL. 
+ */ +struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid); + +/* Returns true if root is ancestor of cg */ +bool css_is_ancestor(struct cgroup_subsys_state *cg, + struct cgroup_subsys_state *root); + +/* Get id and depth of css */ +unsigned short css_id(struct cgroup_subsys_state *css); +unsigned short css_depth(struct cgroup_subsys_state *css); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } diff --git a/include/linux/idr.h b/include/linux/idr.h index dd846df8cd32..e968db71e33a 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *ptr, int *id); int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id); int idr_for_each(struct idr *idp, int (*fn)(int id, void *p, void *data), void *data); +void *idr_get_next(struct idr *idp, int *nextid); void *idr_replace(struct idr *idp, void *ptr, int id); void idr_remove(struct idr *idp, int id); void idr_remove_all(struct idr *idp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 27792bcb0758..d3c521137425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -94,7 +94,6 @@ struct cgroupfs_root { char release_agent_path[PATH_MAX]; }; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -102,6 +101,39 @@ struct cgroupfs_root { */ static struct cgroupfs_root rootnode; +/* + * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when + * cgroup_subsys->use_id != 0. + */ +#define CSS_ID_MAX (65535) +struct css_id { + /* + * The css to which this ID points. This pointer is set to valid value + * after cgroup is populated. If cgroup is removed, this will be NULL. + * This pointer is expected to be RCU-safe because destroy() + * is called after synchronize_rcu(). But for safe use, css_is_removed() + * css_tryget() should be used for avoiding race. + */ + struct cgroup_subsys_state *css; + /* + * ID of this css. + */ + unsigned short id; + /* + * Depth in hierarchy which this ID belongs to. + */ + unsigned short depth; + /* + * ID is freed by RCU. (and lookup routine is RCU safe.) + */ + struct rcu_head rcu_head; + /* + * Hierarchy of CSS ID belongs to. + */ + unsigned short stack[0]; /* Array of Length (depth+1) */ +}; + + /* The list of hierarchy roots */ static LIST_HEAD(roots); @@ -185,6 +217,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; +static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); + /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock * due to cgroup_iter_start() */ @@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +static int alloc_css_id(struct cgroup_subsys *ss, + struct cgroup *parent, struct cgroup *child); + static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp) if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) return err; } + /* This cgroup is ready now */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + /* + * Update id->css pointer and make this css visible from + * CSS ID functions. 
This pointer will be dereferened + * from RCU-read-side without locks. + */ + if (css->id) + rcu_assign_pointer(css->id->css, css); + } return 0; } @@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->cgroup = cgrp; atomic_set(&css->refcnt, 1); css->flags = 0; + css->id = NULL; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); @@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } init_cgroup_css(css, ss, cgrp); + if (ss->use_id) + if (alloc_css_id(ss, parent, cgrp)) + goto err_destroy; + /* At error, ->destroy() callback has to free assigned ID. */ } cgroup_lock_hierarchy(root); @@ -2708,6 +2761,8 @@ int __init cgroup_init(void) struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); + if (ss->use_id) + cgroup_subsys_init_idr(ss); } /* Add init_css_set to the hash table */ @@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str) return 1; } __setup("cgroup_disable=", cgroup_disable); + +/* + * Functons for CSS ID. + */ + +/* + *To get ID other than 0, this should be called when !cgroup_is_removed(). + */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->id; + return 0; +} + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ + struct css_id *cssid = rcu_dereference(css->id); + + if (cssid) + return cssid->depth; + return 0; +} + +bool css_is_ancestor(struct cgroup_subsys_state *child, + struct cgroup_subsys_state *root) +{ + struct css_id *child_id = rcu_dereference(child->id); + struct css_id *root_id = rcu_dereference(root->id); + + if (!child_id || !root_id || (child_id->depth < root_id->depth)) + return false; + return child_id->stack[root_id->depth] == root_id->id; +} + +static void __free_css_id_cb(struct rcu_head *head) +{ + struct css_id *id; + + id = container_of(head, struct css_id, rcu_head); + kfree(id); +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ + struct css_id *id = css->id; + /* When this is called before css_id initialization, id can be NULL */ + if (!id) + return; + + BUG_ON(!ss->use_id); + + rcu_assign_pointer(id->css, NULL); + rcu_assign_pointer(css->id, NULL); + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, id->id); + spin_unlock(&ss->id_lock); + call_rcu(&id->rcu_head, __free_css_id_cb); +} + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ + struct css_id *newid; + int myid, error, size; + + BUG_ON(!ss->use_id); + + size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); + newid = kzalloc(size, GFP_KERNEL); + if (!newid) + return ERR_PTR(-ENOMEM); + /* get id */ + if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { + error = -ENOMEM; + goto err_out; + } + spin_lock(&ss->id_lock); + /* Don't use 0. 
allocates an ID of 1-65535 */ + error = idr_get_new_above(&ss->idr, newid, 1, &myid); + spin_unlock(&ss->id_lock); + + /* Returns error when there are no free spaces for new ID.*/ + if (error) { + error = -ENOSPC; + goto err_out; + } + if (myid > CSS_ID_MAX) + goto remove_idr; + + newid->id = myid; + newid->depth = depth; + return newid; +remove_idr: + error = -ENOSPC; + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, myid); + spin_unlock(&ss->id_lock); +err_out: + kfree(newid); + return ERR_PTR(error); + +} + +static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +{ + struct css_id *newid; + struct cgroup_subsys_state *rootcss; + + spin_lock_init(&ss->id_lock); + idr_init(&ss->idr); + + rootcss = init_css_set.subsys[ss->subsys_id]; + newid = get_new_cssid(ss, 0); + if (IS_ERR(newid)) + return PTR_ERR(newid); + + newid->stack[0] = newid->id; + newid->css = rootcss; + rootcss->id = newid; + return 0; +} + +static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, + struct cgroup *child) +{ + int subsys_id, i, depth = 0; + struct cgroup_subsys_state *parent_css, *child_css; + struct css_id *child_id, *parent_id = NULL; + + subsys_id = ss->subsys_id; + parent_css = parent->subsys[subsys_id]; + child_css = child->subsys[subsys_id]; + depth = css_depth(parent_css) + 1; + parent_id = parent_css->id; + + child_id = get_new_cssid(ss, depth); + if (IS_ERR(child_id)) + return PTR_ERR(child_id); + + for (i = 0; i < depth; i++) + child_id->stack[i] = parent_id->stack[i]; + child_id->stack[depth] = child_id->id; + /* + * child_id->css pointer will be set after this cgroup is available + * see cgroup_populate_dir() + */ + rcu_assign_pointer(child_css->id, child_id); + + return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct css_id *cssid = NULL; + + BUG_ON(!ss->use_id); + cssid = idr_find(&ss->idr, id); + + if (unlikely(!cssid)) + return NULL; + + return rcu_dereference(cssid->css); +} + +/** + * css_get_next - lookup next cgroup under specified hierarchy. + * @ss: pointer to subsystem + * @id: current position of iteration. + * @root: pointer to css. search tree under this. + * @foundid: position of found object. + * + * Search next css under the specified hierarchy of rootid. Calling under + * rcu_read_lock() is necessary. Returns NULL if it reaches the end. + */ +struct cgroup_subsys_state * +css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid) +{ + struct cgroup_subsys_state *ret = NULL; + struct css_id *tmp; + int tmpid; + int rootid = css_id(root); + int depth = css_depth(root); + + if (!rootid) + return NULL; + + BUG_ON(!ss->use_id); + /* fill start point for scan */ + tmpid = id; + while (1) { + /* + * scan next entry from bitmap(tree), tmpid is updated after + * idr_get_next(). 
+ */ + spin_lock(&ss->id_lock); + tmp = idr_get_next(&ss->idr, &tmpid); + spin_unlock(&ss->id_lock); + + if (!tmp) + break; + if (tmp->depth >= depth && tmp->stack[depth] == rootid) { + ret = rcu_dereference(tmp->css); + if (ret) { + *foundid = tmpid; + break; + } + } + /* continue to scan from next id */ + tmpid = tmpid + 1; + } + return ret; +} + diff --git a/lib/idr.c b/lib/idr.c index dab4bca86f5d..80ca9aca038b 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -578,6 +578,52 @@ int idr_for_each(struct idr *idp, } EXPORT_SYMBOL(idr_for_each); +/** + * idr_get_next - lookup next object of id to given id. + * @idp: idr handle + * @id: pointer to lookup key + * + * Returns pointer to registered object with id, which is next number to + * given id. + */ + +void *idr_get_next(struct idr *idp, int *nextidp) +{ + struct idr_layer *p, *pa[MAX_LEVEL]; + struct idr_layer **paa = &pa[0]; + int id = *nextidp; + int n, max; + + /* find first ent */ + n = idp->layers * IDR_BITS; + max = 1 << n; + p = rcu_dereference(idp->top); + if (!p) + return NULL; + + while (id < max) { + while (n > 0 && p) { + n -= IDR_BITS; + *paa++ = p; + p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]); + } + + if (p) { + *nextidp = id; + return p; + } + + id += 1 << n; + while (n < fls(id)) { + n += IDR_BITS; + p = *--paa; + } + } + return NULL; +} + + + /** * idr_replace - replace pointer for given id * @idp: idr handle -- cgit v1.2.3-59-g8ed1b From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:26 -0700 Subject: cgroup: fix frequent -EBUSY at rmdir In following situation, with memory subsystem, /groupA use_hierarchy==1 /01 some tasks /02 some tasks /03 some tasks /04 empty When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim is triggered and the kernel walks tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of temporal refcnt from the kernel. In general. cgroup can be rmdir'd if there are no children groups and no tasks. Frequent fails of rmdir() is not useful to users. (And the reason for -EBUSY is unknown to users.....in most cases) This patch tries to modify above behavior, by - retries if css_refcnt is got by someone. - add "return value" to pre_destroy() and allows subsystem to say "we're really busy!" Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 6 ++- include/linux/cgroup.h | 6 ++- kernel/cgroup.c | 81 ++++++++++++++++++++++++++++++++------- mm/memcontrol.c | 5 ++- 4 files changed, 79 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 93feb8444489..cdc46a501b85 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if -there are not tasks in the cgroup. +there are not tasks in the cgroup. If pre_destroy() returns error code, +rmdir() will fail with it. 
From this behavior, pre_destroy() can be +called multiple times against a cgroup. int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *task) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9a23bb098205..7d824b80b3d7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -135,6 +135,10 @@ enum { CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, + /* + * A thread in rmdir() is wating for this cgroup. + */ + CGRP_WAIT_ON_RMDIR, }; struct cgroup { @@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d3c521137425..fc5e4a48582f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ -static void cgroup_call_pre_destroy(struct cgroup *cgrp) +static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + int ret = 0; + for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(ss, cgrp); - return; + if (ss->pre_destroy) { + ret = ss->pre_destroy(ss, cgrp); + if (ret) + break; + } + return ret; } static void free_cgroup_rcu(struct rcu_head *obj) @@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + */ +DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +{ + if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); put_css_set(cg); + + /* + * wake up rmdir() waiter. the rmdir should fail since the cgroup + * is no longer empty. + */ + cgroup_wakeup_rmdir_waiters(cgrp); return 0; } @@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; + DEFINE_WAIT(wait); + int ret; /* the vfs holds both inode->i_mutex already */ - +again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. 
*/ - cgroup_call_pre_destroy(cgrp); + ret = cgroup_call_pre_destroy(cgrp); + if (ret) + return ret; mutex_lock(&cgroup_mutex); parent = cgrp->parent; - - if (atomic_read(&cgrp->count) - || !list_empty(&cgrp->children) - || !cgroup_clear_css_refs(cgrp)) { + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } + /* + * css_put/get is provided for subsys to grab refcnt to css. In typical + * case, subsystem has no reference after pre_destroy(). But, under + * hierarchy management, some *temporal* refcnt can be hold. + * To avoid returning -EBUSY to a user, waitqueue is used. If subsys + * is really busy, it should return -EBUSY at pre_destroy(). wake_up + * is called when css_put() is called and refcnt goes down to 0. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); @@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - if ((atomic_dec_return(&css->refcnt) == 1) && - notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (atomic_dec_return(&css->refcnt) == 1) { + if (notify_on_release(cgrp)) { + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } + cgroup_wakeup_rmdir_waiters(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a..8ffec674c5ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ free_out: return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, -- cgit v1.2.3-59-g8ed1b From 66bdc9cfc77ba89a9ee6c82d28375b646ab4bb1d Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 2 Apr 2009 16:57:27 -0700 Subject: kernel/cgroup.c: kfree(NULL) is legal Reduces object file size a bit: Before: $ size kernel/cgroup.o text data bss dec hex filename 21593 7804 4924 34321 8611 kernel/cgroup.o After: $ size kernel/cgroup.o text data bss dec hex filename 21537 7744 4924 34205 859d kernel/cgroup.o Signed-off-by: Jesper Juhl Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fc5e4a48582f..9a6c2bfa1d9f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -923,8 +923,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); out_unlock: - if (opts.release_agent) - kfree(opts.release_agent); + kfree(opts.release_agent); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1027,15 +1026,13 @@ static int cgroup_get_sb(struct 
file_system_type *fs_type, /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); if (ret) { - if (opts.release_agent) - kfree(opts.release_agent); + kfree(opts.release_agent); return ret; } root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { - if (opts.release_agent) - kfree(opts.release_agent); + kfree(opts.release_agent); return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:29 -0700 Subject: cgroups: show correct file mode We have some read-only files and write-only files, but currently they are all set to 0644, which is counter-intuitive and cause trouble for some cgroup tools like libcgroup. This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's own files' file mode, and for the most cases cft->mode can be default to 0 and cgroup will figure out proper mode. Acked-by: Paul Menage Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 5 +++++ kernel/cgroup.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/cpuset.c | 1 + 3 files changed, 40 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7d824b80b3d7..b2816fba5306 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -258,6 +258,11 @@ struct cftype { */ char name[MAX_CFTYPE_NAME]; int private; + /* + * If not 0, file mode is set to this value, otherwise it will + * be figured out automatically + */ + mode_t mode; /* * If non-zero, defines the maximum length of string that can diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a6c2bfa1d9f..fea11c5c990c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1686,7 +1686,7 @@ static struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; -static int cgroup_create_file(struct dentry *dentry, int mode, +static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { static const struct dentry_operations cgroup_dops = { @@ -1732,7 +1732,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode, * @mode: mode to set on new directory. 
*/ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - int mode) + mode_t mode) { struct dentry *parent; int error = 0; @@ -1750,6 +1750,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, return error; } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static mode_t cgroup_file_mode(const struct cftype *cft) +{ + mode_t mode = 0; + + if (cft->mode) + return cft->mode; + + if (cft->read || cft->read_u64 || cft->read_s64 || + cft->read_map || cft->read_seq_string) + mode |= S_IRUGO; + + if (cft->write || cft->write_u64 || cft->write_s64 || + cft->write_string || cft->trigger) + mode |= S_IWUSR; + + return mode; +} + int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, const struct cftype *cft) @@ -1757,6 +1784,7 @@ int cgroup_add_file(struct cgroup *cgrp, struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; + mode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { @@ -1767,7 +1795,8 @@ int cgroup_add_file(struct cgroup *cgrp, BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); dentry = lookup_one_len(name, dir, strlen(name)); if (!IS_ERR(dentry)) { - error = cgroup_create_file(dentry, 0644 | S_IFREG, + mode = cgroup_file_mode(cft); + error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) dentry->d_fsdata = (void *)cft; @@ -2349,6 +2378,7 @@ static struct cftype files[] = { .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, + .mode = S_IRUGO | S_IWUSR, }, { @@ -2449,7 +2479,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) * Must be called with the mutex on the parent inode held */ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - int mode) + mode_t mode) { struct cgroup *cgrp; struct cgroupfs_root *root = parent->root; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f76db9dcaa05..ee5ec386aa8b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1706,6 +1706,7 @@ static struct cftype files[] = { .read_u64 = cpuset_read_u64, .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE, + .mode = S_IRUGO, }, { -- cgit v1.2.3-59-g8ed1b From 0670e08bdfc67272f8c3087030417465629b8073 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:30 -0700 Subject: cgroups: don't change release_agent when remount failed Remount can fail in either case: - wrong mount options is specified, or option 'noprefix' is changed. - a to-be-added subsys is already mounted/active. When using remount to change 'release_agent', for the above former failure case, remount will return errno with release_agent unchanged, but for the latter case, remount will return EBUSY with relase_agent changed, which is unexpected I think: # mount -t cgroup -o cpu xxx /cgrp1 # mount -t cgroup -o cpuset,release_agent=agent1 yyy /cgrp2 # cat /cgrp2/release_agent agent1 # mount -t cgroup -o remount,cpuset,noprefix,release_agent=agent2 yyy /cgrp2 mount: /cgrp2 not mounted already, or bad option # cat /cgrp2/release_agent agent1 <-- ok # mount -t cgroup -o remount,cpu,cpuset,release_agent=agent2 yyy /cgrp2 mount: /cgrp2 is busy # cat /cgrp2/release_agent agent2 <-- unexpected! 
Signed-off-by: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fea11c5c990c..f2a3f5c9936c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -915,10 +915,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } ret = rebind_subsystems(root, opts.subsys_bits); + if (ret) + goto out_unlock; /* (re)populate subsystem files */ - if (!ret) - cgroup_populate_dir(cgrp); + cgroup_populate_dir(cgrp); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); -- cgit v1.2.3-59-g8ed1b From d969fbe69e07fcceb0558b35d4c75eb046041c5e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:31 -0700 Subject: debug cgroup: remove unneeded cgroup_lock Since we are in cgroup write handler, so the cgrp is valid, so we don't have to hold cgroup_mutex when calling cgroup_task_count(). One similar example is in cgroup_tasks_open(). Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_debug.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index daca6209202d..0c92d797baa6 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) { u64 count; - cgroup_lock(); count = cgroup_task_count(cont); - cgroup_unlock(); return count; } -- cgit v1.2.3-59-g8ed1b From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:38 -0700 Subject: memcg: fix OOM killer under memcg This patch tries to fix OOM Killer problems caused by hierarchy. Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to kill a task in memcg. But, when hierarchy is used, it's broken and correct task cannot be killed. For example, in following cgroup /groupA/ hierarchy=1, limit=1G, 01 nolimit 02 nolimit All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to groupA's 1Gbytes but OOM Killer just kills tasks in groupA. This patch provides makes the bad process be selected from all tasks under hierarchy. BTW, currently, oom_jiffies is updated against groupA in above case. oom_jiffies of tree should be updated. To see how oom_jiffies is used, please check mem_cgroup_oom_called() callers. [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: const fix] Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 20 +++++++++++++++++++- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- mm/memcontrol.c | 30 ++++++++++++++++++++++++++++-- 4 files changed, 49 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 523a9c16c400..8a11caf417a0 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -1,5 +1,5 @@ Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2009/1/19 +Last Updated: 2009/1/20 Base Kernel Version: based on 2.6.29-rc2. 
Because VM is getting complex (one of reasons is memcg...), memcg's behavior @@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. # kill malloc task. Of course, tmpfs v.s. swapoff test should be tested, too. + + 9.8 OOM-Killer + Out-of-memory caused by memcg's limit will kill tasks under + the memcg. When hierarchy is used, a task under hierarchy + will be killed by the kernel. + In this case, panic_on_oom shouldn't be invoked and tasks + in other groups shouldn't be killed. + + It's not difficult to cause OOM under memcg as following. + Case A) when you can swapoff + #swapoff -a + #echo 50M > /memory.limit_in_bytes + run 51M of malloc + + Case B) when you use mem+swap limitation. + #echo 50M > memory.limit_in_bytes + #echo 50M > memory.memsw.limit_in_bytes + run 51M of malloc diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b2816fba5306..43763bd772b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, - struct cgroup_subsys_state *root); + const struct cgroup_subsys_state *root); /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f2a3f5c9936c..382109b5baeb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } bool css_is_ancestor(struct cgroup_subsys_state *child, - struct cgroup_subsys_state *root) + const struct cgroup_subsys_state *root) { struct css_id *child_id = rcu_dereference(child->id); struct css_id *root_id = rcu_dereference(root->id); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77ad..025f8abfae2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } @@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. 
@@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; } -- cgit v1.2.3-59-g8ed1b From 0b4217b3fdddc4a58939720d3ed809537577d48b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:49 -0700 Subject: cpuset: fix possible races in cpu/memory hotplug Change to cpuset->cpus_allowed and cpuset->mems_allowed should be protected by callback_mutex, otherwise the reader may read wrong cpus/mems. This is cpuset's lock rule. Signed-off-by: Li Zefan Cc: Paul Menage Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee5ec386aa8b..31737957cb62 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2070,7 +2070,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, } cgroup_lock(); + mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2093,11 +2095,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self, cgroup_lock(); switch (action) { case MEM_ONLINE: - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - break; case MEM_OFFLINE: + mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); + mutex_unlock(&callback_mutex); + if (action == MEM_OFFLINE) + scan_for_empty_cpusets(&top_cpuset); break; default: break; -- cgit v1.2.3-59-g8ed1b From 3b6766fe668b83c8a03c6ed01bcc2ac77cbae848 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:51 -0700 Subject: cpuset: rewrite update_tasks_nodemask() This patch uses cgroup_scan_tasks() to rebind tasks' vmas to new cpuset's mems_allowed. Not only simplify the code largely, but also avoid allocating an array to hold mm pointers of all the tasks in the cpuset. This array can be big (size > PAGESIZE) if we have lots of tasks in that cpuset, thus has a chance to fail the allocation when under memory stress. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 109 ++++++++++++++++++++------------------------------------ 1 file changed, 39 insertions(+), 70 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 31737957cb62..dca455e0482e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, mutex_unlock(&callback_mutex); } +/* + * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new + * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 
+ */ +static void cpuset_change_nodemask(struct task_struct *p, + struct cgroup_scanner *scan) +{ + struct mm_struct *mm; + struct cpuset *cs; + int migrate; + const nodemask_t *oldmem = scan->data; + + mm = get_task_mm(p); + if (!mm) + return; + + cs = cgroup_cs(scan->cg); + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); + mmput(mm); +} + static void *cpuset_being_rebound; /** @@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound; */ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) { - struct task_struct *p; - struct mm_struct **mmarray; - int i, n, ntasks; - int migrate; - int fudge; - struct cgroup_iter it; int retval; + struct cgroup_scanner scan; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ - fudge = 10; /* spare mmarray[] slots */ - fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ - retval = -ENOMEM; - - /* - * Allocate mmarray[] to hold mm reference for each task - * in cpuset cs. Can't kmalloc GFP_KERNEL while holding - * tasklist_lock. We could use GFP_ATOMIC, but with a - * few more lines of code, we can retry until we get a big - * enough mmarray[] w/o using GFP_ATOMIC. - */ - while (1) { - ntasks = cgroup_task_count(cs->css.cgroup); /* guess */ - ntasks += fudge; - mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); - if (!mmarray) - goto done; - read_lock(&tasklist_lock); /* block fork */ - if (cgroup_task_count(cs->css.cgroup) <= ntasks) - break; /* got enough */ - read_unlock(&tasklist_lock); /* try again */ - kfree(mmarray); - } - - n = 0; - - /* Load up mmarray[] with mm reference for each task in cpuset. */ - cgroup_iter_start(cs->css.cgroup, &it); - while ((p = cgroup_iter_next(cs->css.cgroup, &it))) { - struct mm_struct *mm; - - if (n >= ntasks) { - printk(KERN_WARNING - "Cpuset mempolicy rebind incomplete.\n"); - break; - } - mm = get_task_mm(p); - if (!mm) - continue; - mmarray[n++] = mm; - } - cgroup_iter_end(cs->css.cgroup, &it); - read_unlock(&tasklist_lock); + scan.cg = cs->css.cgroup; + scan.test_task = NULL; + scan.process_task = cpuset_change_nodemask; + scan.heap = NULL; + scan.data = (nodemask_t *)oldmem; /* - * Now that we've dropped the tasklist spinlock, we can - * rebind the vma mempolicies of each mm in mmarray[] to their - * new cpuset, and release that mm. The mpol_rebind_mm() - * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_dup() - * cpuset_being_rebound check will catch such forks, and rebind - * their vma mempolicies too. Because we still hold the global - * cgroup_mutex, we know that no other rebind effort will - * be contending for the global variable cpuset_being_rebound. + * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * take while holding tasklist_lock. Forks can happen - the + * mpol_dup() cpuset_being_rebound check will catch such forks, + * and rebind their vma mempolicies too. Because we still hold + * the global cgroup_mutex, we know that no other rebind effort + * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. 
*/ - migrate = is_memory_migrate(cs); - for (i = 0; i < n; i++) { - struct mm_struct *mm = mmarray[i]; - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); - mmput(mm); - } + retval = cgroup_scan_tasks(&scan); /* We're done rebinding vmas to this cpuset's new mems_allowed. */ - kfree(mmarray); cpuset_being_rebound = NULL; - retval = 0; -done: + return retval; } -- cgit v1.2.3-59-g8ed1b From 010cfac4ca0f9e85f54ba2117a372e72f4fb9a60 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:52 -0700 Subject: cpuset: avoid changing cpuset's mems when errno returned When writing to cpuset.mems, cpuset has to update its mems_allowed before calling update_tasks_nodemask(), but this function might return -ENOMEM. To avoid this rare case, we allocate the memory before changing mems_allowed, and then pass to update_tasks_nodemask(). Similar to what update_cpumask() does. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dca455e0482e..3778a21a4662 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1057,13 +1057,15 @@ static void *cpuset_being_rebound; * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * @oldmem: old mems_allowed of cpuset cs + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) +static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, + struct ptr_heap *heap) { - int retval; struct cgroup_scanner scan; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1071,7 +1073,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) scan.cg = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_nodemask; - scan.heap = NULL; + scan.heap = heap; scan.data = (nodemask_t *)oldmem; /* @@ -1084,12 +1086,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - retval = cgroup_scan_tasks(&scan); + cgroup_scan_tasks(&scan); /* We're done rebinding vmas to this cpuset's new mems_allowed. 
*/ cpuset_being_rebound = NULL; - - return retval; } /* @@ -1110,6 +1110,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, { nodemask_t oldmem; int retval; + struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; @@ -1144,12 +1145,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval < 0) + goto done; + mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - retval = update_tasks_nodemask(cs, &oldmem); + update_tasks_nodemask(cs, &oldmem, &heap); + + heap_free(&heap); done: return retval; } @@ -2003,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems); + update_tasks_nodemask(cp, &oldmems, NULL); } } } -- cgit v1.2.3-59-g8ed1b From 7f81b1ae18416b457e4d5ff23f0bd598e8a42224 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 16:57:53 -0700 Subject: cpuset: remove struct cpuset_hotplug_scanner Use cgroup_scanner.data, instead of introducing cpuset_hotplug_scanner. Signed-off-by: Li Zefan Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3778a21a4662..0619f109d38d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task) return container_of(task_subsys_state(task, cpuset_subsys_id), struct cpuset, css); } -struct cpuset_hotplug_scanner { - struct cgroup_scanner scan; - struct cgroup *to; -}; /* bits in struct cpuset flags field */ typedef enum { @@ -1890,10 +1886,9 @@ int __init cpuset_init(void) static void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) { - struct cpuset_hotplug_scanner *chsp; + struct cgroup *new_cgroup = scan->data; - chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); - cgroup_attach_task(chsp->to, tsk); + cgroup_attach_task(new_cgroup, tsk); } /** @@ -1909,15 +1904,15 @@ static void cpuset_do_move_task(struct task_struct *tsk, */ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) { - struct cpuset_hotplug_scanner scan; + struct cgroup_scanner scan; - scan.scan.cg = from->css.cgroup; - scan.scan.test_task = NULL; /* select all tasks in cgroup */ - scan.scan.process_task = cpuset_do_move_task; - scan.scan.heap = NULL; - scan.to = to->css.cgroup; + scan.cg = from->css.cgroup; + scan.test_task = NULL; /* select all tasks in cgroup */ + scan.process_task = cpuset_do_move_task; + scan.heap = NULL; + scan.data = to->css.cgroup; - if (cgroup_scan_tasks(&scan.scan)) + if (cgroup_scan_tasks(&scan)) printk(KERN_ERR "move_member_tasks_to_cpuset: " "cgroup_scan_tasks failed\n"); } -- cgit v1.2.3-59-g8ed1b From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 2 Apr 2009 16:57:54 -0700 Subject: cpusets: replace zone allowed functions with node allowed The cpuset_zone_allowed() variants are actually only a function of the zone's node. 
Cc: Paul Menage Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: David Rientjes Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 33 +++++++++++++++++++++++----- kernel/cpuset.c | 59 +++++++++++++++++++++----------------------------- 2 files changed, 52 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2e0d79678deb..05ea1dd7d681 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS @@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void); void cpuset_update_task_memory_state(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); -extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); +extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask); +extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); -static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_softwall(z, gfp_mask); + __cpuset_node_allowed_softwall(node, gfp_mask); } -static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { return number_of_cpusets <= 1 || - __cpuset_zone_allowed_hardwall(z, gfp_mask); + __cpuset_node_allowed_hardwall(node, gfp_mask); +} + +static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask); +} + +static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +{ + return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask); } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } +static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +{ + return 1; +} + +static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +{ + return 1; +} + static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) { return 1; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0619f109d38d..3ff910eb30d3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2181,26 +2181,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) } /** - * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. If - * __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If it's not a - * __GFP_HARDWALL request and this zone's nodes is in the nearest - * hardwalled cpuset ancestor to this tasks cpuset, yes. - * If the task has been OOM killed and has access to memory reserves - * as specified by the TIF_MEMDIE flag, yes. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. 
If it's not a __GFP_HARDWALL request and this node is in the nearest + * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE + * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() - * reduces to cpuset_zone_allowed_hardwall(). Otherwise, - * cpuset_zone_allowed_softwall() might sleep, and might allow a zone - * from an enclosing cpuset. + * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to + * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() + * might sleep, and might allow a node from an enclosing cpuset. * - * cpuset_zone_allowed_hardwall() only handles the simpler case of - * hardwall cpusets, and never sleeps. + * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall + * cpusets, and never sleeps. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2239,20 +2237,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: - * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you + * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables * the code that might scan up ancestor cpusets and sleep. */ - -int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; @@ -2281,15 +2276,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) } /* - * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? - * @z: is this zone on an allowed node? + * cpuset_node_allowed_hardwall - Can we allocate on a memory node? + * @node: is this an allowed node? * @gfp_mask: memory allocation flags * - * If we're in interrupt, yes, we can always allocate. - * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. If the task has been - * OOM killed and has access to memory reserves as specified by the - * TIF_MEMDIE flag, yes. Otherwise, no. + * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is + * set, yes, we can always allocate. If node is in our task's mems_allowed, + * yes. If the task has been OOM killed and has access to memory reserves as + * specified by the TIF_MEMDIE flag, yes. + * Otherwise, no. * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by @@ -2297,20 +2292,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * any node on the zonelist except the first. By the time any such * calls get to this routine, we should just shut up and say 'yes'. 
* - * Unlike the cpuset_zone_allowed_softwall() variant, above, - * this variant requires that the zone be in the current tasks + * Unlike the cpuset_node_allowed_softwall() variant, above, + * this variant requires that the node be in the current task's * mems_allowed or that we're in interrupt. It does not scan up the * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. * It never sleeps. */ - -int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) +int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - int node; /* node that zone z is on */ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; /* -- cgit v1.2.3-59-g8ed1b From db7f47cf4805e30decb0841764b21b7c4000f7dc Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Thu, 2 Apr 2009 16:57:55 -0700 Subject: cpusets: allow cpusets to be configured/built on non-SMP systems Allow cpusets to be configured/built on non-SMP systems Currently it's impossible to build cpusets under UML on x86-64, since cpusets depends on SMP and x86-64 UML doesn't support SMP. There's code in cpusets that doesn't depend on SMP. This patch surrounds the minimum amount of cpusets code with #ifdef CONFIG_SMP in order to allow cpusets to build/run on UP systems (for testing purposes under UML). Reviewed-by: Li Zefan Signed-off-by: Paul Menage Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 2 +- kernel/cpuset.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 92d410603932..1398a14b0191 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -531,7 +531,7 @@ config CGROUP_DEVICE config CPUSETS bool "Cpuset support" - depends on SMP && CGROUPS + depends on CGROUPS help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3ff910eb30d3..2b93b50cbe4b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -517,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) return 0; } +#ifdef CONFIG_SMP /* * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? @@ -811,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused) put_online_cpus(); } +#else /* !CONFIG_SMP */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ +} + +static int generate_sched_domains(struct cpumask **domains, + struct sched_domain_attr **attributes) +{ + *domains = NULL; + return 1; +} +#endif /* CONFIG_SMP */ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); @@ -1164,8 +1177,10 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { +#ifdef CONFIG_SMP if (val < -1 || val >= SD_LV_MAX) return -EINVAL; +#endif if (val != cs->relax_domain_level) { cs->relax_domain_level = val; -- cgit v1.2.3-59-g8ed1b From 6d7b2f5f9e88902b19f91d0c8a7ef58a5455f1a2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 2 Apr 2009 16:57:57 -0700 Subject: cpusets: prevent PF_THREAD_BOUND tasks from attaching to non-root cpusets Kthreads that have the PF_THREAD_BOUND bit set in their flags are bound to a specific cpu. Thus, their set of allowed cpus shall not change. This patch prevents such threads from attaching to non-root cpusets. 
They do not have mempolicies that restrict them to a subset of system nodes and, since their cpumask may never change, they cannot use any of the features of cpusets. The tasks will forever be a member of the root cpuset and will be returned when listing the tasks attached to that cpuset. Cc: Paul Menage Cc: Peter Zijlstra Cc: Dhaval Giani Signed-off-by: David Rientjes Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2b93b50cbe4b..026faccca869 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1342,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { struct cpuset *cs = cgroup_cs(cont); - int ret = 0; if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; - if (tsk->flags & PF_THREAD_BOUND) { - mutex_lock(&callback_mutex); - if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed)) - ret = -EINVAL; - mutex_unlock(&callback_mutex); - } + /* + * Kthreads bound to specific cpus cannot be moved to a new cpuset; we + * cannot change their cpu affinity and isolating such threads by their + * set of allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may + * be changed. + */ + if (tsk->flags & PF_THREAD_BOUND) + return -EINVAL; - return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL); + return security_task_setscheduler(tsk, 0, NULL); } static void cpuset_attach(struct cgroup_subsys *ss, -- cgit v1.2.3-59-g8ed1b From 90bc8d8b1a38f1ab131a2399a202e1889db95de8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:57:58 -0700 Subject: do_wait: fix waiting for the group stop with the dead leader do_wait(WSTOPPED) assumes that p->state must be == TASK_STOPPED, this is not true if the leader is already dead. Check SIGNAL_STOP_STOPPED instead and use signal->group_exit_code. Trivial test-case: void *tfunc(void *arg) { pause(); return NULL; } int main(void) { pthread_t thr; pthread_create(&thr, NULL, tfunc, NULL); pthread_exit(NULL); return 0; } It doesn't react to ^Z (and then to ^C or ^\). The task is stopped, but bash can't see this. The bug is very old, and it was reported multiple times. This patch was sent more than a year ago (http://marc.info/?t=119713920000003) but it was ignored. This change also fixes other oddities (but not all) in this area. For example, before this patch: $ sleep 100 ^Z [1]+ Stopped sleep 100 $ strace -p `pidof sleep` Process 11442 attached - interrupt to quit strace hangs in do_wait(), because ->exit_code was already consumed by bash. After this patch, strace happily proceeds: --- SIGTSTP (Stopped) @ 0 (0) --- restart_syscall(<... resuming interrupted call ...> To me, this looks much more "natural" and correct. Another example. Let's suppose we have the main thread M and sub-thread T, the process is stopped, and its parent did wait(WSTOPPED). Now we can ptrace T but not M. This looks at least strange to me. Imho, do_wait() should not confuse the per-thread ptrace stops with the per-process job control stops. Signed-off-by: Oleg Nesterov Cc: Denys Vlasenko Cc: "Eric W. 
Biederman" Cc: Jan Kratochvil Cc: Kaz Kylheku Cc: Michael Kerrisk Cc: Roland McGrath Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..0c06b9efae3b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1417,6 +1417,18 @@ static int wait_task_zombie(struct task_struct *p, int options, return retval; } +static int *task_stopped_code(struct task_struct *p, bool ptrace) +{ + if (ptrace) { + if (task_is_stopped_or_traced(p)) + return &p->exit_code; + } else { + if (p->signal->flags & SIGNAL_STOP_STOPPED) + return &p->signal->group_exit_code; + } + return NULL; +} + /* * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold @@ -1427,7 +1439,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, int options, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { - int retval, exit_code, why; + int retval, exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; @@ -1437,22 +1449,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p, exit_code = 0; spin_lock_irq(&p->sighand->siglock); - if (unlikely(!task_is_stopped_or_traced(p))) - goto unlock_sig; - - if (!ptrace && p->signal->group_stop_count > 0) - /* - * A group stop is in progress and this is the group leader. - * We won't report until all threads have stopped. - */ + p_code = task_stopped_code(p, ptrace); + if (unlikely(!p_code)) goto unlock_sig; - exit_code = p->exit_code; + exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(options & WNOWAIT)) - p->exit_code = 0; + *p_code = 0; /* don't need the RCU readlock here as we're holding a spinlock */ uid = __task_cred(p)->uid; @@ -1608,7 +1614,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, */ *notask_error = 0; - if (task_is_stopped_or_traced(p)) + if (task_stopped_code(p, ptrace)) return wait_task_stopped(ptrace, p, options, infop, stat_addr, ru); -- cgit v1.2.3-59-g8ed1b From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:00 -0700 Subject: signals: remove 'handler' parameter to tracehook functions Container-init must behave like global-init to processes within the container and hence it must be immune to unhandled fatal signals from within the container (i.e SIG_DFL signals that terminate the process). But the same container-init must behave like a normal process to processes in ancestor namespaces and so if it receives the same fatal signal from a process in ancestor namespace, the signal must be processed. Implementing these semantics requires that send_signal() determine pid namespace of the sender but since signals can originate from workqueues/ interrupt-handlers, determining pid namespace of sender may not always be possible or safe. This patchset implements the design/simplified semantics suggested by Oleg Nesterov. The simplified semantics for container-init are: - container-init must never be terminated by a signal from a descendant process. - container-init must never be immune to SIGKILL from an ancestor namespace (so a process in parent namespace must always be able to terminate a descendant container). 
- container-init may be immune to unhandled fatal signals (like SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP are the only reliable signals to a container-init from ancestor namespace. This patch: Based on an earlier patch submitted by Oleg Nesterov and comments from Roland McGrath (http://lkml.org/lkml/2008/11/19/258). The handler parameter is currently unused in the tracehook functions. Besides, the tracehook functions are called with siglock held, so the functions can check the handler if they later need to. Removing the parameter simiplifies changes to sig_ignored() in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu Acked-by: Roland McGrath Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 2 +- include/linux/tracehook.h | 13 ++++--------- kernel/signal.c | 6 +++--- 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 19378715f415..b7cc21bc6ae0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1455,6 +1455,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) * system call instruction. */ if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) + tracehook_consider_fatal_signal(current, SIGTRAP)) send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6186a789d6c7..eb4c6545b384 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -388,17 +388,14 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info, * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_IGN or %SIG_DFL * * Return zero iff tracing doesn't care to examine this ignored signal, * so it can short-circuit normal delivery and never even get queued. - * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN. * * Called with @task->sighand->siglock held. */ static inline int tracehook_consider_ignored_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } @@ -407,19 +404,17 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task, * tracehook_consider_fatal_signal - suppress special handling of fatal signal * @task: task receiving the signal * @sig: signal number being sent - * @handler: %SIG_DFL or %SIG_IGN * * Return nonzero to prevent special handling of this termination signal. - * Normally @handler is %SIG_DFL. It can be %SIG_IGN if @sig is ignored, - * in which case force_sig() is about to reset it to %SIG_DFL. + * Normally handler for signal is %SIG_DFL. It can be %SIG_IGN if @sig is + * ignored, in which case force_sig() is about to reset it to %SIG_DFL. * When this returns zero, this signal might cause a quick termination * that does not give the debugger a chance to intercept the signal. * * Called with or without @task->sighand->siglock held. 
*/ static inline int tracehook_consider_fatal_signal(struct task_struct *task, - int sig, - void __user *handler) + int sig) { return (task_ptrace(task) & PT_PTRACED) != 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a11..92a1ab004498 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -74,7 +74,7 @@ static int sig_ignored(struct task_struct *t, int sig) /* * Tracers may want to know about even ignored signals. */ - return !tracehook_consider_ignored_signal(t, sig, handler); + return !tracehook_consider_ignored_signal(t, sig); } /* @@ -318,7 +318,7 @@ int unhandled_signal(struct task_struct *tsk, int sig) return 1; if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return !tracehook_consider_fatal_signal(tsk, sig, handler); + return !tracehook_consider_fatal_signal(tsk, sig); } @@ -777,7 +777,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || - !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { + !tracehook_consider_fatal_signal(t, sig))) { /* * This signal will be fatal to the whole group. */ -- cgit v1.2.3-59-g8ed1b From f008faff0e2777c8b3fe853891b774ca465938d8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:02 -0700 Subject: signals: protect init from unwanted signals more (This is a modified version of the patch submitted by Oleg Nesterov http://lkml.org/lkml/2008/11/18/249 and tries to address comments that came up in that discussion) init ignores the SIG_DFL signals but we queue them anyway, including SIGKILL. This is mostly OK, the signal will be dropped silently when dequeued, but the pending SIGKILL has 2 bad implications: - it implies fatal_signal_pending(), so we confuse things like wait_for_completion_killable/lock_page_killable. - for the sub-namespace inits, the pending SIGKILL can mask (legacy_queue) the subsequent SIGKILL from the parent namespace which must kill cinit reliably. (preparation, cinits don't have SIGNAL_UNKILLABLE yet) The patch can't help when init is ptraced, but ptracing of init is not "safe" anyway. Signed-off-by: Sukadev Bhattiprolu Acked-by: Roland McGrath Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 92a1ab004498..8bf7a40e5c71 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -55,10 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_ignored(struct task_struct *t, int sig) +static int sig_task_ignored(struct task_struct *t, int sig) { void __user *handler; + handler = sig_handler(t, sig); + + if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && + handler == SIG_DFL) + return 1; + + return sig_handler_ignored(handler, sig); +} + +static int sig_ignored(struct task_struct *t, int sig) +{ /* * Blocked signals are never ignored, since the * signal handler may change by the time it is @@ -67,8 +78,7 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - handler = sig_handler(t, sig); - if (!sig_handler_ignored(handler, sig)) + if (!sig_task_ignored(t, sig)) return 0; /* -- cgit v1.2.3-59-g8ed1b From 7978b567d31555fc828b8f945c605ad29e117b22 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:04 -0700 Subject: signals: add from_ancestor_ns parameter to send_signal() send_signal() (or its helper) needs to determine the pid namespace of the sender. But a signal sent via kill_pid_info_as_uid() comes from within the kernel and send_signal() does not need to determine the pid namespace of the sender. So define a helper for send_signal() which takes an additional parameter, 'from_ancestor_ns' and have kill_pid_info_as_uid() use that helper directly. The 'from_ancestor_ns' parameter will be used in a follow-on patch. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. 
Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8bf7a40e5c71..7b6de962a1af 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -823,8 +823,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - int group) +static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, + int group, int from_ancestor_ns) { struct sigpending *pending; struct sigqueue *q; @@ -899,6 +899,12 @@ out_set: return 0; } +static int send_signal(int sig, struct siginfo *info, struct task_struct *t, + int group) +{ + return __send_signal(sig, info, t, group, 0); +} + int print_fatal_signals; static void print_fatal_signal(struct pt_regs *regs, int signr) @@ -1143,7 +1149,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, if (sig && p->sighand) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - ret = __group_send_sig_info(sig, info, p); + ret = __send_signal(sig, info, p, 1, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); } out_unlock: -- cgit v1.2.3-59-g8ed1b From 921cf9f63089c7442d44083477620132f4cea066 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:05 -0700 Subject: signals: protect cinit from unblocked SIG_DFL signals Drop early any SIG_DFL or SIG_IGN signals to container-init from within the same container. But queue SIGSTOP and SIGKILL to the container-init if they are from an ancestor container. Blocked, fatal signals (i.e when SIG_DFL is to terminate) from within the container can still terminate the container-init. That will be addressed in the next patch. Note: To be bisect-safe, SIGNAL_UNKILLABLE will be set for container-inits in a follow-on patch. Until then, this patch is just a preparatory step. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. 
Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7b6de962a1af..fb19aae2363b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -55,20 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig) +static int sig_task_ignored(struct task_struct *t, int sig, + int from_ancestor_ns) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL) + handler == SIG_DFL && !from_ancestor_ns) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig) +static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) { /* * Blocked signals are never ignored, since the @@ -78,7 +79,7 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig)) + if (!sig_task_ignored(t, sig, from_ancestor_ns)) return 0; /* @@ -634,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info, * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p) +static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -718,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p) } } - return !sig_ignored(p, sig); + return !sig_ignored(p, sig, from_ancestor_ns); } /* @@ -832,7 +833,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, trace_sched_signal_send(sig, t); assert_spin_locked(&t->sighand->siglock); - if (!prepare_signal(sig, t)) + + if (!prepare_signal(sig, t, from_ancestor_ns)) return 0; pending = group ? &t->signal->shared_pending : &t->pending; @@ -902,7 +904,15 @@ out_set: static int send_signal(int sig, struct siginfo *info, struct task_struct *t, int group) { - return __send_signal(sig, info, t, group, 0); + int from_ancestor_ns = 0; + +#ifdef CONFIG_PID_NS + if (!is_si_special(info) && SI_FROMUSER(info) && + task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) + from_ancestor_ns = 1; +#endif + + return __send_signal(sig, info, t, group, from_ancestor_ns); } int print_fatal_signals; @@ -1336,7 +1346,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) goto ret; ret = 1; /* the signal is ignored */ - if (!prepare_signal(sig, t)) + if (!prepare_signal(sig, t, 0)) goto out; ret = 0; -- cgit v1.2.3-59-g8ed1b From e4da026f980df125a4918c3bb9fe93185c7ef12a Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:06 -0700 Subject: signals: zap_pid_ns_process() should use force_sig() send_signal() assumes that signals with SEND_SIG_PRIV are generated from within the same namespace. So any nested container-init processes become immune to the SIGKILL generated by kill_proc_info() in zap_pid_ns_processes(). Use force_sig() in zap_pid_ns_processes() instead - force_sig() clears the SIGNAL_UNKILLABLE flag ensuring the signal is processed by container-inits. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. 
Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index fab8ea86fac3..2d1001b4858d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; + struct task_struct *task; /* * The last thread in the cgroup-init thread group is terminating. @@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) read_lock(&tasklist_lock); nr = next_pidmap(pid_ns, 1); while (nr > 0) { - kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); + rcu_read_lock(); + + /* + * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring + * any nested-container's init processes don't ignore the + * signal + */ + task = pid_task(find_vpid(nr), PIDTYPE_PID); + if (task) + force_sig(SIGKILL, task); + + rcu_read_unlock(); + nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); -- cgit v1.2.3-59-g8ed1b From b3bfa0cba867f23365b81658b47efd906830879b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:08 -0700 Subject: signals: protect cinit from blocked fatal signals Normally SIG_DFL signals to global and container-init are dropped early. But if a signal is blocked when it is posted, we cannot drop the signal since the receiver may install a handler before unblocking the signal. Once this signal is queued however, the receiver container-init has no way of knowing if the signal was sent from an ancestor or descendant namespace. This patch ensures that contianer-init drops all SIG_DFL signals in get_signal_to_deliver() except SIGKILL/SIGSTOP. If SIGSTOP/SIGKILL originate from a descendant of container-init they are never queued (i.e dropped in sig_ignored() in an earler patch). If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued and container-init processes the signal. IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global or container-init, the signal must have been generated internally or must have come from an ancestor ns and we process the signal. Further, the signal_group_exit() check was needed to cover the case of a multi-threaded init sending SIGKILL to other threads when doing an exit() or exec(). But since the new sig_kernel_only() check covers the SIGKILL, the signal_group_exit() check is no longer needed and can be removed. Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for container-inits. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. 
Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ kernel/signal.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d7eb727eb535..adbea16ec649 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -841,6 +841,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); sig->flags = 0; + if (clone_flags & CLONE_NEWPID) + sig->flags |= SIGNAL_UNKILLABLE; sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; diff --git a/kernel/signal.c b/kernel/signal.c index fb19aae2363b..ba3da25f0eea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1870,9 +1870,16 @@ relock: /* * Global init gets no signals it doesn't want. + * Container-init gets no signals it doesn't want from same + * container. + * + * Note that if global/container-init sees a sig_kernel_only() + * signal here, the signal must have been generated internally + * or must have come from an ancestor namespace. In either + * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && - !signal_group_exit(signal)) + !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { -- cgit v1.2.3-59-g8ed1b From 6588c1e3ff01418acafd938db0740e3477dc8cb7 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:09 -0700 Subject: signals: SI_USER: Masquerade si_pid when crossing pid ns boundary When sending a signal to a descendant namespace, set ->si_pid to 0 since the sender does not have a pid in the receiver's namespace. Note: - If rt_sigqueueinfo() sets si_code to SI_USER when sending a signal across a pid namespace boundary, the value in ->si_pid will be cleared to 0. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ba3da25f0eea..d8034737db4c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -883,6 +883,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, break; default: copy_siginfo(&q->info, info); + if (from_ancestor_ns) + q->info.si_pid = 0; break; } } else if (!is_si_special(info)) { -- cgit v1.2.3-59-g8ed1b From 95c3eb76dc07fd81289888ffc42948196b34b444 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:11 -0700 Subject: ptrace: kill __ptrace_detach(), fix ->exit_state check Move the code from __ptrace_detach() to its single caller and kill this helper. Also, fix the ->exit_state check, we shouldn't wake up EXIT_DEAD tasks. Actually, I think task_is_stopped_or_traced() makes more sense, but this needs another patch. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c9cf48b21f05..f62a568e84ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -235,16 +235,6 @@ out: return retval; } -static inline void __ptrace_detach(struct task_struct *child, unsigned int data) -{ - child->exit_code = data; - /* .. re-parent .. */ - __ptrace_unlink(child); - /* .. 
and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_process(child); -} - int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) @@ -254,10 +244,16 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - write_lock_irq(&tasklist_lock); /* protect against de_thread()->release_task() */ - if (child->ptrace) - __ptrace_detach(child, data); + write_lock_irq(&tasklist_lock); + if (child->ptrace) { + child->exit_code = data; + + __ptrace_unlink(child); + + if (!child->exit_state) + wake_up_process(child); + } write_unlock_irq(&tasklist_lock); return 0; -- cgit v1.2.3-59-g8ed1b From 6d69cb87f05eef3b02370b2f7bae608ad2301a00 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:12 -0700 Subject: ptrace: simplify ptrace_exit()->ignoring_children() path ignoring_children() takes parent->sighand->siglock and checks k_sigaction[SIGCHLD] atomically. But this buys nothing, we can't get the "really" wrong result even if we race with sigaction(SIGCHLD). If we read the "stale" sa_handler/sa_flags we can pretend it was changed right after the check. Remove spin_lock(->siglock), and kill "int ign" which caches the result of ignoring_children() which becomes rather trivial. Perhaps it makes sense to export this helper, do_notify_parent() can use it too. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 0c06b9efae3b..7a8311422930 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -732,19 +732,15 @@ static void exit_mm(struct task_struct * tsk) } /* - * Return nonzero if @parent's children should reap themselves. - * - * Called with write_lock_irq(&tasklist_lock) held. + * Called with irqs disabled, returns true if childs should reap themselves. 
*/ -static int ignoring_children(struct task_struct *parent) +static int ignoring_children(struct sighand_struct *sigh) { int ret; - struct sighand_struct *psig = parent->sighand; - unsigned long flags; - spin_lock_irqsave(&psig->siglock, flags); - ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || - (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT)); - spin_unlock_irqrestore(&psig->siglock, flags); + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); return ret; } @@ -757,7 +753,6 @@ static int ignoring_children(struct task_struct *parent) static void ptrace_exit(struct task_struct *parent, struct list_head *dead) { struct task_struct *p, *n; - int ign = -1; list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { __ptrace_unlink(p); @@ -779,12 +774,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, parent)) do_notify_parent(p, p->exit_signal); - else { - if (ign < 0) - ign = ignoring_children(parent); - if (ign) - p->exit_signal = -1; - } + else if (ignoring_children(parent->sighand)) + p->exit_signal = -1; } if (task_detached(p)) { -- cgit v1.2.3-59-g8ed1b From b1b4c6799fb59e710454bfe0ab477cb8523a8667 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:13 -0700 Subject: ptrace: reintroduce __ptrace_detach() as a callee of ptrace_exit() No functional changes, preparation for the next patch. Move the "should we release this child" logic into the separate handler, __ptrace_detach(). Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 62 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7a8311422930..576eae233b53 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -744,6 +744,38 @@ static int ignoring_children(struct sighand_struct *sigh) return ret; } +/* Returns nonzero if the tracee should be released. */ +int __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state != EXIT_ZOMBIE) + return 0; + /* + * If it's a zombie, our attachedness prevented normal + * parent notification or self-reaping. Do notification + * now if it would have happened earlier. If it should + * reap itself we return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) + p->exit_signal = -1; + } + + if (!task_detached(p)) + return 0; + + /* Mark it as in the process of being reaped. */ + p->exit_state = EXIT_DEAD; + return 1; +} + /* * Detach all tasks we were using ptrace on. * Any that need to be release_task'd are put on the @dead list. 
@@ -755,36 +787,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - continue; - - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself, add it to the @dead list. We can't call - * release_task() here because we already hold tasklist_lock. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, parent)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(parent->sighand)) - p->exit_signal = -1; - } - - if (task_detached(p)) { - /* - * Mark it as in the process of being reaped. - */ - p->exit_state = EXIT_DEAD; + if (__ptrace_detach(parent, p)) list_add(&p->ptrace_entry, dead); - } } } -- cgit v1.2.3-59-g8ed1b From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:14 -0700 Subject: ptrace: fix possible zombie leak on PTRACE_DETACH When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed. If it has already passed exit_notify() we can leak a zombie, because a) ptracing disables the auto-reaping logic, and b) ->real_parent was not notified about the child's death. ptrace_detach() should follow the ptrace_exit's logic, change the code accordingly. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Tested-by: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 1 + kernel/ptrace.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 98b93ca4db06..1a2b0cb55535 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); +extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f62a568e84ec..ee553b6ad125 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,6 +237,8 @@ out: int ptrace_detach(struct task_struct *child, unsigned int data) { + int dead = 0; + if (!valid_signal(data)) return -EIO; @@ -244,18 +246,21 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - /* protect against de_thread()->release_task() */ write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) { child->exit_code = data; - __ptrace_unlink(child); + dead = __ptrace_detach(current, child); if (!child->exit_state) wake_up_process(child); } write_unlock_irq(&tasklist_lock); + if (unlikely(dead)) + release_task(child); + return 0; } -- cgit v1.2.3-59-g8ed1b From 0a967a044a777e8b9c739120927114ddc0094298 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:15 -0700 Subject: reparent_thread: 
don't call kill_orphaned_pgrp() if task_detached() If task_detached(p) == T, then either a) p is not the main thread, we will find the group leader on the ->children list. or b) p is the group leader but its ->exit_state = EXIT_DEAD. This can only happen when the last sub-thread has died, but in that case that thread has already called kill_orphaned_pgrp() from exit_notify(). In both cases kill_orphaned_pgrp() looks bogus. Move the task_detached() check up and simplify the code, this is also right from the "common sense" pov: we should do nothing with the detached childs, except move them to the new parent's ->children list. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 576eae233b53..405e6877168b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -818,6 +818,8 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) list_move_tail(&p->sibling, &p->real_parent->children); + if (task_detached(p)) + return; /* If this is a threaded reparent there is no need to * notify anyone anything has happened. */ @@ -825,15 +827,13 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) return; /* We don't want people slaying init. */ - if (!task_detached(p)) - p->exit_signal = SIGCHLD; + p->exit_signal = SIGCHLD; /* If we'd notified the old parent about this child's death, * also notify the new parent. */ if (!ptrace_reparented(p) && - p->exit_state == EXIT_ZOMBIE && - !task_detached(p) && thread_group_empty(p)) + p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); kill_orphaned_pgrp(p, father); -- cgit v1.2.3-59-g8ed1b From b1442b055c154699a6a2c436f3352f71b6beede3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:16 -0700 Subject: reparent_thread: fix the "is it traced" check reparent_thread() uses ptrace_reparented() to check whether this thread is ptraced, in that case we should not notify the new parent. But ptrace_reparented() is not exactly correct when the reparented thread is traced by /sbin/init, because forget_original_parent() has already changed ->real_parent. Currently, the only problem is the false notification. But with the next patch the kernel crash in this (yes, pathological) case. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 405e6877168b..5be0a406faeb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -832,7 +832,7 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) /* If we'd notified the old parent about this child's death, * also notify the new parent. */ - if (!ptrace_reparented(p) && + if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); -- cgit v1.2.3-59-g8ed1b From 7f5d3652d469cdf9eb2365dfea7ce3fb9e1409cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:17 -0700 Subject: reparent_thread: fix a zombie leak if /sbin/init ignores SIGCHLD If /sbin/init ignores SIGCHLD and we re-parent a zombie, it is leaked. reparent_thread() does do_notify_parent() which sets ->exit_signal = -1 in this case. 
This means that nobody except us can reap it, the detached task is not visible to do_wait(). Change reparent_thread() to return a boolean (like __pthread_detach) to indicate that the thread is dead and must be released. Also change forget_original_parent() to add the child to ptrace_dead list in this case. The naming becomes insane, the next patch does the cleanup. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5be0a406faeb..3e09b7cb3b20 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -810,8 +810,11 @@ static void ptrace_exit_finish(struct task_struct *parent, } } -static void reparent_thread(struct task_struct *p, struct task_struct *father) +/* Returns nonzero if the child should be released. */ +static int reparent_thread(struct task_struct *p, struct task_struct *father) { + int dead; + if (p->pdeath_signal) /* We already hold the tasklist_lock here. */ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); @@ -819,12 +822,12 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) list_move_tail(&p->sibling, &p->real_parent->children); if (task_detached(p)) - return; + return 0; /* If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (same_thread_group(p->real_parent, father)) - return; + return 0; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; @@ -832,11 +835,19 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) /* If we'd notified the old parent about this child's death, * also notify the new parent. */ + dead = 0; if (!p->ptrace && - p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) + p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { do_notify_parent(p, p->exit_signal); + if (task_detached(p)) { + p->exit_state = EXIT_DEAD; + dead = 1; + } + } kill_orphaned_pgrp(p, father); + + return dead; } /* @@ -896,7 +907,8 @@ static void forget_original_parent(struct task_struct *father) BUG_ON(p->ptrace); p->parent = p->real_parent; } - reparent_thread(p, father); + if (reparent_thread(p, father)) + list_add(&p->ptrace_entry, &ptrace_dead);; } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3-59-g8ed1b From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:18 -0700 Subject: forget_original_parent: split out the un-ptrace part By discussion with Roland. - Rename ptrace_exit() to exit_ptrace(), and change it to do all the necessary work with ->ptraced list by its own. - Move this code from exit.c to ptrace.c - Update the comment in ptrace_detach() to explain the rechecking of the child->ptrace. Signed-off-by: Oleg Nesterov Cc: "Eric W. 
Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- include/linux/sched.h | 5 +++ kernel/exit.c | 95 ++++---------------------------------------------- kernel/ptrace.c | 78 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 88 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1a2b0cb55535..67c15653fc23 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p); +extern void exit_ptrace(struct task_struct *tracer); extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 diff --git a/include/linux/sched.h b/include/linux/sched.h index 9186f8c5d5f2..b47c94e7560b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p) #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also diff --git a/kernel/exit.c b/kernel/exit.c index 3e09b7cb3b20..506693dfdd4e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait); static void exit_mm(struct task_struct * tsk); -static inline int task_detached(struct task_struct *p) -{ - return p->exit_signal == -1; -} - static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -/* - * Called with irqs disabled, returns true if childs should reap themselves. - */ -static int ignoring_children(struct sighand_struct *sigh) -{ - int ret; - spin_lock(&sigh->siglock); - ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || - (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); - spin_unlock(&sigh->siglock); - return ret; -} - -/* Returns nonzero if the tracee should be released. */ -int __ptrace_detach(struct task_struct *tracer, struct task_struct *p) -{ - __ptrace_unlink(p); - - if (p->exit_state != EXIT_ZOMBIE) - return 0; - /* - * If it's a zombie, our attachedness prevented normal - * parent notification or self-reaping. Do notification - * now if it would have happened earlier. If it should - * reap itself we return true. - * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. - */ - if (!task_detached(p) && thread_group_empty(p)) { - if (!same_thread_group(p->real_parent, tracer)) - do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) - p->exit_signal = -1; - } - - if (!task_detached(p)) - return 0; - - /* Mark it as in the process of being reaped. */ - p->exit_state = EXIT_DEAD; - return 1; -} - -/* - * Detach all tasks we were using ptrace on. - * Any that need to be release_task'd are put on the @dead list. - * - * Called with write_lock(&tasklist_lock) held. 
- */ -static void ptrace_exit(struct task_struct *parent, struct list_head *dead) -{ - struct task_struct *p, *n; - - list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) { - if (__ptrace_detach(parent, p)) - list_add(&p->ptrace_entry, dead); - } -} - -/* - * Finish up exit-time ptrace cleanup. - * - * Called without locks. - */ -static void ptrace_exit_finish(struct task_struct *parent, - struct list_head *dead) -{ - struct task_struct *p, *n; - - BUG_ON(!list_empty(&parent->ptraced)); - - list_for_each_entry_safe(p, n, dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); - } -} - /* Returns nonzero if the child should be released. */ static int reparent_thread(struct task_struct *p, struct task_struct *father) { @@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); + exit_ptrace(father); + write_lock_irq(&tasklist_lock); reaper = find_new_reaper(father); - /* - * First clean up ptrace if we were using it. - */ - ptrace_exit(father, &ptrace_dead); list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; @@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father) write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&father->children)); - ptrace_exit_finish(father, &ptrace_dead); + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee553b6ad125..f5a9fa5aafa1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -235,9 +235,57 @@ out: return retval; } +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh) +{ + int ret; + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); + return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping. Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. + * But if our normal children self-reap, then this child + * was prevented by ptrace and we must reap it now. + */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state == EXIT_ZOMBIE) { + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) + p->exit_signal = -1; + } + if (task_detached(p)) { + /* Mark it as in the process of being reaped. 
*/ + p->exit_state = EXIT_DEAD; + return true; + } + } + + return false; +} + int ptrace_detach(struct task_struct *child, unsigned int data) { - int dead = 0; + bool dead = false; if (!valid_signal(data)) return -EIO; @@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data) clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); - /* protect against de_thread()->release_task() */ + /* + * This child can be already killed. Make sure de_thread() or + * our sub-thread doing do_wait() didn't do release_task() yet. + */ if (child->ptrace) { child->exit_code = data; @@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data) return 0; } +/* + * Detach all tasks we were using ptrace on. + */ +void exit_ptrace(struct task_struct *tracer) +{ + struct task_struct *p, *n; + LIST_HEAD(ptrace_dead); + + write_lock_irq(&tasklist_lock); + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (__ptrace_detach(tracer, p)) + list_add(&p->ptrace_entry, &ptrace_dead); + } + write_unlock_irq(&tasklist_lock); + + BUG_ON(!list_empty(&tracer->ptraced)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } +} + int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) { int copied = 0; -- cgit v1.2.3-59-g8ed1b From 5dfc80be73dd0c212d2e6dd8dbf5afa07e680bbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:19 -0700 Subject: forget_original_parent: do not abuse child->ptrace_entry By discussion with Roland. - Use ->sibling instead of ->ptrace_entry to chain the need to be release_task'd childs. Nobody else can use ->sibling, this task is EXIT_DEAD and nobody can find it on its own list. - rename ptrace_dead to dead_childs. - Now that we don't have the "parallel" untrace code, change back reparent_thread() to return void, pass dead_childs as an argument. Actually, I don't understand why do we notify /sbin/init when we reparent a zombie, probably it is better to reap it unconditionally. [akpm@linux-foundation.org: s/childs/children/] Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Cc: "Metzger, Markus T" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 87 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 506693dfdd4e..029415d9f82e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -726,46 +726,6 @@ static void exit_mm(struct task_struct * tsk) mmput(mm); } -/* Returns nonzero if the child should be released. */ -static int reparent_thread(struct task_struct *p, struct task_struct *father) -{ - int dead; - - if (p->pdeath_signal) - /* We already hold the tasklist_lock here. */ - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - - list_move_tail(&p->sibling, &p->real_parent->children); - - if (task_detached(p)) - return 0; - /* If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) - return 0; - - /* We don't want people slaying init. */ - p->exit_signal = SIGCHLD; - - /* If we'd notified the old parent about this child's death, - * also notify the new parent. 
- */ - dead = 0; - if (!p->ptrace && - p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { - do_notify_parent(p, p->exit_signal); - if (task_detached(p)) { - p->exit_state = EXIT_DEAD; - dead = 1; - } - } - - kill_orphaned_pgrp(p, father); - - return dead; -} - /* * When we die, we re-parent all our children. * Try to give them to another thread in our thread @@ -805,10 +765,46 @@ static struct task_struct *find_new_reaper(struct task_struct *father) return pid_ns->child_reaper; } +/* +* Any that need to be release_task'd are put on the @dead list. + */ +static void reparent_thread(struct task_struct *father, struct task_struct *p, + struct list_head *dead) +{ + if (p->pdeath_signal) + group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); + + list_move_tail(&p->sibling, &p->real_parent->children); + + if (task_detached(p)) + return; + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (same_thread_group(p->real_parent, father)) + return; + + /* We don't want people slaying init. */ + p->exit_signal = SIGCHLD; + + /* If it has exited notify the new parent about this child's death. */ + if (!p->ptrace && + p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { + do_notify_parent(p, p->exit_signal); + if (task_detached(p)) { + p->exit_state = EXIT_DEAD; + list_move_tail(&p->sibling, dead); + } + } + + kill_orphaned_pgrp(p, father); +} + static void forget_original_parent(struct task_struct *father) { struct task_struct *p, *n, *reaper; - LIST_HEAD(ptrace_dead); + LIST_HEAD(dead_children); exit_ptrace(father); @@ -821,15 +817,14 @@ static void forget_original_parent(struct task_struct *father) BUG_ON(p->ptrace); p->parent = p->real_parent; } - if (reparent_thread(p, father)) - list_add(&p->ptrace_entry, &ptrace_dead);; + reparent_thread(father, p, &dead_children); } - write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&father->children)); - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); + list_for_each_entry_safe(p, n, &dead_children, sibling) { + list_del_init(&p->sibling); release_task(p); } } -- cgit v1.2.3-59-g8ed1b From 95a3540da9c81a5987be810e1d9a83640a366bd5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:21 -0700 Subject: ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic Another ancient bug. Consider this trivial test-case, int main(void) { int pid = fork(); if (pid) { ptrace(PTRACE_ATTACH, pid, NULL, NULL); wait(NULL); ptrace(PTRACE_DETACH, pid, NULL, NULL); } else { pause(); printf("WE HAVE A KERNEL BUG!!!\n"); } return 0; } the child must not "escape" for sys_pause(), but it can and this was seen in practice. This is because ptrace_detach does: if (!child->exit_state) wake_up_process(child); this wakeup can happen after this child has already restarted sys_pause(), because it gets another wakeup from ptrace_untrace(). With or without this patch, perhaps sys_pause() needs a fix. But this wakeup also breaks the SIGNAL_STOP_STOPPED logic in ptrace_untrace(). Remove this wakeup. The caller saw this task in TASK_TRACED state, and unless it was SIGKILL'ed in between __ptrace_unlink()->ptrace_untrace() should handle this case correctly. If it was SIGKILL'ed, we don't need to wakup the dying tracee too. 
Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Acked-by: Roland McGrath Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f5a9fa5aafa1..296e8105863a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -301,11 +301,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) */ if (child->ptrace) { child->exit_code = data; - dead = __ptrace_detach(current, child); - - if (!child->exit_state) - wake_up_process(child); } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3-59-g8ed1b From 1ee1184485df9c9a3503d3a684b911fb7c73d259 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:23 -0700 Subject: ptrace_untrace: fix the SIGNAL_STOP_STOPPED check This bug is ancient too. ptrace_untrace() must not resume the task if the group stop in progress, we should set TASK_STOPPED instead. Unfortunately, we still have problems here: - if the process/thread was traced, SIGNAL_STOP_STOPPED does not necessary means this thread group is stopped. - ptrace breaks the bookkeeping of ->group_stop_count. Signed-off-by: Oleg Nesterov Cc: Jerome Marchand Cc: Roland McGrath Cc: Denys Vlasenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 296e8105863a..5105f5a6a2ce 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { - if (child->signal->flags & SIGNAL_STOP_STOPPED) { + /* + * If the group stop is completed or in progress, + * this thread was already counted as stopped. + */ + if (child->signal->flags & SIGNAL_STOP_STOPPED || + child->signal->group_stop_count) __set_task_state(child, TASK_STOPPED); - } else { + else signal_wake_up(child, 1); - } } spin_unlock(&child->sighand->siglock); } -- cgit v1.2.3-59-g8ed1b From 2355b70fd59cb5be7de2052a9edeee7afb7ff099 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 2 Apr 2009 16:58:24 -0700 Subject: workqueue: avoid recursion in run_workqueue() 1) lockdep will complain when run_workqueue() performs recursion. 2) The recursive implementation of run_workqueue() means that flush_workqueue() and its documentation are inconsistent. This may hide deadlocks and other bugs. 3) The recursion in run_workqueue() will poison cwq->current_work, but flush_work() and __cancel_work_timer(), etcetera need a reliable cwq->current_work. 
Signed-off-by: Lai Jiangshan Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Eric Dumazet Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9aedd9fd825b..32f8e0d2bf5a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -48,8 +48,6 @@ struct cpu_workqueue_struct { struct workqueue_struct *wq; struct task_struct *thread; - - int run_depth; /* Detect run_workqueue() recursion depth */ } ____cacheline_aligned; /* @@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on); static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); - cwq->run_depth++; - if (cwq->run_depth > 3) { - /* morton gets to eat his hat */ - printk("%s: recursion depth exceeded: %d\n", - __func__, cwq->run_depth); - dump_stack(); - } while (!list_empty(&cwq->worklist)) { struct work_struct *work = list_entry(cwq->worklist.next, struct work_struct, entry); @@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) spin_lock_irq(&cwq->lock); cwq->current_work = NULL; } - cwq->run_depth--; spin_unlock_irq(&cwq->lock); } @@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) { - int active; + int active = 0; + struct wq_barrier barr; - if (cwq->thread == current) { - /* - * Probably keventd trying to flush its own queue. So simply run - * it by hand rather than deadlocking. - */ - run_workqueue(cwq); - active = 1; - } else { - struct wq_barrier barr; + WARN_ON(cwq->thread == current); - active = 0; - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; - } - spin_unlock_irq(&cwq->lock); - - if (active) - wait_for_completion(&barr.done); + spin_lock_irq(&cwq->lock); + if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { + insert_wq_barrier(cwq, &barr, &cwq->worklist); + active = 1; } + spin_unlock_irq(&cwq->lock); + + if (active) + wait_for_completion(&barr.done); return active; } -- cgit v1.2.3-59-g8ed1b From 11dea1900931ac73184b2f5163a13d24a4e572ea Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 2 Apr 2009 16:58:27 -0700 Subject: proc_sysctl: use CONFIG_PROC_SYSCTL around ipc and utsname proc_handlers As pointed out by Cedric Le Goater (in response to Alexey's original comment wrt mqns), ipc_sysctl.c and utsname_sysctl.c are using CONFIG_PROC_FS, not CONFIG_PROC_SYSCTL, to determine whether to define the proc_handlers. Change that. Signed-off-by: Serge E. 
Hallyn Cc: Cedric Le Goater Acked-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/ipc_sysctl.c | 2 +- kernel/utsname_sysctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 4a7a12c95abe..40eab7314aeb 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -26,7 +26,7 @@ static void *get_ipc(ctl_table *table) return which; } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3b34b3545936..92359cc747a7 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which) up_write(&uts_sem); } -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? -- cgit v1.2.3-59-g8ed1b From 8e654fba4a376f436bdfe361fc5cdbc87ac09b35 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 2 Apr 2009 16:58:33 -0700 Subject: sysctl: fix suid_dumpable and lease-break-time sysctls Arne de Bruijn points out that commit 76fdbb25f963de5dc1e308325f0578a2f92b1c2d ("coredump masking: bound suid_dumpable sysctl") mistakenly limits lease-break-time instead of suid_dumpable. Signed-off-by: Matthew Wilcox Reported-by: Arne de Bruijn Cc: Kawai, Hidehiro Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2e490a389dd2..5ec4543dfc06 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,12 +95,9 @@ static int sixty = 60; static int neg_one = -1; #endif -#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) -static int two = 2; -#endif - static int zero; static int one = 1; +static int two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; @@ -1373,10 +1370,7 @@ static struct ctl_table fs_table[] = { .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - .extra2 = &two, + .proc_handler = &proc_dointvec, }, #endif #ifdef CONFIG_AIO @@ -1417,7 +1411,10 @@ static struct ctl_table fs_table[] = { .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { -- cgit v1.2.3-59-g8ed1b From 2ae448efc87df6d328f5835969076c7f9fce59c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:36 -0700 Subject: pids: improve get_task_pid() to fix the unsafe sys_wait4()->task_pgrp() sys_wait4() does get_pid(task_pgrp(current)), this is not safe. We can add rcu lock/unlock around, but we already have get_task_pid() which can be improved to handle the special pids in more reliable manner. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. 
Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- kernel/pid.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 029415d9f82e..384f09caf2ef 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1737,7 +1737,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; - pid = get_pid(task_pgrp(current)); + pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); diff --git a/kernel/pid.c b/kernel/pid.c index 1b3586fe753a..6628abcc520e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); + if (type != PIDTYPE_PID) + task = task->group_leader; pid = get_pid(task->pids[type].pid); rcu_read_unlock(); return pid; -- cgit v1.2.3-59-g8ed1b From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:38 -0700 Subject: pids: refactor vnr/nr_ns helpers to make them safe Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy. task_pid_nr_ns(task) needs rcu/tasklist depending on task == current. As for "special" pids, vnr/nr_ns helpers always need rcu. However, if task != current, they are unsafe even under rcu lock, we can't trust task->group_leader without the special checks. And almost every helper has a callsite which needs a fix. Also, it is a bit annoying that the implementations of, say, task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical". This patch introduces the new helper, __task_pid_nr_ns(), which is always safe to use, and turns all other helpers into the trivial wrappers. After this I'll send another patch which converts task_tgid_xxx() as well, they're are a bit special. Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: "Eric W. 
Biederman" Cc: Pavel Emelyanov Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 27 ++++++++++++++++++++------- kernel/pid.c | 31 ++++++++++++++++--------------- 2 files changed, 36 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 722dd313bf8a..49df878a0cad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1519,17 +1519,23 @@ struct pid_namespace; * * see also pid_nr() etc in include/linux/pid.h */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} static inline pid_t task_pid_vnr(struct task_struct *tsk) { - return pid_vnr(task_pid(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } @@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk) return tsk->signal->__pgrp; } -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { - return pid_vnr(task_pgrp(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } @@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk) return tsk->signal->__session; } -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); +static inline pid_t task_session_nr_ns(struct task_struct *tsk, + struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} static inline pid_t task_session_vnr(struct task_struct *tsk) { - return pid_vnr(task_session(tsk)); + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/pid.c b/kernel/pid.c index 6628abcc520e..b2e5f78fd281 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid) } EXPORT_SYMBOL_GPL(pid_vnr); -pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns) { - return pid_nr_ns(task_pid(tsk), ns); + pid_t nr = 0; + + rcu_read_lock(); + if (!ns) + ns = current->nsproxy->pid_ns; + if (likely(pid_alive(task))) { + if (type != PIDTYPE_PID) + task = task->group_leader; + nr = pid_nr_ns(task->pids[type].pid, ns); + } + rcu_read_unlock(); + + return nr; } -EXPORT_SYMBOL(task_pid_nr_ns); +EXPORT_SYMBOL(__task_pid_nr_ns); pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) } EXPORT_SYMBOL(task_tgid_nr_ns); -pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_pgrp(tsk), ns); -} -EXPORT_SYMBOL(task_pgrp_nr_ns); - -pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_session(tsk), ns); -} -EXPORT_SYMBOL(task_session_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3-59-g8ed1b From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. 
Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 4 ++-- include/linux/sched.h | 43 ++++++------------------------------------- kernel/exit.c | 10 +++------- kernel/fork.c | 2 -- kernel/sys.c | 4 +--- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a44b701c5bba..66b99a2049e3 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); @@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0cad..206ac003e8c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. 
diff --git a/kernel/exit.c b/kernel/exit.c index 384f09caf2ef..3bec141c82f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void) void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index adbea16ec649..f74458231449 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882a..742cefa527e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { + if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } err = 0; out: -- cgit v1.2.3-59-g8ed1b From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 2 Apr 2009 16:58:57 -0700 Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists It would be nice to be able to extract the dmesg log from a vmcore file without needing to keep the debug symbols for the running kernel handy all the time. We have a facility to do this in /proc/vmcore. This patch adds the log_buf and log_end symbols to the vmcoreinfo area so that tools (like makedumpfile) can easily extract the dmesg logs from a vmcore image. 
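For illustration (not part of this patch; the helper below is a hypothetical userspace sketch): VMCOREINFO_SYMBOL() records each symbol as a "SYMBOL(name)=<hex address>" line in the vmcoreinfo note, so a dump tool that has read the note text can locate log_buf and friends roughly like this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical helper for a dump tool: given the vmcoreinfo note text,
 * return the recorded address of one symbol (e.g. "log_buf"), or 0 if
 * the symbol is not listed.
 */
static unsigned long long vmcoreinfo_symbol(const char *note, const char *name)
{
	char key[64];
	const char *p;

	snprintf(key, sizeof(key), "SYMBOL(%s)=", name);
	p = strstr(note, key);
	return p ? strtoull(p + strlen(key), NULL, 16) : 0;
}
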
[akpm@linux-foundation.org: several fixes and cleanups] [akpm@linux-foundation.org: fix unused log_buf_kexec_setup()] [akpm@linux-foundation.org: build fix] Signed-off-by: Neil Horman Cc: Simon Horman Acked-by: Vivek Goyal Cc: Neil Horman Cc: Simon Horman Cc: Vivek Goyal Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 4 ++++ kernel/kexec.c | 1 + kernel/printk.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e720b0da7751..556d781e69fe 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,7 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); +void log_buf_kexec_setup(void); #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +254,9 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } +static inline void log_buf_kexec_setup(void) +{ +} #endif extern int printk_needs_cpu(int cpu); diff --git a/kernel/kexec.c b/kernel/kexec.c index 93eed85fe017..589832aac41f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b0..a5f61a9acedb 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -135,6 +136,24 @@ static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +#ifdef CONFIG_KEXEC +/* + * This appends the listed symbols to /proc/vmcoreinfo + * + * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * obtain access to symbols that are otherwise very difficult to locate. These + * symbols are specifically used so that utilities can access and extract the + * dmesg log from a vmcore file after a crash. + */ +void log_buf_kexec_setup(void) +{ + VMCOREINFO_SYMBOL(log_buf); + VMCOREINFO_SYMBOL(log_end); + VMCOREINFO_SYMBOL(log_buf_len); + VMCOREINFO_SYMBOL(logged_chars); +} +#endif + static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); -- cgit v1.2.3-59-g8ed1b From edb79a213223488735fae1d408f4c136e9ed25d6 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Thu, 2 Apr 2009 16:58:58 -0700 Subject: kexec: vmcoreinfo_data[] can become static The vmcoreinfo_data[] array is not used outside of kernel/kexec.c, and can therefore become static. This patch adds the relevant keyword to the definition of the array. Noticed by sparse. Signed-off-by: Dmitri Vorobiev Cc: "Eric W. 
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 589832aac41f..5a758c6e4950 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -42,7 +42,7 @@ note_buf_t* crash_notes; /* vmcoreinfo stuff */ -unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; size_t vmcoreinfo_size; size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); -- cgit v1.2.3-59-g8ed1b From 2c53d9109f077900e140edb8b766132ad93b81cc Mon Sep 17 00:00:00 2001 From: Aravind Srinivasan Date: Thu, 2 Apr 2009 16:58:59 -0700 Subject: relay: fix for possible loss/corruption of produced subbufs Fix possible loss/corruption of produced subbufs in relay_subbufs_consumed(). When buf->subbufs_produced wraps around after UINT_MAX and buf->subbufs_consumed is still < UINT_MAX, the condition if (buf->subbufs_consumed > buf->subbufs_produced) will be true even for certain valid values of subbufs_consumed. This may lead to loss or corruption of produced subbufs. Signed-off-by: Aravind Srinivasan Cc: Tom Zanussi Cc: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 8f2179c8056f..e92db8c06acf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan, if (!chan) return; - if (cpu >= NR_CPUS || !chan->buf[cpu]) + if (cpu >= NR_CPUS || !chan->buf[cpu] || + subbufs_consumed > chan->n_subbufs) return; buf = chan->buf[cpu]; - buf->subbufs_consumed += subbufs_consumed; - if (buf->subbufs_consumed > buf->subbufs_produced) + if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) buf->subbufs_consumed = buf->subbufs_produced; + else + buf->subbufs_consumed += subbufs_consumed; } EXPORT_SYMBOL_GPL(relay_subbufs_consumed); -- cgit v1.2.3-59-g8ed1b From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:45 -0700 Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS SGI has observed that on large systems, interrupts are not serviced for a long period of time when waiting for a rwlock. The following patch series re-enables irqs while waiting for the lock, resembling the code which is already there for spinlocks. I only made the ia64 version, because the patch adds some overhead to the fast path. I assume there is currently no demand to have this for other architectures, because the systems are not so large. Of course, the possibility to implement raw_{read|write}_lock_flags for any architecture is still there. This patch: The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation depending on the config options, so that IRQ's are re-enabled when possible, but they remain disabled if CONFIG_LOCKDEP is set. 
Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Cc: Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 17 +++++++++++++++++ kernel/spinlock.c | 12 ++---------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5a58ea3e91e9..da5a5a1f4cd2 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -364,6 +364,23 @@ do { \ #endif /* CONFIG_LOCK_STAT */ +#ifdef CONFIG_LOCKDEP + +/* + * On lockdep we dont want the hand-coded irq-enable of + * _raw_*_lock_flags() code, because lockdep assumes + * that interrupts are not re-enabled during lock-acquire: + */ +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + LOCK_CONTENDED((_lock), (try), (lock)) + +#else /* CONFIG_LOCKDEP */ + +#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ + lockfl((_lock), (flags)) + +#endif /* CONFIG_LOCKDEP */ + #ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 29ab20749dd3..7283c6dc2d59 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -299,16 +299,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - /* - * On lockdep we dont want the hand-coded irq-enable of - * _raw_spin_lock_flags() code, because lockdep assumes - * that interrupts are not re-enabled during lock-acquire: - */ -#ifdef CONFIG_LOCKDEP - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); -#else - _raw_spin_lock_flags(lock, &flags); -#endif + LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, + _raw_spin_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave_nested); -- cgit v1.2.3-59-g8ed1b From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 2 Apr 2009 16:59:46 -0700 Subject: Allow rwlocks to re-enable interrupts Pass the original flags to rwlock arch-code, so that it can re-enable interrupts if implemented for that architecture. Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs which just do the same thing as non-flags variants. 
Signed-off-by: Petr Tesarik Signed-off-by: Robin Holt Acked-by: Peter Zijlstra Cc: Acked-by: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/spinlock.h | 3 +++ arch/arm/include/asm/spinlock.h | 3 +++ arch/cris/include/arch-v32/arch/spinlock.h | 2 ++ arch/ia64/include/asm/spinlock.h | 3 +++ arch/mips/include/asm/spinlock.h | 2 ++ arch/parisc/include/asm/spinlock.h | 3 +++ arch/powerpc/include/asm/spinlock.h | 3 +++ arch/s390/include/asm/spinlock.h | 3 +++ arch/sh/include/asm/spinlock.h | 3 +++ arch/sparc/include/asm/spinlock_32.h | 2 ++ arch/sparc/include/asm/spinlock_64.h | 2 ++ arch/x86/include/asm/spinlock.h | 3 +++ include/asm-m32r/spinlock.h | 3 +++ include/linux/spinlock.h | 6 ++++++ kernel/spinlock.c | 6 ++++-- 15 files changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index aeeb125f6851..e38fb95cb335 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -166,6 +166,9 @@ static inline void __raw_write_unlock(raw_rwlock_t * lock) lock->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 2b41ebbfa7ff..c13681ac1ede 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -217,6 +217,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *rw) /* read_can_lock - would read_trylock() succeed? */ #define __raw_read_can_lock(x) ((x)->lock < 0x80000000) +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/cris/include/arch-v32/arch/spinlock.h b/arch/cris/include/arch-v32/arch/spinlock.h index 0d5709b983a1..129756b96661 100644 --- a/arch/cris/include/arch-v32/arch/spinlock.h +++ b/arch/cris/include/arch-v32/arch/spinlock.h @@ -121,6 +121,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return 1; } +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index 0229fb95fb38..0a619618b2fa 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -213,6 +213,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *x) return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index 10e82441b496..5b60a09a0f08 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -480,6 +480,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return ret; } +#define __raw_read_lock_flags(lock, 
flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index f3d2090a18dc..fae03e136fa8 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -187,6 +187,9 @@ static __inline__ int __raw_write_can_lock(raw_rwlock_t *rw) return !rw->counter; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 36864364e601..c3b193121f81 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -287,6 +287,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) rw->lock = 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) __spin_yield(lock) #define _raw_read_relax(lock) __rw_yield(lock) #define _raw_write_relax(lock) __rw_yield(lock) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index df84ae96915f..f3861b09ebb0 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -172,6 +172,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return _raw_write_trylock_retry(rw); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h index e793181d64da..60283565f89b 100644 --- a/arch/sh/include/asm/spinlock.h +++ b/arch/sh/include/asm/spinlock.h @@ -216,6 +216,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw) return (oldval > (RW_LOCK_BIAS - 1)); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index bf2d532593e3..46f91ab66a50 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -177,6 +177,8 @@ static inline int __read_trylock(raw_rwlock_t *rw) #define __raw_write_unlock(rw) do { (rw)->lock = 0; } while(0) #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock) +#define __raw_read_lock_flags(rw, flags) __raw_read_lock(rw) +#define __raw_write_lock_flags(rw, flags) __raw_write_lock(rw) #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h index c4d274d330e9..f6b2b92ad8d2 100644 --- a/arch/sparc/include/asm/spinlock_64.h +++ b/arch/sparc/include/asm/spinlock_64.h @@ -211,9 +211,11 @@ static int inline __write_trylock(raw_rwlock_t *lock) } #define __raw_read_lock(p) __read_lock(p) +#define __raw_read_lock_flags(p, f) __read_lock(p) #define __raw_read_trylock(p) __read_trylock(p) #define __raw_read_unlock(p) __read_unlock(p) #define 
__raw_write_lock(p) __write_lock(p) +#define __raw_write_lock_flags(p, f) __write_lock(p) #define __raw_write_unlock(p) __write_unlock(p) #define __raw_write_trylock(p) __write_trylock(p) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 3a5696656680..e5e6caffec87 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h index f5cfba81ee10..dded923883b2 100644 --- a/include/asm-m32r/spinlock.h +++ b/include/asm-m32r/spinlock.h @@ -316,6 +316,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock) return 0; } +#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) +#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) + #define _raw_spin_relax(lock) cpu_relax() #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index a0c66a2e00ad..252b245cfcf4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -153,9 +153,11 @@ do { \ extern int _raw_spin_trylock(spinlock_t *lock); extern void _raw_spin_unlock(spinlock_t *lock); extern void _raw_read_lock(rwlock_t *lock); +#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock) extern int _raw_read_trylock(rwlock_t *lock); extern void _raw_read_unlock(rwlock_t *lock); extern void _raw_write_lock(rwlock_t *lock); +#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock) extern int _raw_write_trylock(rwlock_t *lock); extern void _raw_write_unlock(rwlock_t *lock); #else @@ -165,9 +167,13 @@ do { \ # define _raw_spin_trylock(lock) __raw_spin_trylock(&(lock)->raw_lock) # define _raw_spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock) # define _raw_read_lock(rwlock) __raw_read_lock(&(rwlock)->raw_lock) +# define _raw_read_lock_flags(lock, flags) \ + __raw_read_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_read_trylock(rwlock) __raw_read_trylock(&(rwlock)->raw_lock) # define _raw_read_unlock(rwlock) __raw_read_unlock(&(rwlock)->raw_lock) # define _raw_write_lock(rwlock) __raw_write_lock(&(rwlock)->raw_lock) +# define _raw_write_lock_flags(lock, flags) \ + __raw_write_lock_flags(&(lock)->raw_lock, *(flags)) # define _raw_write_trylock(rwlock) __raw_write_trylock(&(rwlock)->raw_lock) # define _raw_write_unlock(rwlock) __raw_write_unlock(&(rwlock)->raw_lock) #endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 7283c6dc2d59..7932653c4ebd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock, + _raw_read_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(lock, 
_raw_write_trylock, _raw_write_lock); + LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock, + _raw_write_lock_flags, &flags); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); -- cgit v1.2.3-59-g8ed1b From b1f77b0581b8fd837acb4a973f7d5496cae6efee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 03:20:49 +0100 Subject: kmemtrace, rcu: fix linux/rcutree.h and linux/rcuclassic.h dependencies Impact: build fix for all non-x86 architectures We want to remove percpu.h from rcuclassic.h/rcutree.h (for upcoming kmemtrace changes) but that would break the DECLARE_PER_CPU based declarations in these files. Move the quiescent counter management functions to their respective RCU implementation .c files - they were slightly above the inlining limit anyway. Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 15 ++------------- include/linux/rcutree.h | 26 ++------------------------ kernel/rcuclassic.c | 23 +++++++++++++++++++++-- kernel/rcutree.c | 28 ++++++++++++++++++++++++---- 4 files changed, 49 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 80044a4f3ab9..2d688b4461f7 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -108,25 +108,14 @@ struct rcu_data { struct rcu_head barrier; }; -DECLARE_PER_CPU(struct rcu_data, rcu_data); -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - /* * Increment the quiescent state counter. * The counter is a bit degenerated: We do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period. Thus just a flag. */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; -} +extern void rcu_qsctr_inc(int cpu); +extern void rcu_bh_qsctr_inc(int cpu); extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index a722fb67bb2d..5d6f425260bc 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -236,30 +236,8 @@ struct rcu_state { #endif /* #ifdef CONFIG_NO_HZ */ }; -extern struct rcu_state rcu_state; -DECLARE_PER_CPU(struct rcu_data, rcu_data); - -extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -/* - * Increment the quiescent state counter. - * The counter is a bit degenerated: We do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period. Thus just a flag. 
- */ -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - rdp->passed_quiesc = 1; - rdp->passed_quiesc_completed = rdp->completed; -} -static inline void rcu_bh_qsctr_inc(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc = 1; - rdp->passed_quiesc_completed = rdp->completed; -} +extern void rcu_qsctr_inc(int cpu); +extern void rcu_bh_qsctr_inc(int cpu); extern int rcu_pending(int cpu); extern int rcu_needs_cpu(int cpu); diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 654c640a6b9c..0f2b0b311304 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_BITS_NONE, }; + static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, @@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cpumask = CPU_BITS_NONE, }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; +} static int blimit = 10; static int qhimark = 10000; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 97ce31579ec0..a2015edfe167 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -72,11 +72,31 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .n_force_qs_ngp = 0, \ } -struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); -DEFINE_PER_CPU(struct rcu_data, rcu_data); +static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); +static DEFINE_PER_CPU(struct rcu_data, rcu_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { -- cgit v1.2.3-59-g8ed1b From 6258c4fb59e77d748f7efc2c137ad420372edd07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:42:24 +0100 Subject: kmemtrace, rcu: fix rcu_tree_trace.c data structure dependencies Impact: cleanup We want to remove rcutree internals from the public rcutree.h file for upcoming kmemtrace changes - but kernel/rcutree_trace.c depends on them. Introduce kernel/rcutree.h for internal definitions. 
(Probably all the other data types from include/linux/rcutree.h could be moved here too - except rcu_data.) Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 8 ++++---- kernel/rcutree.h | 10 ++++++++++ kernel/rcutree_trace.c | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 kernel/rcutree.h (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a2015edfe167..7f3266922572 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -72,11 +72,11 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); .n_force_qs_ngp = 0, \ } -static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); -static DEFINE_PER_CPU(struct rcu_data, rcu_data); +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); +DEFINE_PER_CPU(struct rcu_data, rcu_data); -static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); -static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); /* * Increment the quiescent state counter. diff --git a/kernel/rcutree.h b/kernel/rcutree.h new file mode 100644 index 000000000000..5e872bbf07f5 --- /dev/null +++ b/kernel/rcutree.h @@ -0,0 +1,10 @@ + +/* + * RCU implementation internal declarations: + */ +extern struct rcu_state rcu_state; +DECLARE_PER_CPU(struct rcu_data, rcu_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d6db3e837826..4ee954f6a8d5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -43,6 +43,8 @@ #include #include +#include "rcutree.h" + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) -- cgit v1.2.3-59-g8ed1b From a979241c532f07c201fe94e0a632107268f02578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:42:24 +0100 Subject: kmemtrace, rcu: fix rcupreempt.c data structure dependencies Impact: cleanup We want to remove percpu.h from rcupreempt.h, but if we do that the percpu primitives there wont build anymore. Move them to the .c file instead. Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/rcupreempt.h | 51 ++++++++-------------------------------------- kernel/rcupreempt.c | 48 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 74304b4538d8..3eb8fdd19445 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -40,30 +40,15 @@ #include #include -struct rcu_dyntick_sched { - int dynticks; - int dynticks_snap; - int sched_qs; - int sched_qs_snap; - int sched_dynticks_snap; -}; - -DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); - -static inline void rcu_qsctr_inc(int cpu) -{ - struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - - rdssp->sched_qs++; -} -#define rcu_bh_qsctr_inc(cpu) +extern void rcu_qsctr_inc(int cpu); +static inline void rcu_bh_qsctr_inc(int cpu) { } /* * Someone might want to pass call_rcu_bh as a function pointer. * So this needs to just be a rename and not a macro function. 
* (no parentheses) */ -#define call_rcu_bh call_rcu +#define call_rcu_bh call_rcu /** * call_rcu_sched - Queue RCU callback for invocation after sched grace period. @@ -117,30 +102,12 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; #ifdef CONFIG_NO_HZ - -static inline void rcu_enter_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - __get_cpu_var(rcu_dyntick_sched).dynticks++; - WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); -} - -static inline void rcu_exit_nohz(void) -{ - static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); - - __get_cpu_var(rcu_dyntick_sched).dynticks++; - smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), - &rs); -} - -#else /* CONFIG_NO_HZ */ -#define rcu_enter_nohz() do { } while (0) -#define rcu_exit_nohz() do { } while (0) -#endif /* CONFIG_NO_HZ */ +extern void rcu_enter_nohz(void); +extern void rcu_exit_nohz(void); +#else +# define rcu_enter_nohz() do { } while (0) +# define rcu_exit_nohz() do { } while (0) +#endif /* * A context switch is a grace period for rcupreempt synchronize_rcu() diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5d59e850fb71..ce97a4df64d3 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -147,7 +147,51 @@ struct rcu_ctrlblk { wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; +struct rcu_dyntick_sched { + int dynticks; + int dynticks_snap; + int sched_qs; + int sched_qs_snap; + int sched_dynticks_snap; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; + +void rcu_qsctr_inc(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs++; +} + +#ifdef CONFIG_NO_HZ + +void rcu_enter_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + __get_cpu_var(rcu_dyntick_sched).dynticks++; + WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); +} + +void rcu_exit_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + __get_cpu_var(rcu_dyntick_sched).dynticks++; + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), + &rs); +} + +#endif /* CONFIG_NO_HZ */ + + static DEFINE_PER_CPU(struct rcu_data, rcu_data); + static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, @@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - #ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); -- cgit v1.2.3-59-g8ed1b From ca2b84cb3c4a0d4d2143b46ec072cdff5d1b3b87 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:24 +0200 Subject: kmemtrace: use tracepoints kmemtrace now uses tracepoints instead of markers. We no longer need to use format specifiers to pass arguments. Signed-off-by: Eduard - Gabriel Munteanu [ folded: Use the new TP_PROTO and TP_ARGS to fix the build. ] [ folded: fix build when CONFIG_KMEMTRACE is disabled. ] [ folded: define tracepoints when CONFIG_TRACEPOINTS is enabled. 
] Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/slab_def.h | 10 +-- include/linux/slub_def.h | 12 +-- include/trace/kmemtrace.h | 92 +++++++++------------ kernel/trace/kmemtrace.c | 206 ++++++++++++++++++++++++++++++++-------------- kernel/trace/trace.h | 6 ++ mm/slab.c | 24 +++--- mm/slob.c | 28 +++---- mm/slub.c | 30 +++---- mm/util.c | 16 ++++ 9 files changed, 251 insertions(+), 173 deletions(-) (limited to 'kernel') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index f4523651fa42..5ac9b0bcaf9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -73,8 +73,8 @@ found: ret = kmem_cache_alloc_notrace(cachep, flags); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, - size, slab_buffer_size(cachep), flags); + trace_kmalloc(_THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); return ret; } @@ -128,9 +128,9 @@ found: ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, - ret, size, slab_buffer_size(cachep), - flags, node); + trace_kmalloc_node(_THIS_IP_, ret, + size, slab_buffer_size(cachep), + flags, node); return ret; } diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a1f90528e70b..5046f90c1171 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -233,8 +233,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) unsigned int order = get_order(size); void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, - size, PAGE_SIZE << order, flags); + trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags); return ret; } @@ -255,9 +254,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) ret = kmem_cache_alloc_notrace(s, flags); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, - _THIS_IP_, ret, - size, s->size, flags); + trace_kmalloc(_THIS_IP_, ret, size, s->size, flags); return ret; } @@ -296,9 +293,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ret = kmem_cache_alloc_node_notrace(s, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _THIS_IP_, ret, - size, s->size, flags, node); + trace_kmalloc_node(_THIS_IP_, ret, + size, s->size, flags, node); return ret; } diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h index ad8b7857855a..28ee69f9cd46 100644 --- a/include/trace/kmemtrace.h +++ b/include/trace/kmemtrace.h @@ -9,65 +9,53 @@ #ifdef __KERNEL__ +#include #include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. 
*/ -}; #ifdef CONFIG_KMEMTRACE - extern void kmemtrace_init(void); - -extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node); - -extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr); - -#else /* CONFIG_KMEMTRACE */ - +#else static inline void kmemtrace_init(void) { } - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} +#endif + +DECLARE_TRACE(kmalloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmem_cache_alloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmalloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kmem_cache_alloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kfree, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); +DECLARE_TRACE(kmem_cache_free, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); #endif /* __KERNEL__ */ diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index ae201b3eda89..4f7b5db5dd06 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "trace.h" @@ -29,10 +30,150 @@ static struct tracer_flags kmem_tracer_flags = { .opts = kmem_opts }; - -static bool kmem_tracing_enabled __read_mostly; static struct trace_array *kmemtrace_array; +/* Trace allocations */ +static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static inline void kmemtrace_free(enum kmemtrace_type_id type_id, + 
unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static void kmemtrace_kmalloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmem_cache_alloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmalloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); +} + +static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); +} + +static int kmemtrace_start_probes(void) +{ + int err; + + err = register_trace_kmalloc(kmemtrace_kmalloc); + if (err) + return err; + err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + if (err) + return err; + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + if (err) + return err; + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + if (err) + return err; + err = register_trace_kfree(kmemtrace_kfree); + if (err) + return err; + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + unregister_trace_kmalloc(kmemtrace_kmalloc); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + unregister_trace_kfree(kmemtrace_kfree); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); +} + static int kmem_trace_init(struct trace_array *tr) { int cpu; @@ -41,14 +182,14 @@ static int kmem_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - kmem_tracing_enabled = true; + kmemtrace_start_probes(); return 0; } static void kmem_trace_reset(struct trace_array *tr) { - kmem_tracing_enabled = false; + kmemtrace_stop_probes(); } static void kmemtrace_headers(struct seq_file *s) @@ -260,63 +401,6 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -/* Trace allocations */ -void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t 
gfp_flags, - int node) -{ - struct ring_buffer_event *event; - struct kmemtrace_alloc_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_alloc_node); - -void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ring_buffer_event *event; - struct kmemtrace_free_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_free); - static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb0ce3fc36d3..cbc168f1e43d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -182,6 +182,12 @@ struct trace_power { struct power_trace state_data; }; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + struct kmemtrace_alloc_entry { struct trace_entry ent; enum kmemtrace_type_id type_id; diff --git a/mm/slab.c b/mm/slab.c index 9ec66c3e6ee0..fa00fd6a644d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3565,8 +3565,8 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); return ret; } @@ -3627,9 +3627,9 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) void *ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - obj_size(cachep), cachep->buffer_size, - flags, nodeid); + trace_kmem_cache_alloc_node(_RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); return ret; } @@ -3657,9 +3657,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) return cachep; ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - (unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); + trace_kmalloc_node((unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); return ret; } @@ -3709,9 +3708,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = __cache_alloc(cachep, flags, caller); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, - (unsigned long) caller, ret, - size, cachep->buffer_size, flags); + trace_kmalloc((unsigned long) caller, ret, + size, cachep->buffer_size, flags); return ret; } @@ -3757,7 +3755,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) 
__cache_free(cachep, objp); local_irq_restore(flags); - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); + trace_kmem_cache_free(_RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3785,7 +3783,7 @@ void kfree(const void *objp) __cache_free(c, (void *)objp); local_irq_restore(flags); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); + trace_kfree(_RET_IP_, objp); } EXPORT_SYMBOL(kfree); diff --git a/mm/slob.c b/mm/slob.c index 4dd6516447f2..00003587ebfa 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -490,9 +490,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) *m = size; ret = (void *)m + align; - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, size + align, gfp, node); + trace_kmalloc_node(_RET_IP_, ret, + size, size + align, gfp, node); } else { unsigned int order = get_order(size); @@ -503,9 +502,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) page->private = size; } - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } return ret; @@ -527,7 +525,7 @@ void kfree(const void *block) } else put_page(&sp->page); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); + trace_kfree(_RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -599,16 +597,14 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, - _RET_IP_, b, c->size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, - _RET_IP_, b, c->size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); } if (c->ctor) @@ -646,7 +642,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) __kmem_cache_free(b, c->size); } - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); + trace_kmem_cache_free(_RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 7aaa121d0ea9..a98078bf738b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1621,8 +1621,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - s->objsize, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); return ret; } @@ -1641,8 +1640,8 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, - s->objsize, s->size, gfpflags, node); + trace_kmem_cache_alloc_node(_RET_IP_, ret, + s->objsize, s->size, gfpflags, node); return ret; } @@ -1767,7 +1766,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) slab_free(s, page, x, _RET_IP_); - kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); + trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2702,8 +2701,7 @@ void *__kmalloc(size_t size, gfp_t flags) ret = slab_alloc(s, flags, -1, _RET_IP_); - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, - size, s->size, flags); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 
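[Editor's note: the sketch below is not part of the original commit; it is a simplified illustration of the failure mode, using hypothetical struct and function names. ring_buffer_lock_reserve() hands back reusable buffer space without zeroing it, so any field a probe forgets to assign is reported to userspace with whatever stale bytes were left there.]

	struct alloc_entry {			/* simplified stand-in for kmemtrace_alloc_entry */
		int		type_id;	/* the field the original probe never set */
		unsigned long	call_site;
		const void	*ptr;
	};

	static void fill_event(void *reserved, unsigned long call_site, const void *ptr)
	{
		struct alloc_entry *entry = reserved;	/* reserved ring-buffer space, not zeroed */

		entry->call_site = call_site;
		entry->ptr = ptr;
		/* entry->type_id still holds stale bytes from an earlier event;
		 * the one-line fix in this commit amounts to adding:
		 *	entry->type_id = type_id;
		 */
	}
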
return ret; } @@ -2729,10 +2727,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(size > SLUB_MAX_SIZE)) { ret = kmalloc_large_node(size, flags, node); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - _RET_IP_, ret, - size, PAGE_SIZE << get_order(size), - flags, node); + trace_kmalloc_node(_RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); return ret; } @@ -2744,8 +2741,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) ret = slab_alloc(s, flags, node, _RET_IP_); - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, - size, s->size, flags, node); + trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); return ret; } @@ -2807,7 +2803,7 @@ void kfree(const void *x) } slab_free(page->slab, page, object, _RET_IP_); - kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); + trace_kfree(_RET_IP_, x); } EXPORT_SYMBOL(kfree); @@ -3290,8 +3286,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, -1, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, - s->size, gfpflags); + trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; } @@ -3313,8 +3308,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, - size, s->size, gfpflags, node); + trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; } diff --git a/mm/util.c b/mm/util.c index 7c122e49f769..2599e83eea17 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,7 @@ #include #include #include +#include #include /** @@ -236,3 +237,18 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, return ret; } EXPORT_SYMBOL_GPL(get_user_pages_fast); + +/* Tracepoints definitions. */ +DEFINE_TRACE(kmalloc); +DEFINE_TRACE(kmem_cache_alloc); +DEFINE_TRACE(kmalloc_node); +DEFINE_TRACE(kmem_cache_alloc_node); +DEFINE_TRACE(kfree); +DEFINE_TRACE(kmem_cache_free); + +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -- cgit v1.2.3-59-g8ed1b From da2635a9854423b4aa3a5f0e4e6efcc39ac99004 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:25 +0200 Subject: kmemtrace: kmemtrace_alloc() must fill type_id Impact: fix trace output kmemtrace_alloc() was not filling type_id, which allowed garbage to make it into tracing data. 
Signed-off-by: Eduard - Gabriel Munteanu LKML-Reference: <284dba2732a144849d5aa82258fe0de2ad8dcb0b.1237813499.git.eduard.munteanu@linux360.ro> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 4f7b5db5dd06..ae259f04ee39 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -52,6 +52,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_KMEM_ALLOC; + entry->type_id = type_id; entry->call_site = call_site; entry->ptr = ptr; entry->bytes_req = bytes_req; -- cgit v1.2.3-59-g8ed1b From 42af9054c0eeed09ec58d13ec8bf52d225ebcfcc Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 23 Mar 2009 15:12:26 +0200 Subject: kmemtrace: restore original tracing data binary format, improve ABI When kmemtrace was ported to ftrace, the marker strings were taken as an indication of how the traced data was being exposed to the userspace. However, the actual format had been binary, not text. This restores the original binary format, while also adding an origin CPU field (since ftrace doesn't expose the data per-CPU to userspace), and re-adding the timestamp field. It also drops arch-independent field sizing where it didn't make sense, so pointers won't always be 64 bits wide like they used to. Signed-off-by: Eduard - Gabriel Munteanu LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 82 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index ae259f04ee39..d8c2d0c91b4c 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -208,47 +208,81 @@ static void kmemtrace_headers(struct seq_file *s) } /* - * The two following functions give the original output from kmemtrace, - * or something close to....perhaps they need some missing things + * The following functions give the original output from kmemtrace, + * plus the origin CPU, since reordering occurs in-kernel now. 
*/ + +#define KMEMTRACE_USER_ALLOC 0 +#define KMEMTRACE_USER_FREE 1 + +struct kmemtrace_user_event { + u8 event_id; + u8 type_id; + u16 event_size; + u32 cpu; + u64 timestamp; + unsigned long call_site; + unsigned long ptr; +}; + +struct kmemtrace_user_event_alloc { + size_t bytes_req; + size_t bytes_alloc; + unsigned gfp_flags; + int node; +}; + static enum print_line_t -kmemtrace_print_alloc_original(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) { struct trace_seq *s = &iter->seq; - int ret; - - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr, - (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, - (unsigned long) entry->gfp_flags, entry->node); + struct kmemtrace_user_event *ev; + struct kmemtrace_user_event_alloc *ev_alloc; - if (!ret) + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) + return TRACE_TYPE_PARTIAL_LINE; + ev->event_id = KMEMTRACE_USER_ALLOC; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long) entry->ptr; + + ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); + if (!ev_alloc) return TRACE_TYPE_PARTIAL_LINE; + ev_alloc->bytes_req = entry->bytes_req; + ev_alloc->bytes_alloc = entry->bytes_alloc; + ev_alloc->gfp_flags = entry->gfp_flags; + ev_alloc->node = entry->node; return TRACE_TYPE_HANDLED; } static enum print_line_t -kmemtrace_print_free_original(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) { struct trace_seq *s = &iter->seq; - int ret; + struct kmemtrace_user_event *ev; - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr); - - if (!ret) + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) return TRACE_TYPE_PARTIAL_LINE; + ev->event_id = KMEMTRACE_USER_FREE; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long) entry->ptr; return TRACE_TYPE_HANDLED; } - /* The two other following provide a more minimalistic output */ static enum print_line_t kmemtrace_print_alloc_compress(struct trace_iterator *iter, @@ -385,7 +419,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_alloc_compress(iter, field); else - return kmemtrace_print_alloc_original(iter, field); + return kmemtrace_print_alloc_user(iter, field); } case TRACE_KMEM_FREE: { @@ -394,7 +428,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_free_compress(iter, field); else - return kmemtrace_print_free_original(iter, field); + return kmemtrace_print_free_user(iter, field); } default: -- cgit v1.2.3-59-g8ed1b From c826e3cd0c931d60d548f2468122da570d145556 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 16:14:13 +0100 Subject: kmemtrace: small cleanups Cc: Eduard - Gabriel Munteanu 
LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 128 +++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index d8c2d0c91b4c..5011f4d91e37 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -6,15 +6,16 @@ * Copyright (C) 2008 Frederic Weisbecker */ -#include +#include +#include #include +#include #include -#include -#include + #include -#include "trace.h" #include "trace_output.h" +#include "trace.h" /* Select an alternative, minimalistic output than the original one */ #define TRACE_KMEM_OPT_MINIMAL 0x1 @@ -26,8 +27,8 @@ static struct tracer_opt kmem_opts[] = { }; static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts + .val = 0, + .opts = kmem_opts }; static struct trace_array *kmemtrace_array; @@ -41,24 +42,25 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { - struct ring_buffer_event *event; - struct kmemtrace_alloc_entry *entry; struct trace_array *tr = kmemtrace_array; + struct kmemtrace_alloc_entry *entry; + struct ring_buffer_event *event; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; - entry = ring_buffer_event_data(event); + + entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; + entry->ent.type = TRACE_KMEM_ALLOC; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; ring_buffer_unlock_commit(tr->buffer, event); @@ -69,9 +71,9 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { - struct ring_buffer_event *event; - struct kmemtrace_free_entry *entry; struct trace_array *tr = kmemtrace_array; + struct kmemtrace_free_entry *entry; + struct ring_buffer_event *event; event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) @@ -79,10 +81,10 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; ring_buffer_unlock_commit(tr->buffer, event); @@ -216,48 +218,50 @@ static void kmemtrace_headers(struct seq_file *s) #define KMEMTRACE_USER_FREE 1 struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; + u8 event_id; + u8 type_id; + u16 event_size; + u32 cpu; + u64 timestamp; + unsigned long call_site; + unsigned long ptr; }; struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; + size_t bytes_req; + size_t bytes_alloc; + unsigned gfp_flags; + int node; }; static enum print_line_t kmemtrace_print_alloc_user(struct trace_iterator *iter, struct 
kmemtrace_alloc_entry *entry) { + struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long) entry->ptr; + + ev->event_id = KMEMTRACE_USER_ALLOC; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); if (!ev_alloc) return TRACE_TYPE_PARTIAL_LINE; - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; + + ev_alloc->bytes_req = entry->bytes_req; + ev_alloc->bytes_alloc = entry->bytes_alloc; + ev_alloc->gfp_flags = entry->gfp_flags; + ev_alloc->node = entry->node; return TRACE_TYPE_HANDLED; } @@ -272,13 +276,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter, ev = trace_seq_reserve(s, sizeof(*ev)); if (!ev) return TRACE_TYPE_PARTIAL_LINE; - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long) entry->ptr; + + ev->event_id = KMEMTRACE_USER_FREE; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; return TRACE_TYPE_HANDLED; } @@ -354,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, static enum print_line_t kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) + struct kmemtrace_free_entry *entry) { struct trace_seq *s = &iter->seq; int ret; @@ -415,6 +420,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) switch (entry->type) { case TRACE_KMEM_ALLOC: { struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_alloc_compress(iter, field); @@ -424,6 +430,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) case TRACE_KMEM_FREE: { struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_free_compress(iter, field); @@ -437,12 +444,12 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags }; void kmemtrace_init(void) @@ -454,5 +461,4 @@ static int __init init_kmem_tracer(void) { return register_tracer(&kmem_tracer); } - device_initcall(init_kmem_tracer); -- cgit v1.2.3-59-g8ed1b From a4b3ada83d06554d307dd54abdc62b2e5648264a Mon Sep 17 00:00:00 2001 
From: Carl Henrik Lunde Date: Fri, 3 Apr 2009 14:27:15 +0200 Subject: blktrace: NUL-terminate user space messages Impact: fix corrupted blkparse output Make sure messages from user space are NUL-terminated strings, otherwise we could dump random memory to the block trace file. Additionally, I've limited the message to BLK_TN_MAX_MSG-1 characters, because the last character would be stripped by vscnprintf anyway. Signed-off-by: Carl Henrik Lunde Cc: Li Zefan Cc: Arnaldo Carvalho de Melo Cc: "Alan D. Brunelle" Cc: Steven Rostedt LKML-Reference: <20090403122714.GT5178@kernel.dk> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 947c5b3f90c4..a400b861fad3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, char *msg; struct blk_trace *bt; - if (count > BLK_TN_MAX_MSG) + if (count > BLK_TN_MAX_MSG - 1) return -EINVAL; - msg = kmalloc(count, GFP_KERNEL); + msg = kmalloc(count + 1, GFP_KERNEL); if (msg == NULL) return -ENOMEM; @@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return -EFAULT; } + msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); -- cgit v1.2.3-59-g8ed1b From 7635b03adf3d7b84da7649b81efa91e6ebf11b85 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 3 Apr 2009 15:31:34 +0800 Subject: blktrace: small cleanup in blk_msg_write() Signed-off-by: Li Zefan Cc: Arnaldo Carvalho de Melo Cc: "Alan D. Brunelle" Cc: Jens Axboe LKML-Reference: <49D5BB56.7000807@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a400b861fad3..73d7860b72e2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -327,7 +327,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, char *msg; struct blk_trace *bt; - if (count > BLK_TN_MAX_MSG - 1) + if (count >= BLK_TN_MAX_MSG) return -EINVAL; msg = kmalloc(count + 1, GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From e2494e1b42ebac402324105d57646489d19e2b01 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 2 Apr 2009 13:43:26 +0800 Subject: blktrace: fix pdu_len when tracing packet command requests Impact: output all of packet commands - not just the first 4 / 8 bytes Since commit d7e3c3249ef23b4617393c69fe464765b4ff1645 ("block: add large command support"), struct request->cmd has been changed from unsinged char cmd[BLK_MAX_CDB] to unsigned char *cmd. v1 -> v2: by: FUJITA Tomonori - make sure rq->cmd_len is always intialized, and then we can use rq->cmd_len instead of BLK_MAX_CDB. 
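[Editor's note: the fragment below is an editorial illustration, not part of this commit, and the struct layouts are heavily simplified. It shows why sizeof(rq->cmd) stopped being a usable length once ->cmd became a pointer: sizeof() then yields the pointer width (4 or 8 bytes), which is why only the first 4/8 bytes of each packet command reached the trace.]

	#define BLK_MAX_CDB	16

	struct request_before {			/* prior to the large-command rework */
		unsigned char	cmd[BLK_MAX_CDB];	/* sizeof(rq->cmd) == 16 */
	};

	struct request_after {			/* after the rework */
		unsigned char	*cmd;			/* sizeof(rq->cmd) == 4 or 8 */
		unsigned short	cmd_len;		/* the length blktrace must use instead */
	};

Hence the trace call now passes rq->cmd_len, and blk_rq_init() initialises cmd_len to BLK_MAX_CDB so requests that never set it explicitly still report a sane length.
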
Signed-off-by: Li Zefan Acked-by: FUJITA Tomonori Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jens Axboe LKML-Reference: <49D4507E.2060602@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/blk-core.c | 1 + kernel/trace/blktrace.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 29bcfac6c688..859879d0a0bf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -132,6 +132,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; + rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; rq->ref_count = 1; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 73d7860b72e2..b32ff446c3fb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -643,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); + rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, -- cgit v1.2.3-59-g8ed1b From 07fe7cb7c7c179f473fd9c823348fd3eb5dad369 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Create a dynamically sized pool of threads for doing very slow work items Create a dynamically sized pool of threads for doing very slow work items, such as invoking mkdir() or rmdir() - things that may take a long time and may sleep, holding mutexes/semaphores and hogging a thread, and are thus unsuitable for workqueues. The number of threads is always at least a settable minimum, but more are started when there's more work to do, up to a limit. Because of the nature of the load, it's not suitable for a 1-thread-per-CPU type pool. A system with one CPU may well want several threads. This is used by FS-Cache to do slow caching operations in the background, such as looking up, creating or deleting cache objects. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 88 +++++++++++ init/Kconfig | 12 ++ kernel/Makefile | 1 + kernel/slow-work.c | 388 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 489 insertions(+) create mode 100644 include/linux/slow-work.h create mode 100644 kernel/slow-work.c (limited to 'kernel') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h new file mode 100644 index 000000000000..4dd754af393e --- /dev/null +++ b/include/linux/slow-work.h @@ -0,0 +1,88 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_SLOW_WORK_H +#define _LINUX_SLOW_WORK_H + +#ifdef CONFIG_SLOW_WORK + +struct slow_work; + +/* + * The operations used to support slow work items + */ +struct slow_work_ops { + /* get a ref on a work item + * - return 0 if successful, -ve if not + */ + int (*get_ref)(struct slow_work *work); + + /* discard a ref to a work item */ + void (*put_ref)(struct slow_work *work); + + /* execute a work item */ + void (*execute)(struct slow_work *work); +}; + +/* + * A slow work item + * - A reference is held on the parent object by the thread pool when it is + * queued + */ +struct slow_work { + unsigned long flags; +#define SLOW_WORK_PENDING 0 /* item pending (further) execution */ +#define SLOW_WORK_EXECUTING 1 /* item currently executing */ +#define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ +#define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ + const struct slow_work_ops *ops; /* operations table for this item */ + struct list_head link; /* link in queue */ +}; + +/** + * slow_work_init - Initialise a slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a slow work item. + */ +static inline void slow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 0; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +/** + * slow_work_init - Initialise a very slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a very slow work item. This item will be restricted such that + * only a certain number of the pool threads will be able to execute items of + * this type. + */ +static inline void vslow_work_init(struct slow_work *work, + const struct slow_work_ops *ops) +{ + work->flags = 1 << SLOW_WORK_VERY_SLOW; + work->ops = ops; + INIT_LIST_HEAD(&work->link); +} + +extern int slow_work_enqueue(struct slow_work *work); +extern int slow_work_register_user(void); +extern void slow_work_unregister_user(void); + + +#endif /* CONFIG_SLOW_WORK */ +#endif /* _LINUX_SLOW_WORK_H */ diff --git a/init/Kconfig b/init/Kconfig index 1398a14b0191..236a79377b8e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1014,6 +1014,18 @@ config MARKERS source "arch/Kconfig" +config SLOW_WORK + default n + bool "Enable slow work thread pool" + help + The slow work thread pool provides a number of dynamically allocated + threads that can be used by the kernel to perform operations that + take a relatively long time. + + An example of this would be CacheFiles doing a path lookup followed + by a series of mkdirs and a create call, all of which have to touch + disk. + endmenu # General setup config HAVE_GENERIC_DMA_COHERENT diff --git a/kernel/Makefile b/kernel/Makefile index e4791b3ba55d..bab1dffe37e9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SLOW_WORK) += slow-work.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/slow-work.c b/kernel/slow-work.c new file mode 100644 index 000000000000..5a7392734c82 --- /dev/null +++ b/kernel/slow-work.c @@ -0,0 +1,388 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +/* + * The pool of threads has at least min threads in it as long as someone is + * using the facility, and may have as many as max. + * + * A portion of the pool may be processing very slow operations. + */ +static unsigned slow_work_min_threads = 2; +static unsigned slow_work_max_threads = 4; +static unsigned vslow_work_proportion = 50; /* % of threads that may process + * very slow work */ +static atomic_t slow_work_thread_count; +static atomic_t vslow_work_executing_count; + +/* + * The queues of work items and the lock governing access to them. These are + * shared between all the CPUs. It doesn't make sense to have per-CPU queues + * as the number of threads bears no relation to the number of CPUs. + * + * There are two queues of work items: one for slow work items, and one for + * very slow work items. + */ +static LIST_HEAD(slow_work_queue); +static LIST_HEAD(vslow_work_queue); +static DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The thread controls. A variable used to signal to the threads that they + * should exit when the queue is empty, a waitqueue used by the threads to wait + * for signals, and a completion set by the last thread to exit. + */ +static bool slow_work_threads_should_exit; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); +static DECLARE_COMPLETION(slow_work_last_thread_exited); + +/* + * The number of users of the thread pool and its lock. Whilst this is zero we + * have no threads hanging around, and when this reaches zero, we wait for all + * active or queued work items to complete and kill all the threads we do have. + */ +static int slow_work_user_count; +static DEFINE_MUTEX(slow_work_user_lock); + +/* + * Calculate the maximum number of active threads in the pool that are + * permitted to process very slow work items. + * + * The answer is rounded up to at least 1, but may not equal or exceed the + * maximum number of the threads in the pool. This means we always have at + * least one thread that can process slow work items, and we always have at + * least one thread that won't get tied up doing so. + */ +static unsigned slow_work_calc_vsmax(void) +{ + unsigned vsmax; + + vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; + vsmax /= 100; + vsmax = max(vsmax, 1U); + return min(vsmax, slow_work_max_threads - 1); +} + +/* + * Attempt to execute stuff queued on a slow thread. Return true if we managed + * it, false if there was nothing to do. 
+ */ +static bool slow_work_execute(void) +{ + struct slow_work *work = NULL; + unsigned vsmax; + bool very_slow; + + vsmax = slow_work_calc_vsmax(); + + /* find something to execute */ + spin_lock_irq(&slow_work_queue_lock); + if (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax) { + work = list_entry(vslow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + atomic_inc(&vslow_work_executing_count); + very_slow = true; + } else if (!list_empty(&slow_work_queue)) { + work = list_entry(slow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + very_slow = false; + } else { + very_slow = false; /* avoid the compiler warning */ + } + spin_unlock_irq(&slow_work_queue_lock); + + if (!work) + return false; + + if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) + BUG(); + + work->ops->execute(work); + + if (very_slow) + atomic_dec(&vslow_work_executing_count); + clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + + /* if someone tried to enqueue the item whilst we were executing it, + * then it'll be left unenqueued to avoid multiple threads trying to + * execute it simultaneously + * + * there is, however, a race between us testing the pending flag and + * getting the spinlock, and between the enqueuer setting the pending + * flag and getting the spinlock, so we use a deferral bit to tell us + * if the enqueuer got there first + */ + if (test_bit(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irq(&slow_work_queue_lock); + + if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && + test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) + goto auto_requeue; + + spin_unlock_irq(&slow_work_queue_lock); + } + + work->ops->put_ref(work); + return true; + +auto_requeue: + /* we must complete the enqueue operation + * - we transfer our ref on the item back to the appropriate queue + * - don't wake another thread up as we're awake already + */ + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + spin_unlock_irq(&slow_work_queue_lock); + return true; +} + +/** + * slow_work_enqueue - Schedule a slow work item for processing + * @work: The work item to queue + * + * Schedule a slow work item for processing. If the item is already undergoing + * execution, this guarantees not to re-enter the execution routine until the + * first execution finishes. + * + * The item is pinned by this function as it retains a reference to it, managed + * through the item operations. The item is unpinned once it has been + * executed. + * + * An item may hog the thread that is running it for a relatively large amount + * of time, sufficient, for example, to perform several lookup, mkdir, create + * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. + * + * Conversely, if a number of items are awaiting processing, it may take some + * time before any given item is given attention. The number of threads in the + * pool may be increased to deal with demand, but only up to a limit. + * + * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in + * the very slow queue, from which only a portion of the threads will be + * allowed to pick items to execute. This ensures that very slow items won't + * overly block ones that are just ordinarily slow. 
+ * + * Returns 0 if successful, -EAGAIN if not. + */ +int slow_work_enqueue(struct slow_work *work) +{ + unsigned long flags; + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + BUG_ON(!work->ops->get_ref); + + /* when honouring an enqueue request, we only promise that we will run + * the work function in the future; we do not promise to run it once + * per enqueue request + * + * we use the PENDING bit to merge together repeat requests without + * having to disable IRQs and take the spinlock, whilst still + * maintaining our promise + */ + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + /* we promise that we will not attempt to execute the work + * function in more than one thread simultaneously + * + * this, however, leaves us with a problem if we're asked to + * enqueue the work whilst someone is executing the work + * function as simply queueing the work immediately means that + * another thread may try executing it whilst it is already + * under execution + * + * to deal with this, we set the ENQ_DEFERRED bit instead of + * enqueueing, and the thread currently executing the work + * function will enqueue the work item when the work function + * returns and it has cleared the EXECUTING bit + */ + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + } else { + if (work->ops->get_ref(work) < 0) + goto cant_get_ref; + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + wake_up(&slow_work_thread_wq); + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + return 0; + +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return -EAGAIN; +} +EXPORT_SYMBOL(slow_work_enqueue); + +/* + * Determine if there is slow work available for dispatch + */ +static inline bool slow_work_available(int vsmax) +{ + return !list_empty(&slow_work_queue) || + (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax); +} + +/* + * Worker thread dispatcher + */ +static int slow_work_thread(void *_data) +{ + int vsmax; + + DEFINE_WAIT(wait); + + set_freezable(); + set_user_nice(current, -5); + + for (;;) { + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + prepare_to_wait(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); + if (!freezing(current) && + !slow_work_threads_should_exit && + !slow_work_available(vsmax)) + schedule(); + finish_wait(&slow_work_thread_wq, &wait); + + try_to_freeze(); + + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + if (slow_work_available(vsmax) && slow_work_execute()) { + cond_resched(); + continue; + } + + if (slow_work_threads_should_exit) + break; + } + + if (atomic_dec_and_test(&slow_work_thread_count)) + complete_and_exit(&slow_work_last_thread_exited, 0); + return 0; +} + +/** + * slow_work_register_user - Register a user of the facility + * + * Register a user of the facility, starting up the initial threads if there + * aren't any other users at this point. This will return 0 if successful, or + * an error if not. 
+ */ +int slow_work_register_user(void) +{ + struct task_struct *p; + int loop; + + mutex_lock(&slow_work_user_lock); + + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); + init_completion(&slow_work_last_thread_exited); + + slow_work_threads_should_exit = false; + + /* start the minimum number of threads */ + for (loop = 0; loop < slow_work_min_threads; loop++) { + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) + goto error; + } + printk(KERN_NOTICE "Slow work thread pool: Ready\n"); + } + + slow_work_user_count++; + mutex_unlock(&slow_work_user_lock); + return 0; + +error: + if (atomic_dec_and_test(&slow_work_thread_count)) + complete(&slow_work_last_thread_exited); + if (loop > 0) { + printk(KERN_ERR "Slow work thread pool:" + " Aborting startup on ENOMEM\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_ERR "Slow work thread pool: Aborted\n"); + } + mutex_unlock(&slow_work_user_lock); + return PTR_ERR(p); +} +EXPORT_SYMBOL(slow_work_register_user); + +/** + * slow_work_unregister_user - Unregister a user of the facility + * + * Unregister a user of the facility, killing all the threads if this was the + * last one. + */ +void slow_work_unregister_user(void) +{ + mutex_lock(&slow_work_user_lock); + + BUG_ON(slow_work_user_count <= 0); + + slow_work_user_count--; + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_NOTICE "Slow work thread pool:" + " Shut down complete\n"); + } + + mutex_unlock(&slow_work_user_lock); +} +EXPORT_SYMBOL(slow_work_unregister_user); + +/* + * Initialise the slow work facility + */ +static int __init init_slow_work(void) +{ + unsigned nr_cpus = num_possible_cpus(); + + if (nr_cpus > slow_work_max_threads) + slow_work_max_threads = nr_cpus; + return 0; +} + +subsys_initcall(init_slow_work); -- cgit v1.2.3-59-g8ed1b From 109d9272c423f46604d45fedfe87e21ee0b25180 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Make slow-work thread pool actually dynamic Make the slow-work thread pool actually dynamic in the number of threads it contains. With this patch, it will both create additional threads when it has extra work to do, and cull excess threads that aren't doing anything. Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- kernel/slow-work.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 137 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 5a7392734c82..454abb21c8bd 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,6 +16,14 @@ #include #include +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +static void slow_work_cull_timeout(unsigned long); +static void slow_work_oom_timeout(unsigned long); + /* * The pool of threads has at least min threads in it as long as someone is * using the facility, and may have as many as max. 
@@ -29,6 +37,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process static atomic_t slow_work_thread_count; static atomic_t vslow_work_executing_count; +static bool slow_work_may_not_start_new_thread; +static bool slow_work_cull; /* cull a thread due to lack of activity */ +static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); +static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); +static struct slow_work slow_work_new_thread; /* new thread starter */ + /* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues @@ -89,6 +103,14 @@ static bool slow_work_execute(void) vsmax = slow_work_calc_vsmax(); + /* see if we can schedule a new thread to be started if we're not + * keeping up with the work */ + if (!waitqueue_active(&slow_work_thread_wq) && + (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && + atomic_read(&slow_work_thread_count) < slow_work_max_threads && + !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + /* find something to execute */ spin_lock_irq(&slow_work_queue_lock); if (!list_empty(&vslow_work_queue) && @@ -242,6 +264,33 @@ cant_get_ref: } EXPORT_SYMBOL(slow_work_enqueue); +/* + * Worker thread culling algorithm + */ +static bool slow_work_cull_thread(void) +{ + unsigned long flags; + bool do_cull = false; + + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (slow_work_cull) { + slow_work_cull = false; + + if (list_empty(&slow_work_queue) && + list_empty(&vslow_work_queue) && + atomic_read(&slow_work_thread_count) > + slow_work_min_threads) { + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + do_cull = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return do_cull; +} + /* * Determine if there is slow work available for dispatch */ @@ -273,7 +322,8 @@ static int slow_work_thread(void *_data) TASK_INTERRUPTIBLE); if (!freezing(current) && !slow_work_threads_should_exit && - !slow_work_available(vsmax)) + !slow_work_available(vsmax) && + !slow_work_cull) schedule(); finish_wait(&slow_work_thread_wq, &wait); @@ -285,11 +335,20 @@ static int slow_work_thread(void *_data) if (slow_work_available(vsmax) && slow_work_execute()) { cond_resched(); + if (list_empty(&slow_work_queue) && + list_empty(&vslow_work_queue) && + atomic_read(&slow_work_thread_count) > + slow_work_min_threads) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); continue; } if (slow_work_threads_should_exit) break; + + if (slow_work_cull && slow_work_cull_thread()) + break; } if (atomic_dec_and_test(&slow_work_thread_count)) @@ -297,6 +356,77 @@ static int slow_work_thread(void *_data) return 0; } +/* + * Handle thread cull timer expiration + */ +static void slow_work_cull_timeout(unsigned long data) +{ + slow_work_cull = true; + wake_up(&slow_work_thread_wq); +} + +/* + * Get a reference on slow work thread starter + */ +static int slow_work_new_thread_get_ref(struct slow_work *work) +{ + return 0; +} + +/* + * Drop a reference on slow work thread starter + */ +static void slow_work_new_thread_put_ref(struct slow_work *work) +{ +} + +/* + * Start a new slow work thread + */ +static void slow_work_new_thread_execute(struct slow_work *work) +{ + struct task_struct *p; + + if (slow_work_threads_should_exit) + return; + + if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) + return; + + if 
(!mutex_trylock(&slow_work_user_lock)) + return; + + slow_work_may_not_start_new_thread = true; + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) { + printk(KERN_DEBUG "Slow work thread pool: OOM\n"); + if (atomic_dec_and_test(&slow_work_thread_count)) + BUG(); /* we're running on a slow work thread... */ + mod_timer(&slow_work_oom_timer, + jiffies + SLOW_WORK_OOM_TIMEOUT); + } else { + /* ratelimit the starting of new threads */ + mod_timer(&slow_work_oom_timer, jiffies + 1); + } + + mutex_unlock(&slow_work_user_lock); +} + +static const struct slow_work_ops slow_work_new_thread_ops = { + .get_ref = slow_work_new_thread_get_ref, + .put_ref = slow_work_new_thread_put_ref, + .execute = slow_work_new_thread_execute, +}; + +/* + * post-OOM new thread start suppression expiration + */ +static void slow_work_oom_timeout(unsigned long data) +{ + slow_work_may_not_start_new_thread = false; +} + /** * slow_work_register_user - Register a user of the facility * @@ -316,6 +446,10 @@ int slow_work_register_user(void) init_completion(&slow_work_last_thread_exited); slow_work_threads_should_exit = false; + slow_work_init(&slow_work_new_thread, + &slow_work_new_thread_ops); + slow_work_may_not_start_new_thread = false; + slow_work_cull = false; /* start the minimum number of threads */ for (loop = 0; loop < slow_work_min_threads; loop++) { @@ -369,6 +503,8 @@ void slow_work_unregister_user(void) " Shut down complete\n"); } + del_timer_sync(&slow_work_cull_timer); + mutex_unlock(&slow_work_user_lock); } EXPORT_SYMBOL(slow_work_unregister_user); -- cgit v1.2.3-59-g8ed1b From 12e22c5e4bc08ab4b05ac079fe40d9891c5e81a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Make the slow work pool configurable Make the slow work pool configurable through /proc/sys/kernel/slow-work. (*) /proc/sys/kernel/slow-work/min-threads The minimum number of threads that should be in the pool as long as it is in use. This may be anywhere between 2 and max-threads. (*) /proc/sys/kernel/slow-work/max-threads The maximum number of threads that should in the pool. This may be anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. (*) /proc/sys/kernel/slow-work/vslow-percentage The percentage of active threads in the pool that may be used to execute very slow work items. This may be between 1 and 99. The resultant number is bounded to between 1 and one fewer than the number of active threads. This ensures there is always at least one thread that can process very slow work items, and always at least one thread that won't. 
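As a stand-alone illustration of the bounding rule described above, the arithmetic can be restated in plain user-space C (the helper name and the sample numbers below are made up for the example; the kernel uses its own max()/min() helpers and the live thread count):

    #include <stdio.h>

    /*
     * Restatement of the rule: the very-slow limit is vslow-percentage of
     * the active threads, rounded down by integer division, then clamped so
     * that at least one thread may take very slow items and at least one
     * thread never does.
     */
    static unsigned int vslow_limit(unsigned int nr_threads, unsigned int vslow_pct)
    {
            unsigned int vsmax = nr_threads * vslow_pct / 100;

            if (vsmax < 1)
                    vsmax = 1;
            if (vsmax > nr_threads - 1)
                    vsmax = nr_threads - 1;
            return vsmax;
    }

    int main(void)
    {
            /* 4 active threads at the default 50% -> 2 may go very slow */
            printf("%u\n", vslow_limit(4, 50));
            /* even at 1%, 2 active threads still leave 1 for very slow work */
            printf("%u\n", vslow_limit(2, 1));
            return 0;
    }
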
Signed-off-by: David Howells Acked-by: Serge Hallyn Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- include/linux/slow-work.h | 5 ++ kernel/slow-work.c | 118 +++++++++++++++++++++++++++++++++++++++++++++- kernel/sysctl.c | 9 ++++ 3 files changed, 130 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 4dd754af393e..8262809dfa8b 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -14,6 +14,8 @@ #ifdef CONFIG_SLOW_WORK +#include + struct slow_work; /* @@ -83,6 +85,9 @@ extern int slow_work_enqueue(struct slow_work *work); extern int slow_work_register_user(void); extern void slow_work_unregister_user(void); +#ifdef CONFIG_SYSCTL +extern ctl_table slow_work_sysctls[]; +#endif #endif /* CONFIG_SLOW_WORK */ #endif /* _LINUX_SLOW_WORK_H */ diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 454abb21c8bd..3f65900aa3cb 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -14,7 +14,6 @@ #include #include #include -#include #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of * things to do */ @@ -24,6 +23,14 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); +#ifdef CONFIG_SYSCTL +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); +#endif + /* * The pool of threads has at least min threads in it as long as someone is * using the facility, and may have as many as max. @@ -34,6 +41,51 @@ static unsigned slow_work_min_threads = 2; static unsigned slow_work_max_threads = 4; static unsigned vslow_work_proportion = 50; /* % of threads that may process * very slow work */ + +#ifdef CONFIG_SYSCTL +static const int slow_work_min_min_threads = 2; +static int slow_work_max_max_threads = 255; +static const int slow_work_min_vslow = 1; +static const int slow_work_max_vslow = 99; + +ctl_table slow_work_sysctls[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "min-threads", + .data = &slow_work_min_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_min_threads_sysctl, + .extra1 = (void *) &slow_work_min_min_threads, + .extra2 = &slow_work_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max-threads", + .data = &slow_work_max_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_max_threads_sysctl, + .extra1 = &slow_work_min_threads, + .extra2 = (void *) &slow_work_max_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "vslow-percentage", + .data = &vslow_work_proportion, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &slow_work_min_vslow, + .extra2 = (void *) &slow_work_max_vslow, + }, + { .ctl_name = 0 } +}; +#endif + +/* + * The active state of the thread pool + */ static atomic_t slow_work_thread_count; static atomic_t vslow_work_executing_count; @@ -427,6 +479,64 @@ static void slow_work_oom_timeout(unsigned long data) slow_work_may_not_start_new_thread = false; } +#ifdef CONFIG_SYSCTL +/* + * Handle adjustment of the minimum number of threads + */ +static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = 
proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to start or stop threads */ + n = atomic_read(&slow_work_thread_count) - + slow_work_min_threads; + + if (n < 0 && !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + else if (n > 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} + +/* + * Handle adjustment of the maximum number of threads + */ +static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to stop threads */ + n = slow_work_max_threads - + atomic_read(&slow_work_thread_count); + + if (n < 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} +#endif /* CONFIG_SYSCTL */ + /** * slow_work_register_user - Register a user of the facility * @@ -516,8 +626,12 @@ static int __init init_slow_work(void) { unsigned nr_cpus = num_possible_cpus(); - if (nr_cpus > slow_work_max_threads) + if (slow_work_max_threads < nr_cpus) slow_work_max_threads = nr_cpus; +#ifdef CONFIG_SYSCTL + if (slow_work_max_max_threads < nr_cpus * 2) + slow_work_max_max_threads = nr_cpus * 2; +#endif return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ec4543dfc06..82350f8f04f6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &scan_unevictable_handler, }, #endif +#ifdef CONFIG_SLOW_WORK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slow-work", + .mode = 0555, + .child = slow_work_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3-59-g8ed1b From 8f0aa2f25b31ba27db84259141e52ee6ec0d2820 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 3 Apr 2009 16:42:35 +0100 Subject: Document the slow work thread pool Document the slow work thread pool. Signed-off-by: David Howells Acked-by: Steve Dickson Acked-by: Trond Myklebust Acked-by: Al Viro Tested-by: Daire Byrne --- Documentation/slow-work.txt | 174 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/slow-work.h | 2 + kernel/slow-work.c | 2 + 3 files changed, 178 insertions(+) create mode 100644 Documentation/slow-work.txt (limited to 'kernel') diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt new file mode 100644 index 000000000000..ebc50f808ea4 --- /dev/null +++ b/Documentation/slow-work.txt @@ -0,0 +1,174 @@ + ==================================== + SLOW WORK ITEM EXECUTION THREAD POOL + ==================================== + +By: David Howells + +The slow work item execution thread pool is a pool of threads for performing +things that take a relatively long time, such as making mkdir calls. +Typically, when processing something, these items will spend a lot of time +blocking a thread on I/O, thus making that thread unavailable for doing other +work. 
+ +The standard workqueue model is unsuitable for this class of work item as that +limits the owner to a single thread or a single thread per CPU. For some +tasks, however, more threads - or fewer - are required. + +There is just one pool per system. It contains no threads unless something +wants to use it - and that something must register its interest first. When +the pool is active, the number of threads it contains is dynamic, varying +between a maximum and minimum setting, depending on the load. + + +==================== +CLASSES OF WORK ITEM +==================== + +This pool support two classes of work items: + + (*) Slow work items. + + (*) Very slow work items. + +The former are expected to finish much quicker than the latter. + +An operation of the very slow class may do a batch combination of several +lookups, mkdirs, and a create for instance. + +An operation of the ordinarily slow class may, for example, write stuff or +expand files, provided the time taken to do so isn't too long. + +Operations of both types may sleep during execution, thus tying up the thread +loaned to it. + + +THREAD-TO-CLASS ALLOCATION +-------------------------- + +Not all the threads in the pool are available to work on very slow work items. +The number will be between one and one fewer than the number of active threads. +This is configurable (see the "Pool Configuration" section). + +All the threads are available to work on ordinarily slow work items, but a +percentage of the threads will prefer to work on very slow work items. + +The configuration ensures that at least one thread will be available to work on +very slow work items, and at least one thread will be available that won't work +on very slow work items at all. + + +===================== +USING SLOW WORK ITEMS +===================== + +Firstly, a module or subsystem wanting to make use of slow work items must +register its interest: + + int ret = slow_work_register_user(); + +This will return 0 if successful, or a -ve error upon failure. + + +Slow work items may then be set up by: + + (1) Declaring a slow_work struct type variable: + + #include + + struct slow_work myitem; + + (2) Declaring the operations to be used for this item: + + struct slow_work_ops myitem_ops = { + .get_ref = myitem_get_ref, + .put_ref = myitem_put_ref, + .execute = myitem_execute, + }; + + [*] For a description of the ops, see section "Item Operations". + + (3) Initialising the item: + + slow_work_init(&myitem, &myitem_ops); + + or: + + vslow_work_init(&myitem, &myitem_ops); + + depending on its class. + +A suitably set up work item can then be enqueued for processing: + + int ret = slow_work_enqueue(&myitem); + +This will return a -ve error if the thread pool is unable to gain a reference +on the item, 0 otherwise. + + +The items are reference counted, so there ought to be no need for a flush +operation. When all a module's slow work items have been processed, and the +module has no further interest in the facility, it should unregister its +interest: + + slow_work_unregister_user(); + + +=============== +ITEM OPERATIONS +=============== + +Each work item requires a table of operations of type struct slow_work_ops. +All members are required: + + (*) Get a reference on an item: + + int (*get_ref)(struct slow_work *work); + + This allows the thread pool to attempt to pin an item by getting a + reference on it. This function should return 0 if the reference was + granted, or a -ve error otherwise. If an error is returned, + slow_work_enqueue() will fail. 
+ + The reference is held whilst the item is queued and whilst it is being + executed. The item may then be requeued with the same reference held, or + the reference will be released. + + (*) Release a reference on an item: + + void (*put_ref)(struct slow_work *work); + + This allows the thread pool to unpin an item by releasing the reference on + it. The thread pool will not touch the item again once this has been + called. + + (*) Execute an item: + + void (*execute)(struct slow_work *work); + + This should perform the work required of the item. It may sleep, it may + perform disk I/O and it may wait for locks. + + +================== +POOL CONFIGURATION +================== + +The slow-work thread pool has a number of configurables: + + (*) /proc/sys/kernel/slow-work/min-threads + + The minimum number of threads that should be in the pool whilst it is in + use. This may be anywhere between 2 and max-threads. + + (*) /proc/sys/kernel/slow-work/max-threads + + The maximum number of threads that should in the pool. This may be + anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater. + + (*) /proc/sys/kernel/slow-work/vslow-percentage + + The percentage of active threads in the pool that may be used to execute + very slow work items. This may be between 1 and 99. The resultant number + is bounded to between 1 and one fewer than the number of active threads. + This ensures there is always at least one thread that can process very + slow work items, and always at least one thread that won't. diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 8262809dfa8b..85958277f83d 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #ifndef _LINUX_SLOW_WORK_H diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 3f65900aa3cb..cf2bc01186ef 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -7,6 +7,8 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt */ #include -- cgit v1.2.3-59-g8ed1b From ca96a895a6bae7efe7b11a35d9f43e6228467562 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 9 Jan 2009 16:44:16 +0100 Subject: audit: EXECVE record - removed bogus newline (updated) Added hunk that changes the comment, the rest is the same. EXECVE records contain a newline after every argument. auditd converts "\n" to " " so you cannot see newlines even in raw logs, but they're there nevertheless. If you're not using auditd, you need to work round them. These '\n' chars are can be easily replaced by spaces when creating record in kernel. Note there is no need for trailing '\n' in an audit record. 
record before this patch: "type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\"\na1=\"a\"\na2=\"b\"\na3=\"c\"\n" record after this patch: "type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\" a1=\"a\" a2=\"b\" a3=\"c\"" Signed-off-by: Jiri Pirko Acked-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2bfc64786765..738c03695b79 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1024,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, { char arg_num_len_buf[12]; const char __user *tmp_p = p; - /* how many digits are in arg_num? 3 is the length of a=\n */ + /* how many digits are in arg_num? 3 is the length of " a=" */ size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; size_t len, len_left, to_send; size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; @@ -1110,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * so we can be sure nothing was lost. */ if ((i == 0) && (too_long)) - audit_log_format(*ab, "a%d_len=%zu ", arg_num, + audit_log_format(*ab, " a%d_len=%zu", arg_num, has_cntl ? 2*len : len); /* @@ -1130,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, buf[to_send] = '\0'; /* actually log it */ - audit_log_format(*ab, "a%d", arg_num); + audit_log_format(*ab, " a%d", arg_num); if (too_long) audit_log_format(*ab, "[%d]", i); audit_log_format(*ab, "="); @@ -1138,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context, audit_log_n_hex(*ab, buf, to_send); else audit_log_format(*ab, "\"%s\"", buf); - audit_log_format(*ab, "\n"); p += to_send; len_left -= to_send; @@ -1166,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context, p = (const char __user *)axi->mm->arg_start; - audit_log_format(*ab, "argc=%d ", axi->argc); + audit_log_format(*ab, "argc=%d", axi->argc); /* * we need some kernel buffer to hold the userspace args. 
Just -- cgit v1.2.3-59-g8ed1b From 6b96255998053a89f45c0855de954b71f5c3887b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 5 Jan 2009 13:41:13 -0800 Subject: auditsc: fix kernel-doc notation Fix auditsc kernel-doc notation: Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): No description found for parameter 'attr' Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): Excess function parameter 'u_attr' description in '__audit_mq_open' Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): No description found for parameter 'notification' Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): Excess function parameter 'u_notification' description in '__audit_mq_notify' Signed-off-by: Randy Dunlap cc: Al Viro cc: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 738c03695b79..b344b86557a2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag * @mode: mode bits - * @u_attr: queue attributes + * @attr: queue attributes * */ void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) @@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, /** * __audit_mq_notify - record audit data for a POSIX MQ notify * @mqdes: MQ descriptor - * @u_notification: Notification event + * @notification: Notification event * */ -- cgit v1.2.3-59-g8ed1b From c28bb7da74ab74a2860d652493aaff7de104d79e Mon Sep 17 00:00:00 2001 From: Zhenwen Xu Date: Thu, 12 Mar 2009 22:16:12 +0800 Subject: make the e->rule.xxx shorter in kernel auditfilter.c make the e->rule.xxx shorter in kernel/auditfilter.c -- --------------------------------- Zhenwen Xu - Open and Free Home Page: http://zhwen.org My Studio: http://dim4.cn >From 99692dc640b278f1cb1a15646ce42f22e89c0f77 Mon Sep 17 00:00:00 2001 From: Zhenwen Xu Date: Thu, 12 Mar 2009 22:04:59 +0800 Subject: [PATCH] make the e->rule.xxx shorter in kernel/auditfilter.c Signed-off-by: Zhenwen Xu Signed-off-by: Al Viro --- kernel/auditfilter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index fbf24d121d97..a6fe71fd5d1b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch) static inline void audit_free_rule(struct audit_entry *e) { int i; - + struct audit_krule *erule = &e->rule; /* some rules don't have associated watches */ - if (e->rule.watch) - audit_put_watch(e->rule.watch); - if (e->rule.fields) - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; + if (erule->watch) + audit_put_watch(erule->watch); + if (erule->fields) + for (i = 0; i < erule->field_count; i++) { + struct audit_field *f = &erule->fields[i]; kfree(f->lsm_str); security_audit_rule_free(f->lsm_rule); } - kfree(e->rule.fields); - kfree(e->rule.filterkey); + kfree(erule->fields); + kfree(erule->filterkey); kfree(e); } -- cgit v1.2.3-59-g8ed1b From b3897f567100d18e0597f638b911d23aa5e0dd23 Mon Sep 17 00:00:00 2001 From: Miloslav Trmac Date: Thu, 19 Mar 2009 09:48:27 -0400 Subject: Audit: fix handling of 'strings' with NULL characters currently audit_log_n_untrustedstring() uses audit_string_contains_control() to check if the 'string' has any control 
characters. If the 'string' has an embedded NULL audit_string_contains_control() will return that the data has no control characters and will then pass the string to audit_log_n_string with the total length, not the length up to the first NULL. audit_log_n_string() does a memcpy of the entire length and so the actual audit record emitted may then contain a NULL and then whatever random memory is after the NULL. Since we want to log the entire octet stream (if we can't trust the data to be a string we can't trust that a NULL isn't actually a part of it) we should just consider NULL as a control character. If the caller is certain they want to stop at the first NULL they should be using audit_log_untrustedstring. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ce6d8ea3131e..fa3805516dff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1382,7 +1382,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string, int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; - for (p = string; p < (const unsigned char *)string + len && *p; p++) { + for (p = string; p < (const unsigned char *)string + len; p++) { if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } -- cgit v1.2.3-59-g8ed1b From 55ad2f8d340678397de5916b9cd960f17ebd7150 Mon Sep 17 00:00:00 2001 From: Miloslav Trmac Date: Thu, 19 Mar 2009 09:52:47 -0400 Subject: audit: ignore terminating NUL in AUDIT_USER_TTY messages AUDIT_USER_TTY, like all other messages sent from user-space, is sent NUL-terminated. Unlike other user-space audit messages, which come only from trusted sources, AUDIT_USER_TTY messages are processed using audit_log_n_untrustedstring(). This patch modifies AUDIT_USER_TTY handling to ignore the trailing NUL and use the "quoted_string" representation of the message if possible. Signed-off-by: Miloslav Trmac Cc: Eric Paris Cc: Al Viro Cc: Steve Grubb Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/audit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index fa3805516dff..5560390cb0f5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_format(ab, " msg="); size = nlmsg_len(nlh); + if (size > 0 && + ((unsigned char *)data)[size - 1] == '\0') + size--; audit_log_n_untrustedstring(ab, data, size); } audit_set_pid(ab, pid); -- cgit v1.2.3-59-g8ed1b From 6d208da89aabee8502debe842832ca0ab298d16d Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 1 Apr 2009 15:47:27 -0400 Subject: audit: Fix possible return value truncation in audit_get_context() The audit subsystem treats syscall return codes as type long, unfortunately the audit_get_context() function mistakenly converts the return code to an int type in the parameters which could cause problems on systems where the sizeof(int) != sizeof(long). 
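The effect of the narrowing can be shown with a short user-space sketch (the function names are hypothetical and an LP64 target is assumed, i.e. 64-bit long and 32-bit int):

    #include <stdio.h>

    /* The caller holds the return code as a long, but the narrowed
     * prototype silently drops the high bits on the way through.
     */
    static long keep_long(long return_code) { return return_code; }
    static long via_int(int return_code)    { return return_code; }

    int main(void)
    {
            long rc = 0x100000000L;    /* needs more than 32 bits */

            printf("long parameter: %ld\n", keep_long(rc)); /* 4294967296 */
            printf("int parameter : %ld\n", via_int(rc));   /* 0 on typical targets */
            return 0;
    }
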
Signed-off-by: Paul Moore Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b344b86557a2..e821d626dfe6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -752,7 +752,7 @@ static void audit_set_auditable(struct audit_context *ctx) static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, - int return_code) + long return_code) { struct audit_context *context = tsk->audit_context; -- cgit v1.2.3-59-g8ed1b From 318b6d3d7ddbcad3d6867e630711b8a705d873d7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 13 Jan 2009 17:32:40 -0500 Subject: audit: incorrect ref counting in audit tree tag_chunk tag_chunk has bad exit paths in which the inotify ref counting is wrong. At the top of the function we found &old_watch using inotify_find_watch(). inotify_find_watch takes a reference to the watch. This is never dropped on an error path. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit_tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8ad9545b8db9..917ab9525568 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) mutex_lock(&inode->inotify_mutex); if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { mutex_unlock(&inode->inotify_mutex); + put_inotify_watch(&old->watch); free_chunk(chunk); return -ENOSPC; } @@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk->dead = 1; inotify_evict_watch(&chunk->watch); mutex_unlock(&inode->inotify_mutex); + put_inotify_watch(&old->watch); put_inotify_watch(&chunk->watch); return 0; } -- cgit v1.2.3-59-g8ed1b From 679173b724631f49e537a15fa48ea2000bdc1808 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 26 Jan 2009 18:09:45 -0500 Subject: audit: audit_set_auditable defined but not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit after 0590b9335a1c72a3f0defcc6231287f7817e07c8 audit_set_auditable() is now only used by the audit tree code. If CONFIG_AUDIT_TREE is unset it will be defined but unused. This patch simply moves the function inside a CONFIG_AUDIT_TREE block. cc1: warnings being treated as errors /home/acme_unencrypted/git/linux-2.6-tip/kernel/auditsc.c:745: error: ‘audit_set_auditable’ defined but not used make[2]: *** [kernel/auditsc.o] Error 1 make[1]: *** [kernel] Error 2 make[1]: *** Waiting for unfinished jobs.... 
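A minimal sketch of the pattern outside the kernel (all names below are hypothetical; compile with -Wall -Werror to reproduce the failure when the helper sits outside the block): a static helper that is only called from conditionally-compiled code has to live inside the same preprocessor block, otherwise the "defined but not used" warning becomes a build error whenever the option is off.

    #include <stdio.h>

    #define CONFIG_EXAMPLE_FEATURE 1    /* drop this line to see the point */

    #ifdef CONFIG_EXAMPLE_FEATURE
    static void feature_helper(void)
    {
            puts("feature path taken");
    }
    #endif

    int main(void)
    {
    #ifdef CONFIG_EXAMPLE_FEATURE
            feature_helper();
    #endif
            return 0;
    }
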
Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e821d626dfe6..aa0428e08367 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -329,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which) */ #ifdef CONFIG_AUDIT_TREE +static void audit_set_auditable(struct audit_context *ctx) +{ + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } +} + static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) { struct audit_tree_refs *p = ctx->trees; @@ -742,14 +750,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -static void audit_set_auditable(struct audit_context *ctx) -{ - if (!ctx->prio) { - ctx->prio = 1; - ctx->current_state = AUDIT_RECORD_CONTEXT; - } -} - static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, long return_code) -- cgit v1.2.3-59-g8ed1b From def57543418a5f47debae28a0a9dea2effc11692 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 10 Mar 2009 18:00:14 -0400 Subject: Audit: remove spaces from audit_log_d_path audit_log_d_path had spaces in the strings which would be emitted on the error paths. This patch simply replaces those spaces with an _ or removes the needless spaces entirely. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 4 ++-- kernel/auditsc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5560390cb0f5..9442c3533ba9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1440,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!pathname) { - audit_log_format(ab, ""); + audit_log_string(ab, ""); return; } p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? */ - audit_log_format(ab, ""); + audit_log_string(ab, ""); } else audit_log_untrustedstring(ab, p); kfree(pathname); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index aa0428e08367..7d6ac7c1f414 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); + audit_log_d_path(ab, "name=", &context->pwd); break; default: /* log the name's directory component */ -- cgit v1.2.3-59-g8ed1b From cd5f9a4c3199b090e91ea0064cb110985ba54814 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Apr 2009 13:38:46 -0700 Subject: kernel/sysctl.c: avoid annoying warnings Some of the limit constants are used only depending on some complex configuration dependencies, yet it's not worth making the simple variables depend on those configuration details. Just mark them as perhaps not being unused, and avoid the warning. 
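A user-space sketch of the same idea (the macro is written out here so the example stands alone; in the kernel __maybe_unused comes from the compiler headers): the attribute keeps the unused-variable warning quiet for definitions that are only referenced under some configurations, without making the definitions themselves configuration-dependent.

    #include <stdio.h>

    #define __maybe_unused __attribute__((unused))

    /* #define USE_LIMITS 1 */

    static int __maybe_unused one = 1;
    static int __maybe_unused two = 2;

    int main(void)
    {
    #ifdef USE_LIMITS
            printf("limits: %d..%d\n", one, two);
    #else
            puts("limits not used in this configuration");
    #endif
            return 0;
    }
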
Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 82350f8f04f6..b125e3387568 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,8 +97,8 @@ static int neg_one = -1; #endif static int zero; -static int one = 1; -static int two = 2; +static int __maybe_unused one = 1; +static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -- cgit v1.2.3-59-g8ed1b From 432870dab85a2f69dc417022646cb9a70acf7f94 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Apr 2009 16:16:02 +0200 Subject: exit_notify: kill the wrong capable(CAP_KILL) check The CAP_KILL check in exit_notify() looks just wrong, kill it. Whatever logic we have to reset ->exit_signal, the malicious user can bypass it if it execs the setuid application before exiting. Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6686ed1e4aa3..32cbf2607cb0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) */ if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) && - !capable(CAP_KILL)) + tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); -- cgit v1.2.3-59-g8ed1b From 2e45e77787c9d0720b046eb69856edf43b17e33e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 7 Apr 2009 17:12:43 +0930 Subject: Revert "module: remove the SHF_ALLOC flag on the __versions section." This reverts commit 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1. This was an impressively stupid patch. Firstly, we reset the SHF_ALLOC flag lower down in the same function, so the patch was useless. Even better, find_sec() ignores sections with SHF_ALLOC not set, so it breaks CONFIG_MODVERSIONS=y with CONFIG_MODULE_FORCE_LOAD=n, which refuses to load the module since it can't find the __versions section. Signed-off-by: Rusty Russell --- kernel/module.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c268a771595c..05f014efa32c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod, if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; #endif - /* Don't keep __versions around; it's just for loading. */ - if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; } modindex = find_sec(hdr, sechdrs, secstrings, -- cgit v1.2.3-59-g8ed1b From 301fd748e2c81e78e74edbc694a64caa7b95dda2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Apr 2009 11:12:23 -0400 Subject: tracing: remove CALLER_ADDR2 from wakeup tracer Maneesh Soni was getting a crash when running the wakeup tracer. We debugged it down to the recording of the function with the CALLER_ADDR2 macro. This is used to get the location of the caller to schedule. But the problem comes when schedule is called by assmebly. In the case that Maneesh had, retint_careful would call schedule. But retint_careful does not set up a proper frame pointer. 
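For background, the lookup behind CALLER_ADDRn (described in the next paragraph) walks saved frame pointers, which is why a single frame-pointer-less caller in the chain is enough to break it. A hedged user-space sketch of that walk (hypothetical functions; compile with gcc -O0 -fno-omit-frame-pointer; asking for levels above 0 is exactly the fragile part):

    #include <stdio.h>

    static void __attribute__((noinline)) leaf(void)
    {
            /* level 0: return address into middle(); level 1: into main().
             * Every extra level dereferences another saved frame pointer,
             * so it is only as safe as the least careful caller on the stack.
             */
            printf("level 0: %p\n", __builtin_return_address(0));
            printf("level 1: %p\n", __builtin_return_address(1));
    }

    static void __attribute__((noinline)) middle(void)
    {
            leaf();
    }

    int main(void)
    {
            middle();
            return 0;
    }
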
CALLER_ADDR2 is defined as __builtin_return_address(2). This produces the following assembly in the wakeup tracer code. mov 0x0(%rbp),%rcx <--- get the frame pointer of the caller mov %r14d,%r8d mov 0xf2de8e(%rip),%rdi mov 0x8(%rcx),%rsi <-- this is __builtin_return_address(1) mov 0x28(%rdi,%rax,8),%rbx mov (%rcx),%rax <-- get the frame pointer of the caller's caller mov %r12,%rcx mov 0x8(%rax),%rdx <-- this is __builtin_return_address(2) At the reading of 0x8(%rax) Maneesh's machine would take a fault. The reason is that retint_careful did not set up the return address and the content of %rax here was zero. To verify this, I sent Maneesh a patch to create a frame pointer in retint_careful. He ran the test again but this time he would take the same type of fault from sysret_careful. The retint_careful was no longer an issue, but there are other callers that still have issues. Instead of adding frame pointers for all callers to schedule (in possibly all archs), it is much safer to simply not use CALLER_ADDR2. This loses out on knowing what called schedule, but the function tracer will help there if needed. Reported-by: Maneesh Soni Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3c5ad6b2ec84..5bc00e8f153e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); /* @@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) data = wakeup_trace->data[wakeup_cpu]; data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: -- cgit v1.2.3-59-g8ed1b From cf8e3474654f20433aab9aa35826d43b5f245008 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 30 Mar 2009 13:48:00 +0800 Subject: tracing: fix incorrect return type of ns2usecs() Impact: fix time output bug in 32bits system ns2usecs() returns 'long', it's incorrect. (In i386) ... -0 [000] 521.442100: _spin_lock <-tick_do_update_jiffies64 -0 [000] 521.442101: do_timer <-tick_do_update_jiffies64 -0 [000] 521.442102: update_wall_time <-do_timer -0 [000] 521.442102: update_xtime_cache <-update_wall_time .... (It always print the time less than 2200 seconds besides ...) Because 'long' is 32bits in i386. ( (1<<31) useconds is about 2200 seconds) ... -0 [001] 4154502640.134759: rcu_bh_qsctr_inc <-__do_softirq -0 [001] 4154502640.134760: _local_bh_enable <-__do_softirq -0 [001] 4154502640.134761: idle_cpu <-irq_exit ... (very large value) Because 'long' is a signed type and it is 32bits in i386. 
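The wrap-around is easy to reproduce in user space (an i386-style 32-bit long is modelled with int32_t, plain 64-bit division stands in for do_div(), and the timestamp is illustrative; on typical two's-complement targets the narrowed value goes negative):

    #include <stdio.h>
    #include <stdint.h>

    static int32_t ns2usecs_broken(uint64_t nsec)
    {
            return (nsec + 500) / 1000;     /* truncated to 32 signed bits */
    }

    static uint64_t ns2usecs_fixed(uint64_t nsec)
    {
            return (nsec + 500) / 1000;
    }

    int main(void)
    {
            /* a 3000 second timestamp: ~3.0e9 usec no longer fits in 31 bits */
            uint64_t nsec = 3000ULL * 1000000000ULL;

            printf("32-bit long       : %d usec\n", ns2usecs_broken(nsec));
            printf("unsigned long long: %llu usec\n",
                   (unsigned long long)ns2usecs_fixed(nsec));
            return 0;
    }
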
Changes in v2: return 'unsigned long long' instead of 'cycle_t' Signed-off-by: Lai Jiangshan LKML-Reference: <49D05D10.4030009@cn.fujitsu.com> Reported-by: Li Zefan Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 +-- kernel/trace/trace.h | 2 +- kernel/trace/trace_output.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0174a40c563..457dd8c97e0d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -147,8 +147,7 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); -long -ns2usecs(cycle_t nsec) +unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; do_div(nsec, 1000); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb0ce3fc36d3..0d81a4a2a4a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -596,7 +596,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern long ns2usecs(cycle_t nsec); +extern unsigned long long ns2usecs(cycle_t nsec); extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d72b9a63b247..64b54a59c55b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, -- cgit v1.2.3-59-g8ed1b From 5f0c6c03c5fee91c02c696bc9bf4c0d41392abe7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Mar 2009 14:22:10 +0100 Subject: tracing/ftrace: fix missing include string.h Building a kernel with tracing can raise the following warning on tip/master: kernel/trace/trace.c:1249: error: implicit declaration of function 'vbin_printf' We are missing an include to string.h Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1238160130-7437-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 457dd8c97e0d..2230b46f9e1c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 8bcae09b93e7f96f700b6bb372c2b3f2b36636dc Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 31 Mar 2009 15:24:51 +0800 Subject: ftrace: Add check of sched_stopped for probe_sched_wakeup The wakeup tracing in sched_switch does not stop when a user disables tracing. This is because the probe_sched_wakeup() is missing the check to prevent the wakeup from being traced. 
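The shape of the fix is a plain guard clause at the top of the probe; as a user-space stand-in (the names mirror the kernel ones, but everything below is a hypothetical sketch):

    #include <stdbool.h>
    #include <stdio.h>

    static bool sched_stopped;      /* set once the user disables tracing */

    static void probe_sched_wakeup(int wakee_pid)
    {
            if (sched_stopped)      /* the check the patch adds */
                    return;
            printf("wakeup of pid %d recorded\n", wakee_pid);
    }

    int main(void)
    {
            probe_sched_wakeup(1);  /* recorded */
            sched_stopped = true;
            probe_sched_wakeup(2);  /* silently dropped */
            return 0;
    }
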
Signed-off-by: Zhao Lei LKML-Reference: <49D1C543.3010307@cn.fujitsu.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_switch.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index de35f200abd3..9117cea6f1ae 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) pc = preempt_count(); tracing_record_cmdline(current); + if (sched_stopped) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; -- cgit v1.2.3-59-g8ed1b From b0dfa978c7a1699fb3506fbfcba0b6a5c4bd17ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Apr 2009 22:53:08 +0200 Subject: tracing/ftrace: alloc the started cpumask for the trace file Impact: fix a crash while cat trace file Currently we are using a cpumask to remind each cpu where a trace occured. It lets us notice the user that a cpu just had its first trace. But on latest -tip we have the following crash once we cat the trace file: IP: [] print_trace_fmt+0x45/0xe7 *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/class/net/eth0/carrier Pid: 3897, comm: cat Not tainted (2.6.29-tip-02825-g0f22972-dirty #81) EIP: 0060:[] EFLAGS: 00010297 CPU: 0 EIP is at print_trace_fmt+0x45/0xe7 EAX: 00000000 EBX: 00000000 ECX: c12d9e98 EDX: ccdb7010 ESI: d31f4000 EDI: 00322401 EBP: d31f3f10 ESP: d31f3efc DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process cat (pid: 3897, ti=d31f2000 task=d3b3cf20 task.ti=d31f2000) Stack: d31f4080 ccdb7010 d31f4000 d691fe70 ccdb7010 d31f3f24 c0270e5c d31f4000 d691fe70 d31f4000 d31f3f34 c02718e8 c12d9e98 d691fe70 d31f3f70 c02bfc33 00001000 09130000 d3b46e00 d691fe98 00000000 00000079 00000001 00000000 Call Trace: [] ? print_trace_line+0x170/0x17c [] ? s_show+0xa7/0xbd [] ? seq_read+0x24a/0x327 [] ? seq_read+0x0/0x327 [] ? vfs_read+0x86/0xe1 [] ? sys_read+0x40/0x65 [] ? sysenter_do_call+0x12/0x3c Code: 00 00 00 89 45 ec f7 c7 00 20 00 00 89 55 f0 74 4e f6 86 98 10 00 00 02 74 45 8b 86 8c 10 00 00 8b 9e a8 10 00 00 e8 52 f3 ff ff <0f> a3 03 19 c0 85 c0 75 2b 8b 86 8c 10 00 00 8b 9e a8 10 00 00 EIP: [] print_trace_fmt+0x45/0xe7 SS:ESP 0068:d31f3efc CR2: 0000000000000000 ---[ end trace aa9cf38e5ebed9dd ]--- This is because we alloc the iter->started cpumask on tracing_pipe_open but not on tracing_open. It hadn't been noticed until now because we need to have ring buffer overruns to activate the starting of cpu buffer detection. Also, we need a check to not print the messagge for the first trace on the file. 
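The mechanism itself is small enough to sketch in user space (hypothetical structure and function names; calloc() stands in for alloc_cpumask_var()): the mask has to be set up by every open path that will later test it, and the per-CPU notice is suppressed for the very first entry of the trace.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct iter {
            bool *started;          /* one flag per possible CPU */
            long idx;               /* entries printed so far */
    };

    static int iter_open(struct iter *it, int nr_cpus)
    {
            it->started = calloc(nr_cpus, sizeof(*it->started));
            if (!it->started)
                    return -1;
            it->idx = 0;
            return 0;
    }

    static void iter_print(struct iter *it, int cpu, const char *msg)
    {
            it->idx++;
            if (!it->started[cpu]) {
                    it->started[cpu] = true;
                    if (it->idx > 1)
                            printf("##### CPU %d buffer started ####\n", cpu);
            }
            printf("[%03d] %s\n", cpu, msg);
    }

    static void iter_release(struct iter *it)
    {
            free(it->started);
    }

    int main(void)
    {
            struct iter it;

            if (iter_open(&it, 4))
                    return 1;
            iter_print(&it, 0, "first event, no notice");
            iter_print(&it, 2, "first event on another CPU, notice printed");
            iter_print(&it, 2, "second event on that CPU, no notice");
            iter_release(&it);
            return 0;
    }
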
Signed-off-by: Frederic Weisbecker LKML-Reference: <1238619188-6109-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2230b46f9e1c..fc8c7d66832b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter) return; cpumask_set_cpu(iter->cpu, iter->started); - trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); + + /* Don't print started cpu buffer for the first entry of the trace */ + if (iter->idx > 1) + trace_seq_printf(s, "##### CPU %u buffer started ####\n", + iter->cpu); } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) @@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + goto fail; + + cpumask_clear(iter->started); + if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } + free_cpumask_var(iter->started); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); @@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file) seq_release(inode, file); mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); kfree(iter->trace); kfree(iter); return 0; -- cgit v1.2.3-59-g8ed1b From bc2b6871c17b3aff79fb14e1a1c06c5f5a187f76 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Mon, 23 Mar 2009 11:58:31 +0530 Subject: Update /debug/tracing/README Some of the tracers have been renamed, which was not updated in the in-kernel run-time README file. Update it. 
Signed-off-by: Nikanth Karthikesan LKML-Reference: <200903231158.32151.knikanth@suse.de> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fc8c7d66832b..9d28476a9851 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2369,9 +2369,9 @@ static const char readme_msg[] = "# mkdir /debug\n" "# mount -t debugfs nodev /debug\n\n" "# cat /debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" "# cat /debug/tracing/current_tracer\n" - "none\n" + "nop\n" "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" -- cgit v1.2.3-59-g8ed1b From 1bbe2a83ab68e5cf8c66c372c7cb3b51910c2cfe Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 3 Apr 2009 18:24:46 +0800 Subject: ftrace: Correct a text align for event format output If we cat debugfs/tracing/events/ftrace/bprint/format, we'll see: name: bprint ID: 6 format: field:unsigned char common_type; offset:0; size:1; field:unsigned char common_flags; offset:1; size:1; field:unsigned char common_preempt_count; offset:2; size:1; field:int common_pid; offset:4; size:4; field:int common_tgid; offset:8; size:4; field:unsigned long ip; offset:12; size:4; field:char * fmt; offset:16; size:4; field: char buf; offset:20; size:0; print fmt: "%08lx (%d) fmt:%p %s" There is an inconsistent blank before char buf. Signed-off-by: Zhao Lei LKML-Reference: <49D5E3EE.70201@cn.fujitsu.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4d9952d3df50..07a22c33ebf3 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -40,7 +40,7 @@ #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ "offset:%u;\tsize:0;\n", \ (unsigned int)offsetof(typeof(field), item)); \ if (!ret) \ -- cgit v1.2.3-59-g8ed1b From fafd688e4c0c34da0f3de909881117d374e4c7af Mon Sep 17 00:00:00 2001 From: Peter W Morreale Date: Mon, 6 Apr 2009 19:00:29 -0700 Subject: mm: add /proc controls for pdflush threads Add /proc entries to give the admin the ability to control the minimum and maximum number of pdflush threads. This allows finer control of pdflush on both large and small machines. The rationale is simply one size does not fit all. Admins on large and/or small systems may want to tune the min/max pdflush thread count to best suit their needs. Right now the min/max is hardcoded to 2/8. While probably a fair estimate for smaller machines, large machines with large numbers of CPUs and large numbers of filesystems/block devices may benefit from larger numbers of threads working on different block devices. Even if the background flushing algorithm is radically changed, it is still likely that multiple threads will be involved and admins would still desire finer control on the min/max other than to have to recompile the kernel. The patch adds '/proc/sys/vm/nr_pdflush_threads_min' and '/proc/sys/vm/nr_pdflush_threads_max' with r/w permissions. 
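The two limits are cross-bounded, as the next paragraphs spell out; a user-space sketch of that relationship (hypothetical setters, mirroring what the sysctl table achieves by pointing each entry's extra1/extra2 at the opposite variable):

    #include <stdio.h>

    static int nr_pdflush_threads_min = 2;
    static int nr_pdflush_threads_max = 8;

    static int set_min(int val)
    {
            if (val < 1 || val > nr_pdflush_threads_max)
                    return -1;              /* rejected, like -EINVAL */
            nr_pdflush_threads_min = val;
            return 0;
    }

    static int set_max(int val)
    {
            if (val < nr_pdflush_threads_min || val > 1000)
                    return -1;
            nr_pdflush_threads_max = val;
            return 0;
    }

    int main(void)
    {
            printf("min=10: %d (above the current max of 8)\n", set_min(10));
            printf("max=16: %d\n", set_max(16));
            printf("min=10: %d (fits now)\n", set_min(10));
            return 0;
    }
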
The minimum value for nr_pdflush_threads_min is 1 and the maximum value is the current value of nr_pdflush_threads_max. This minimum is required since additional thread creation is performed in a pdflush thread itself. The minimum value for nr_pdflush_threads_max is the current value of nr_pdflush_threads_min and the maximum value can be 1000. Documentation/sysctl/vm.txt is also updated. [akpm@linux-foundation.org: fix comment, fix whitespace, use __read_mostly] Signed-off-by: Peter W Morreale Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 28 ++++++++++++++++++++++++++++ include/linux/writeback.h | 2 ++ kernel/sysctl.c | 23 +++++++++++++++++++++++ mm/pdflush.c | 31 +++++++++++++++++++------------ 4 files changed, 72 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3197fc83bc51..97c4b3284329 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm: - nr_hugepages - nr_overcommit_hugepages - nr_pdflush_threads +- nr_pdflush_threads_min +- nr_pdflush_threads_max - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -463,6 +465,32 @@ The default value is 0. ============================================================== +nr_pdflush_threads_min + +This value controls the minimum number of pdflush threads. + +At boot time, the kernel will create and maintain 'nr_pdflush_threads_min' +threads for the kernel's lifetime. + +The default value is 2. The minimum value you can specify is 1, and +the maximum value is the current setting of 'nr_pdflush_threads_max'. + +See 'nr_pdflush_threads_max' below for more information. + +============================================================== + +nr_pdflush_threads_max + +This value controls the maximum number of pdflush threads that can be +created. The pdflush algorithm will create a new pdflush thread (up to +this maximum) if no pdflush threads have been available for >= 1 second. + +The default value is 8. The minimum value you can specify is the +current value of 'nr_pdflush_threads_min' and the +maximum is 1000. + +============================================================== + overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 93445477f86a..9c1ed1fb6ddb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -168,6 +168,8 @@ void writeback_set_ratelimit(void); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. 
*/ +extern int nr_pdflush_threads_max; /* Global so it can be exported to sysctl */ +extern int nr_pdflush_threads_min; /* Global so it can be exported to sysctl */ #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b125e3387568..72eb1a41dcab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,6 +101,7 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -1026,6 +1027,28 @@ static struct ctl_table vm_table[] = { .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_min", + .data = &nr_pdflush_threads_min, + .maxlen = sizeof nr_pdflush_threads_min, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &nr_pdflush_threads_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_max", + .data = &nr_pdflush_threads_max, + .maxlen = sizeof nr_pdflush_threads_max, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &nr_pdflush_threads_min, + .extra2 = &one_thousand, + }, { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", diff --git a/mm/pdflush.c b/mm/pdflush.c index 235ac440c44e..f2caf96993f8 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -57,6 +57,14 @@ static DEFINE_SPINLOCK(pdflush_lock); */ int nr_pdflush_threads = 0; +/* + * The max/min number of pdflush threads. R/W by sysctl at + * /proc/sys/vm/nr_pdflush_threads_max/min + */ +int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; +int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; + + /* * The time at which the pdflush thread pool last went empty */ @@ -68,7 +76,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * by nr_pdflush_threads_min and nr_pdflush_threads_max. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -134,14 +142,13 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } + if (list_empty(&pdflush_list) && + nr_pdflush_threads < nr_pdflush_threads_max) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); } } @@ -153,7 +160,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + if (nr_pdflush_threads <= nr_pdflush_threads_min) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -259,9 +266,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. 
*/ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; + nr_pdflush_threads = nr_pdflush_threads_min; - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + for (i = 0; i < nr_pdflush_threads_min; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.2.3-59-g8ed1b From b918e5e60d775549478e4268155142156a95aa17 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:00:58 -0700 Subject: kprobes: cleanup aggr_kprobe related code Currently, kprobes can disable all probes at once, but can't disable it individually (not unregister, just disable an kprobe, because unregistering needs to wait for scheduler synchronization). These patches introduce APIs for on-the-fly per-probe disabling and re-enabling by dis-arming/re-arming its breakpoint instruction. This patch: Change old_p to ap in add_new_kprobe() for readability, copy flags member in add_aggr_kprobe(), and simplify the code flow of register_aggr_kprobe(). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 60 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5016bfb682b9..a55bfadfd766 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -518,20 +518,20 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) } /* -* Add the new probe to old_p->list. Fail if this is the +* Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { if (p->break_handler) { - if (old_p->break_handler) + if (ap->break_handler) return -EEXIST; - list_add_tail_rcu(&p->list, &old_p->list); - old_p->break_handler = aggr_break_handler; + list_add_tail_rcu(&p->list, &ap->list); + ap->break_handler = aggr_break_handler; } else - list_add_rcu(&p->list, &old_p->list); - if (p->post_handler && !old_p->post_handler) - old_p->post_handler = aggr_post_handler; + list_add_rcu(&p->list, &ap->list); + if (p->post_handler && !ap->post_handler) + ap->post_handler = aggr_post_handler; return 0; } @@ -544,6 +544,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; + ap->flags = p->flags; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -566,44 +567,43 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) { int ret = 0; - struct kprobe *ap; + struct kprobe *ap = old_p; - if (kprobe_gone(old_p)) { + if (old_p->pre_handler != aggr_pre_handler) { + /* If old_p is not an aggr_probe, create new aggr_kprobe. */ + ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!ap) + return -ENOMEM; + add_aggr_kprobe(ap, old_p); + } + + if (kprobe_gone(ap)) { /* * Attempting to insert new probe at the same location that * had a probe in the module vaddr area which already * freed. So, the instruction slot has already been * released. We need a new slot for the new probe. */ - ret = arch_prepare_kprobe(old_p); + ret = arch_prepare_kprobe(ap); if (ret) + /* + * Even if fail to allocate new slot, don't need to + * free aggr_probe. It will be used next time, or + * freed by unregister_kprobe. 
+ */ return ret; - } - if (old_p->pre_handler == aggr_pre_handler) { - copy_kprobe(old_p, p); - ret = add_new_kprobe(old_p, p); - ap = old_p; - } else { - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) { - if (kprobe_gone(old_p)) - arch_remove_kprobe(old_p); - return -ENOMEM; - } - add_aggr_kprobe(ap, old_p); - copy_kprobe(ap, p); - ret = add_new_kprobe(ap, p); - } - if (kprobe_gone(old_p)) { + /* Clear gone flag to prevent allocating new slot again. */ + ap->flags &= ~KPROBE_FLAG_GONE; /* * If the old_p has gone, its breakpoint has been disarmed. * We have to arm it again after preparing real kprobes. */ - ap->flags &= ~KPROBE_FLAG_GONE; if (kprobe_enabled) arch_arm_kprobe(ap); } - return ret; + + copy_kprobe(ap, p); + return add_new_kprobe(ap, p); } static int __kprobes in_kprobes_functions(unsigned long addr) -- cgit v1.2.3-59-g8ed1b From 99081ab553d6a88dd6b3774af5eef94dbb8faad7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:00:59 -0700 Subject: kprobes: move EXPORT_SYMBOL_GPL just after function definitions Clean up positions of EXPORT_SYMBOL_GPL in kernel/kprobes.c according to checkpatch.pl. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a55bfadfd766..ca4c22d78cfd 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -722,6 +722,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(register_kprobe); /* * Unregister a kprobe without a scheduler synchronization. @@ -803,11 +804,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_kprobes); void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } +EXPORT_SYMBOL_GPL(unregister_kprobe); void __kprobes unregister_kprobes(struct kprobe **kps, int num) { @@ -826,6 +829,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); } +EXPORT_SYMBOL_GPL(unregister_kprobes); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, @@ -865,16 +869,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_jprobes); int __kprobes register_jprobe(struct jprobe *jp) { return register_jprobes(&jp, 1); } +EXPORT_SYMBOL_GPL(register_jprobe); void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_jprobes(&jp, 1); } +EXPORT_SYMBOL_GPL(unregister_jprobe); void __kprobes unregister_jprobes(struct jprobe **jps, int num) { @@ -894,6 +901,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num) __unregister_kprobe_bottom(&jps[i]->kp); } } +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KRETPROBES /* @@ -987,6 +995,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) free_rp_inst(rp); return ret; } +EXPORT_SYMBOL_GPL(register_kretprobe); int __kprobes register_kretprobes(struct kretprobe **rps, int num) { @@ -1004,11 +1013,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_kretprobes); void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } +EXPORT_SYMBOL_GPL(unregister_kretprobe); void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { @@ 
-1030,24 +1041,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) } } } +EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } +EXPORT_SYMBOL_GPL(register_kretprobe); int __kprobes register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } +EXPORT_SYMBOL_GPL(register_kretprobes); + void __kprobes unregister_kretprobe(struct kretprobe *rp) { } +EXPORT_SYMBOL_GPL(unregister_kretprobe); void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { } +EXPORT_SYMBOL_GPL(unregister_kretprobes); static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) @@ -1418,16 +1435,5 @@ late_initcall(debugfs_kprobe_init); module_init(init_kprobes); -EXPORT_SYMBOL_GPL(register_kprobe); -EXPORT_SYMBOL_GPL(unregister_kprobe); -EXPORT_SYMBOL_GPL(register_kprobes); -EXPORT_SYMBOL_GPL(unregister_kprobes); -EXPORT_SYMBOL_GPL(register_jprobe); -EXPORT_SYMBOL_GPL(unregister_jprobe); -EXPORT_SYMBOL_GPL(register_jprobes); -EXPORT_SYMBOL_GPL(unregister_jprobes); +/* defined in arch/.../kernel/kprobes.c */ EXPORT_SYMBOL_GPL(jprobe_return); -EXPORT_SYMBOL_GPL(register_kretprobe); -EXPORT_SYMBOL_GPL(unregister_kretprobe); -EXPORT_SYMBOL_GPL(register_kretprobes); -EXPORT_SYMBOL_GPL(unregister_kretprobes); -- cgit v1.2.3-59-g8ed1b From e579abeb58eb4b8d7321c6eb44dd9e2d0cbaebaa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:01:01 -0700 Subject: kprobes: rename kprobe_enabled to kprobes_all_disarmed Rename kprobe_enabled to kprobes_all_disarmed and invert logic due to avoiding naming confusion from per-probe disabling. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ca4c22d78cfd..dae198b68e97 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ -static bool kprobe_enabled; +static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -598,7 +598,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, * If the old_p has gone, its breakpoint has been disarmed. * We have to arm it again after preparing real kprobes. */ - if (kprobe_enabled) + if (!kprobes_all_disarmed) arch_arm_kprobe(ap); } @@ -709,7 +709,7 @@ int __kprobes register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (kprobe_enabled) + if (!kprobes_all_disarmed) arch_arm_kprobe(p); out_unlock_text: @@ -751,7 +751,7 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. 
*/ - if (kprobe_enabled && !kprobe_gone(old_p)) { + if (!kprobes_all_disarmed && !kprobe_gone(old_p)) { mutex_lock(&text_mutex); arch_disarm_kprobe(p); mutex_unlock(&text_mutex); @@ -1190,8 +1190,8 @@ static int __init init_kprobes(void) } } - /* By default, kprobes are enabled */ - kprobe_enabled = true; + /* By default, kprobes are armed */ + kprobes_all_disarmed = false; err = arch_init_kprobes(); if (!err) @@ -1289,7 +1289,7 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -static void __kprobes enable_all_kprobes(void) +static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; struct hlist_node *node; @@ -1298,8 +1298,8 @@ static void __kprobes enable_all_kprobes(void) mutex_lock(&kprobe_mutex); - /* If kprobes are already enabled, just return */ - if (kprobe_enabled) + /* If kprobes are armed, just return */ + if (!kprobes_all_disarmed) goto already_enabled; mutex_lock(&text_mutex); @@ -1311,7 +1311,7 @@ static void __kprobes enable_all_kprobes(void) } mutex_unlock(&text_mutex); - kprobe_enabled = true; + kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); already_enabled: @@ -1319,7 +1319,7 @@ already_enabled: return; } -static void __kprobes disable_all_kprobes(void) +static void __kprobes disarm_all_kprobes(void) { struct hlist_head *head; struct hlist_node *node; @@ -1328,11 +1328,11 @@ static void __kprobes disable_all_kprobes(void) mutex_lock(&kprobe_mutex); - /* If kprobes are already disabled, just return */ - if (!kprobe_enabled) + /* If kprobes are already disarmed, just return */ + if (kprobes_all_disarmed) goto already_disabled; - kprobe_enabled = false; + kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -1364,7 +1364,7 @@ static ssize_t read_enabled_file_bool(struct file *file, { char buf[3]; - if (kprobe_enabled) + if (!kprobes_all_disarmed) buf[0] = '1'; else buf[0] = '0'; @@ -1387,12 +1387,12 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - enable_all_kprobes(); + arm_all_kprobes(); break; case 'n': case 'N': case '0': - disable_all_kprobes(); + disarm_all_kprobes(); break; } -- cgit v1.2.3-59-g8ed1b From de5bd88d5a5cce3cacea904d3503e5ebdb3852a2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 6 Apr 2009 19:01:02 -0700 Subject: kprobes: support per-kprobe disabling Add disable_kprobe() and enable_kprobe() to disable/enable kprobes temporarily. disable_kprobe() asynchronously disables probe handlers of specified kprobe. So, after calling it, some handlers can be called at a while. enable_kprobe() enables specified kprobe. aggr_pre_handler and aggr_post_handler check disabled probes. On the other hand aggr_break_handler and aggr_fault_handler don't check it because these handlers will be called while executing pre or post handlers and usually those help error handling. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. 
Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kprobes.txt | 34 ++++++++-- include/linux/kprobes.h | 23 ++++++- kernel/kprobes.c | 167 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 191 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 48b3de90eb1e..f609af242d6c 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler. After the probed instruction is single-stepped, Kprobe calls kp->post_handler. If a fault occurs during execution of kp->pre_handler or kp->post_handler, or during single-stepping of the probed instruction, Kprobes calls -kp->fault_handler. Any or all handlers can be NULL. +kp->fault_handler. Any or all handlers can be NULL. If kp->flags +is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, +so, it's handlers aren't hit until calling enable_kprobe(kp). NOTE: 1. With the introduction of the "symbol_name" field to struct kprobe, @@ -363,6 +365,22 @@ probes) in the specified array, they clear the addr field of those incorrect probes. However, other probes in the array are unregistered correctly. +4.7 disable_kprobe + +#include +int disable_kprobe(struct kprobe *kp); + +Temporarily disables the specified kprobe. You can enable it again by using +enable_kprobe(). You must specify the kprobe which has been registered. + +4.8 enable_kprobe + +#include +int enable_kprobe(struct kprobe *kp); + +Enables kprobe which has been disabled by disable_kprobe(). You must specify +the kprobe which has been registered. + 5. Kprobes Features and Limitations Kprobes allows multiple probes at the same address. Currently, @@ -500,10 +518,14 @@ the probe. If the probed function belongs to a module, the module name is also specified. Following columns show probe status. If the probe is on a virtual address that is no longer valid (module init sections, module virtual addresses that correspond to modules that've been unloaded), -such probes are marked with [GONE]. +such probes are marked with [GONE]. If the probe is temporarily disabled, +such probes are marked with [DISABLED]. -/debug/kprobes/enabled: Turn kprobes ON/OFF +/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. -Provides a knob to globally turn registered kprobes ON or OFF. By default, -all kprobes are enabled. By echoing "0" to this file, all registered probes -will be disarmed, till such time a "1" is echoed to this file. +Provides a knob to globally and forcibly turn registered kprobes ON or OFF. +By default, all kprobes are enabled. By echoing "0" to this file, all +registered probes will be disarmed, till such time a "1" is echoed to this +file. Note that this knob just disarms and arms all kprobes and doesn't +change each probe's disabling state. This means that disabled kprobes (marked +[DISABLED]) will be not enabled if you turn ON all kprobes by this knob. diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 39826a678364..1071cfddddc9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -112,18 +112,28 @@ struct kprobe { /* copy of the original instruction */ struct arch_specific_insn ainsn; - /* Indicates various status flags. Protected by kprobe_mutex. */ + /* + * Indicates various status flags. + * Protected by kprobe_mutex after this kprobe is registered. 
+ */ u32 flags; }; /* Kprobe status flags */ #define KPROBE_FLAG_GONE 1 /* breakpoint has already gone */ +#define KPROBE_FLAG_DISABLED 2 /* probe is temporarily disabled */ +/* Has this kprobe gone ? */ static inline int kprobe_gone(struct kprobe *p) { return p->flags & KPROBE_FLAG_GONE; } +/* Is this kprobe disabled ? */ +static inline int kprobe_disabled(struct kprobe *p) +{ + return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); +} /* * Special probe type that uses setjmp-longjmp type tricks to resume * execution at a specified entry with a matching prototype corresponding @@ -283,6 +293,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); +int disable_kprobe(struct kprobe *kp); +int enable_kprobe(struct kprobe *kp); + #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -349,5 +362,13 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } +static inline int disable_kprobe(struct kprobe *kp) +{ + return -ENOSYS; +} +static inline int enable_kprobe(struct kprobe *kp) +{ + return -ENOSYS; +} #endif /* CONFIG_KPROBES */ #endif /* _LINUX_KPROBES_H */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index dae198b68e97..a5e74ddee0e2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler && !kprobe_gone(kp)) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler && !kprobe_gone(kp)) { + if (kp->post_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -523,6 +523,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { + BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -532,6 +533,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) list_add_rcu(&p->list, &ap->list); if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; + + if (kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arch_arm_kprobe(ap); + } return 0; } @@ -592,20 +600,36 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, * freed by unregister_kprobe. */ return ret; - /* Clear gone flag to prevent allocating new slot again. */ - ap->flags &= ~KPROBE_FLAG_GONE; + /* - * If the old_p has gone, its breakpoint has been disarmed. - * We have to arm it again after preparing real kprobes. + * Clear gone flag to prevent allocating new slot again, and + * set disabled flag because it is not armed yet. 
*/ - if (!kprobes_all_disarmed) - arch_arm_kprobe(ap); + ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) + | KPROBE_FLAG_DISABLED; } copy_kprobe(ap, p); return add_new_kprobe(ap, p); } +/* Try to disable aggr_kprobe, and return 1 if succeeded.*/ +static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (!kprobe_disabled(kp)) + /* + * There is an active probe on the list. + * We can't disable aggr_kprobe. + */ + return 0; + } + p->flags |= KPROBE_FLAG_DISABLED; + return 1; +} + static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; @@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; } - p->flags = 0; + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; + /* * Check if are we probing a module. */ @@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed && !kprobe_disabled(p)) arch_arm_kprobe(p); out_unlock_text: @@ -724,25 +750,37 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* - * Unregister a kprobe without a scheduler synchronization. - */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) { struct kprobe *old_p, *list_p; old_p = get_kprobe(p->addr); if (unlikely(!old_p)) - return -EINVAL; + return NULL; if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ - goto valid_p; - return -EINVAL; + goto valid; + return NULL; } -valid_p: +valid: + return old_p; +} + +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = __get_valid_kprobe(p); + if (old_p == NULL) + return -EINVAL; + if (old_p == p || (old_p->pre_handler == aggr_pre_handler && list_is_singular(&old_p->list))) { @@ -751,7 +789,7 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. */ - if (!kprobes_all_disarmed && !kprobe_gone(old_p)) { + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { mutex_lock(&text_mutex); arch_disarm_kprobe(p); mutex_unlock(&text_mutex); @@ -769,6 +807,11 @@ valid_p: } noclean: list_del_rcu(&p->list); + if (!kprobe_disabled(old_p)) { + try_to_disable_aggr_kprobe(old_p); + if (!kprobes_all_disarmed && kprobe_disabled(old_p)) + arch_disarm_kprobe(old_p); + } } return 0; } @@ -1078,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, static void __kprobes kill_kprobe(struct kprobe *p) { struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; if (p->pre_handler == aggr_pre_handler) { /* @@ -1219,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + p->addr, kprobe_type, sym, offset, + (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? 
+ "[DISABLED]" : "")); else - seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %p %s%s\n", + p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? + "[DISABLED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1289,6 +1339,71 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. */ + ret = -EINVAL; + goto out; + } + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_arm_kprobe(p); + + p->flags &= ~KPROBE_FLAG_DISABLED; + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; @@ -1306,7 +1421,7 @@ static void __kprobes arm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - if (!kprobe_gone(p)) + if (!kprobe_disabled(p)) arch_arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1338,7 +1453,7 @@ static void __kprobes disarm_all_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) arch_disarm_kprobe(p); } } -- cgit v1.2.3-59-g8ed1b